From 79e7e08f8c3e62fbd6afb1d807aa4fdf9a9d4dc5 Mon Sep 17 00:00:00 2001
From: Revital Sur
Date: Mon, 20 Jan 2025 09:47:47 +0200
Subject: [PATCH 01/17] Obtain the Ray cluster run ID from the user for KFP
 v2.

Signed-off-by: Revital Sur
---
 kfp/doc/simple_transform_pipeline.md          | 25 +++++++++++++++----
 .../src/runtime_utils/kfp_utils.py            |  7 +++---
 .../templates/simple_pipeline.py              | 25 ++++++++++---------
 .../code2parquet/kfp_ray/code2parquet_wf.py   | 24 +++++++++---------
 .../code_quality/kfp_ray/code_quality_wf.py   | 24 +++++++++---------
 .../kfp_ray/header_cleanser_wf.py             | 14 ++---------
 .../kfp_ray/license_select_wf.py              | 24 +++++++++---------
 transforms/code/malware/kfp_ray/malware_wf.py | 24 +++++++++---------
 .../kfp_ray/proglang_select_wf.py             | 24 +++++++++---------
 .../kfp_ray/repo_level_order_wf.py            | 24 +++++++++---------
 .../kfp_ray/doc_chunk_multiple_wf.py          | 24 +++++++++---------
 .../doc_chunk/kfp_ray/doc_chunk_wf.py         | 24 +++++++++---------
 .../kfp_ray/doc_quality_multiple_wf.py        | 24 +++++++++---------
 .../doc_quality/kfp_ray/doc_quality_wf.py     | 25 ++++++++++---------
 .../html2parquet/kfp_ray/html2parquet_wf.py   | 24 +++++++++---------
 .../lang_id/kfp_ray/lang_id_multiple_wf.py    | 24 +++++++++---------
 .../language/lang_id/kfp_ray/lang_id_wf.py    | 24 +++++++++---------
 .../kfp_ray/pdf2parquet_multiple_wf.py        | 24 +++++++++---------
 .../pdf2parquet/kfp_ray/pdf2parquet_wf.py     | 24 +++++++++---------
 .../pii_redactor/kfp_ray/pii_redactor_wf.py   | 24 +++++++++---------
 .../kfp_ray/text_encoder_multiple_wf.py       | 24 +++++++++---------
 .../text_encoder/kfp_ray/text_encoder_wf.py   | 24 +++++++++---------
 .../universal/doc_id/kfp_ray/doc_id_wf.py     | 24 +++++++++---------
 .../universal/ededup/kfp_ray/ededup_wf.py     | 18 +++++++++----
 .../universal/fdedup/kfp_ray/fdedup_wf.py     | 18 +++++++++----
 .../universal/filter/kfp_ray/filter_wf.py     | 24 +++++++++---------
 transforms/universal/hap/kfp_ray/hap_wf.py    | 24 +++++++++---------
 .../noop/kfp_ray/noop_multiple_wf.py          | 24 +++++++++---------
 transforms/universal/noop/kfp_ray/noop_wf.py  | 24 +++++++++---------
 .../universal/profiler/kfp_ray/profiler_wf.py | 12 +++++++++
 .../universal/resize/kfp_ray/resize_wf.py     | 24 +++++++++---------
 .../tokenization/kfp_ray/tokenization_wf.py   | 12 +++++++++
 32 files changed, 378 insertions(+), 330 deletions(-)
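Note: with this change, KFPv2 callers supply the unique string themselves. As a
minimal sketch of how that might look from the client side (the endpoint, package
name, and any parameters other than `ray_id_KFPv2` are illustrative assumptions,
not part of this patch):

```python
import uuid

import kfp

# Compile-time unique string that the KFPv2 pipelines in this patch expect.
ray_run_id = uuid.uuid4().hex

client = kfp.Client(host="http://localhost:8080")  # assumed KFP endpoint
client.create_run_from_pipeline_package(
    "noop_wf.yaml",  # assumed pre-compiled pipeline package
    arguments={"ray_id_KFPv2": ray_run_id},
)
```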
diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md
index 10341c24b1..e49eef6252 100644
--- a/kfp/doc/simple_transform_pipeline.md
+++ b/kfp/doc/simple_transform_pipeline.md
@@ -57,11 +57,16 @@ Ray cluster. For each step we have to define a component that will execute them:
 ```python
     # components
     base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2"
-    # compute execution parameters. Here different transforms might need different implementations. As
-    # a result, instead of creating a component we are creating it in place here.
-    compute_exec_params_op = comp.func_to_container_op(
-        func=ComponentUtils.default_compute_execution_params, base_image=base_kfp_image
-    )
+    # KFPv1 and KFPv2 use different methods to create a component from a function. KFPv1 uses the
+    # `create_component_from_func` function, which is deprecated in KFPv2 and has a different import path.
+    # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
+    # this if/else statement and explicitly call the decorator.
+    if os.getenv("KFPv2", "0") == "1":
+        compute_exec_params_op = dsl.component_decorator.component(
+            func=compute_exec_params_func, base_image=base_kfp_image
+        )
+    else:
+        compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
     # create Ray cluster
     create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml")
     # execute job
@@ -148,6 +153,16 @@ Now, when all components and input parameters are defined, we can implement pipe
 component execution and parameters submitted to every component.
 ```python
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
-    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url, additional_params=additional_params)
+    clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
diff --git a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py
index 7fa76453f2..3a281e48a3 100644
--- a/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py
+++ b/kfp/kfp_support_lib/shared_workflow_support/src/runtime_utils/kfp_utils.py
@@ -81,9 +81,10 @@ def runtime_name(ray_name: str = "", run_id: str = "") -> str:
         # the return value plus namespace name will be the name of the Ray Route,
         # which length is restricted to 64 characters,
         # therefore we restrict the return name by 15 character.
-        if run_id != "":
-            return f"{ray_name[:9]}-{run_id[:5]}"
-        return ray_name[:15]
+        if run_id == "":
+            logger.error("Run ID must be provided")
+            sys.exit(1)
+        return f"{ray_name[:9]}-{run_id[:5]}"
 
     @staticmethod
     def dict_to_req(d: dict[str, Any], executor: str = "transformer_launcher.py") -> str:
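For clarity, the new `runtime_name` contract can be illustrated as follows (an
approximation for the reader, not an excerpt from the patch; the import path is
assumed from this repo's layout, and the module must import `sys` for the error
path to work):

```python
from runtime_utils import KFPUtils  # assumed import path

# "my-transform-cluster"[:9] == "my-transf" and "abcdef123"[:5] == "abcde",
# so the combined name stays within the 15-character budget:
assert KFPUtils.runtime_name("my-transform-cluster", "abcdef123") == "my-transf-abcde"

# An empty run ID is no longer tolerated; the following would log
# "Run ID must be provided" and terminate the process:
# KFPUtils.runtime_name("my-transform-cluster", "")
```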
diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
index ce7657a5c8..2022e8359c 100644
--- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
+++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py
@@ -73,23 +73,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -111,9 +99,11 @@ def {{ pipeline_name }}(
     ray_name: str = "{{ pipeline_name }}-kfp-ray",  # name of Ray cluster
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     {%- if image_pull_secret != "" %}
+    ray_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image},
     {%- else %}
+    ray_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
     {%- endif %}
@@ -142,6 +132,7 @@ def {{ pipeline_name }}(
     """
     Pipeline to execute {{ pipeline_name }} transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -177,6 +168,16 @@ def {{ pipeline_name }}(
 {%- endfor %}
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
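As a usage sketch, the KFPv2 branch above is selected purely by the environment
variable, so compiling a generated pipeline for KFPv2 could look like this (the
workflow module and output file names are assumptions for illustration):

```python
import os

os.environ["KFPv2"] = "1"  # must be set before the workflow module is imported

from kfp import compiler

import noop_wf  # assumed: a workflow module generated from this template

compiler.Compiler().compile(noop_wf.noop, package_path="noop_wf.yaml")
```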
diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
index e506ab5b33..8afde87d49 100644
--- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
+++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
@@ -77,23 +77,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 
 # create Ray cluster
@@ -113,6 +101,7 @@ def compute_exec_params_func(
 )
 def code2parquet(
     ray_name: str = "code2parquet-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
@@ -139,6 +128,7 @@ def code2parquet(
     """
     Pipeline to execute NOOP transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -178,6 +168,16 @@ def code2parquet(
         (here we are assuming that select language info is in S3, but potentially in the different bucket)
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
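Since this exact guard now repeats in every workflow below, its logic can be read
as the following standalone sketch (a distillation for the reader, not a helper
that this patch introduces):

```python
import os

from kfp import dsl


def select_run_id(ray_id_kfpv2: str) -> str:
    # KFPv2: the caller supplies a compile-time unique string, since
    # dsl.RUN_ID_PLACEHOLDER is deprecated and runtime-generated values
    # cannot reach the ExitHandler clean-up task (kubeflow/pipelines#10187).
    if os.getenv("KFPv2", "0") == "1":
        return ray_id_kfpv2
    # KFPv1: the engine substitutes the actual run ID at runtime.
    return dsl.RUN_ID_PLACEHOLDER
```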
diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py
index f37fb58704..ba2d8e53f7 100644
--- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py
+++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py
@@ -74,23 +74,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 
 # create Ray cluster
@@ -112,6 +100,7 @@ def compute_exec_params_func(
 def code_quality(
     # Ray cluster
     ray_name: str = "code_quality-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
@@ -136,6 +125,7 @@ def code_quality(
     """
     Pipeline to execute Code Quality transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -171,6 +161,16 @@ def code_quality(
     :param cq_hf_token - Huggingface auth token to download and use the tokenizer
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py
index 6fdf1862a5..107795463d 100644
--- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py
+++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py
@@ -82,23 +82,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 
 # create Ray cluster
@@ -120,6 +108,7 @@ def compute_exec_params_func(
 def header_cleanser(
     # Ray cluster
     ray_name: str = "header_cleanser-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
@@ -148,6 +137,7 @@ def header_cleanser(
     """
     Pipeline to execute Header Cleanser transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py
index 7c10b1c342..b92cb64980 100644
--- a/transforms/code/license_select/kfp_ray/license_select_wf.py
+++ b/transforms/code/license_select/kfp_ray/license_select_wf.py
@@ -71,23 +71,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -106,6 +94,7 @@ def compute_exec_params_func(
 )
 def license_select(
     ray_name: str = "license_select-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
@@ -135,6 +124,7 @@ def license_select(
     """
     Pipeline to execute License Select transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -166,6 +156,16 @@ def license_select(
     :param lc_licenses_file - path to license list json file
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url)
     ComponentUtils.add_settings_to_component(clean_up_task, 60)
diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py
index 30525e8703..ad1bf4aaf8 100644
--- a/transforms/code/malware/kfp_ray/malware_wf.py
+++ b/transforms/code/malware/kfp_ray/malware_wf.py
@@ -70,23 +70,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -104,6 +92,7 @@ def compute_exec_params_func(
 )
 def malware(
     ray_name: str = "malware-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
@@ -126,6 +115,7 @@ def malware(
     """
     Pipeline to execute malware transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -160,6 +150,16 @@ def malware(
     :param malware_output_column - output column name
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py
index f1b271d3c6..3ba7d89269 100644
--- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py
+++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py
@@ -71,23 +71,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -106,6 +94,7 @@ def compute_exec_params_func(
 )
 def lang_select(
     ray_name: str = "proglang-match-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
@@ -129,6 +118,7 @@ def lang_select(
     """
     Pipeline to execute NOOP transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -165,6 +155,16 @@ def lang_select(
         (here we are assuming that select language info is in S3, but potentially in the different bucket)
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py
index 47388f3944..38099a1928 100644
--- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py
+++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py
@@ -87,23 +87,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
    )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -123,6 +111,7 @@ def compute_exec_params_func(
 def repo_level_order(
     # Ray cluster
     ray_name: str = "repo_level_order-kfp-ray",
+    ray_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
         "replicas": 2,
@@ -159,6 +148,7 @@ def repo_level_order(
     """
     Pipeline to execute repo_level_order transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -201,6 +191,16 @@ def repo_level_order(
     :param repo_lvl_combine_rows - # If specified, output rows per repo are combined to form a single repo
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py
index 62161be6d6..5518f0ba1d 100644
--- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py
+++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py
@@ -73,23 +73,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -108,6 +96,7 @@ def compute_exec_params_func(
 def doc_chunk(
     # Ray cluster
     ray_name: str = "doc-json-chunk-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
@@ -139,6 +128,7 @@ def doc_chunk(
     """
     Pipeline to execute chunk documents transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -175,6 +165,16 @@ def doc_chunk(
     :param doc_chunk_dl_min_chunk_len - minimum chunk size
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py
index 618c11d680..e671177a92 100644
--- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py
+++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py
@@ -73,23 +73,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -109,6 +97,7 @@ def compute_exec_params_func(
 def doc_chunk(
     # Ray cluster
     ray_name: str = "doc-json-chunk-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
@@ -133,6 +122,7 @@ def doc_chunk(
     """
     Pipeline to execute chunk documents transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -169,6 +159,16 @@ def doc_chunk(
     :param doc_chunk_dl_min_chunk_len - minimum chunk size
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
     ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py
index 4a2d9de1d1..2830ce32ca 100644
--- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py
+++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py
@@ -72,23 +72,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -107,6 +95,7 @@ def compute_exec_params_func(
 def doc_quality(
     # Ray cluster
     ray_name: str = "doc_quality-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image, "image_pull_policy": "Always"},
     ray_worker_options: dict = {
         "replicas": 2,
@@ -137,6 +126,7 @@ def doc_quality(
     """
     Pipeline to execute Document Quality transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -172,6 +162,16 @@ def doc_quality(
     :param docq_bad_word_filepath - a path to bad word file
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py
index e26efe832f..c4d6c7d43c 100644
--- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py
+++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py
@@ -72,23 +72,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -107,6 +95,7 @@ def compute_exec_params_func(
 def doc_quality(
     # Ray cluster
     ray_name: str = "doc_quality-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",
     ray_head_options: dict = {
         "cpu": 1,
         "memory": 4,
@@ -143,6 +132,7 @@ def doc_quality(
     """
     Pipeline to execute Document Quality transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -178,6 +168,17 @@ def doc_quality(
     :param docq_bad_word_filepath - a path to bad word file
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
+
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py
index b6f5dff19b..b75064e795 100644
--- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py
+++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py
@@ -71,23 +71,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -107,6 +95,7 @@ def compute_exec_params_func(
 def html2parquet(
     # Ray cluster
     ray_name: str = "html2parquet-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
@@ -137,6 +126,7 @@ def html2parquet(
     """
     Pipeline to execute html2parquet transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -171,6 +161,16 @@ def html2parquet(
     :param html2parquet_output_format - # Output format for the contents column.
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py
index 941d326274..480f1a7384 100644
--- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py
+++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py
@@ -77,23 +77,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -112,6 +100,7 @@ def compute_exec_params_func(
 def lang_id(
     # Ray cluster
     ray_name: str = "lang_id-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
@@ -145,6 +134,7 @@ def lang_id(
     """
     Pipeline to execute Language Identification transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -183,6 +173,16 @@ def lang_id(
     :param lang_id_output_score_column_name - name of the output column to hold score of prediction
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py
index fa4debbe33..b162437625 100644
--- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py
+++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py
@@ -78,23 +78,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -113,6 +101,7 @@ def compute_exec_params_func(
 def lang_id(
     # Ray cluster
     ray_name: str = "lang_id-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
@@ -146,6 +135,7 @@ def lang_id(
     """
     Pipeline to execute Language Identification transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -184,6 +174,16 @@ def lang_id(
     :param lang_id_output_score_column_name - name of the output column to hold score of prediction
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py
index 91d40567eb..f1796ee9f7 100644
--- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py
+++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py
@@ -75,23 +75,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -110,6 +98,7 @@ def compute_exec_params_func(
 def pdf2parquet(
     # Ray cluster
     ray_name: str = "pdf2parquet-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
@@ -142,6 +131,7 @@ def pdf2parquet(
     """
     Pipeline to execute PDF2PARQUET transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -179,6 +169,16 @@ def pdf2parquet(
     :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+    if os.getenv("KFPv2", "0") == "1":
+        print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
+              "same version of the same pipeline !!!")
+        run_id = ray_id_KFPv2
+    else:
+        run_id = dsl.RUN_ID_PLACEHOLDER
     # create clean_up task
     clean_up_task = cleanup_ray_op(
         ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py
index 4dab7d4afd..a6f308ea73 100644
--- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py
+++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py
@@ -77,23 +77,11 @@ def compute_exec_params_func(
 # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use
 # this if/else statement and explicitly call the decorator.
 if os.getenv("KFPv2", "0") == "1":
-    # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create
-    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to
-    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at
-    # compilation time.
-    import uuid
-
     compute_exec_params_op = dsl.component_decorator.component(
         func=compute_exec_params_func, base_image=base_kfp_image
     )
-    print(
-        "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the "
-        + "same version of the same pipeline !!!"
-    )
-    run_id = uuid.uuid4().hex
 else:
     compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image)
-    run_id = dsl.RUN_ID_PLACEHOLDER
 
 # create Ray cluster
 create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml")
@@ -113,6 +101,7 @@ def compute_exec_params_func(
 def pdf2parquet(
     # Ray cluster
     ray_name: str = "pdf2parquet-kfp-ray",  # name of Ray cluster
+    ray_id_KFPv2: str = "",  # Ray cluster unique ID used only in KFP v2
     # Add image_pull_secret and image_pull_policy to ray workers if needed
     ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
     ray_worker_options: dict = {
@@ -146,6 +135,7 @@ def pdf2parquet(
     """
     Pipeline to execute PDF2PARQUET transform
     :param ray_name: name of the Ray cluster
+    :param ray_id_KFPv2: string holding the id used for the Ray cluster (used only in KFP v2)
     :param ray_head_options: head node options, containing the following:
         cpu - number of cpus
         memory - memory
@@ -183,6 +173,16 @@ def pdf2parquet(
     :param pdf2parquet_bitmap_area_threshold - threshold for bitmaps
     :return: None
     """
+    # In KFPv2, dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On the other hand, we cannot create
+    # a unique string in a component (at runtime) and pass it to the `clean_up_task` of the `ExitHandler`, due to
+    # https://github.com/kubeflow/pipelines/issues/10187. Therefore, for the time being, the user is requested to
+    # provide a unique string created at compilation time.
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index b05aecd69d..fb70f789a0 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -66,23 +66,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -102,6 +90,7 @@ def compute_exec_params_func( def pii_redactor( # Ray cluster ray_name: str = "pii-redactor-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -130,6 +119,7 @@ def pii_redactor( """ Pipeline to execute pii_redactor transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -163,6 +153,16 @@ def pii_redactor( :param pii_redactor_contents - column that has pii data and needs to be transformed by pii redactor transform :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index 2005ee163b..f746f4aefd 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -71,23 +71,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -106,6 +94,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -129,6 +118,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -164,6 +154,16 @@ def text_encoder( :param text_encoder_output_embeddings_column_name - name of the output column :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index aa63e23f82..5e7421490e 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -71,23 +71,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -107,6 +95,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -130,6 +119,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -165,6 +155,16 @@ def text_encoder( :param text_encoder_output_embeddings_column_name - name of the output column :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index c5d4cac6de..985139c92e 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -80,23 +80,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -115,6 +103,7 @@ def compute_exec_params_func( def doc_id( # Ray cluster ray_name: str = "doc_id-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -149,6 +138,7 @@ def doc_id( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -185,6 +175,16 @@ def doc_id( :param doc_id_start_id - starting id :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 17c85b6305..62db57fea6 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -49,11 +49,7 @@ compute_exec_params_op = dsl.component_decorator.component( func=ededup_compute_execution_params, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex + else: compute_exec_params_op = comp.create_component_from_func( func=ededup_compute_execution_params, base_image=base_kfp_image @@ -78,6 +74,7 @@ def ededup( # Ray cluster ray_name: str = "ededup-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -111,6 +108,7 @@ def ededup( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -148,6 +146,16 @@ def ededup( :param ededup_n_samples - number of samples for parameters computation :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 51ead9c791..bf45ac1975 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -64,11 +64,7 @@ compute_data_cleaning_exec_params_op = dsl.component_decorator.component( func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex + else: compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) compute_signature_calc_exec_params_op = comp.create_component_from_func( @@ -114,6 +110,7 @@ def fuzzydedup( # folders used # Ray cluster ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = { "cpu": 8, @@ -164,6 +161,7 @@ def fuzzydedup( """ Pipeline to execute FDEDUP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -208,6 +206,16 @@ def fuzzydedup( :param fdedup_n_samples - number of samples for parameters computation :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index a18d2796dd..26ae444892 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -72,23 +72,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -107,6 +95,7 @@ def compute_exec_params_func( def filtering( # Ray cluster ray_name: str = "filter-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -130,6 +119,7 @@ def filtering( """ Pipeline to execute Filtering transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -165,6 +155,16 @@ def filtering( :param filter_columns_to_drop - list of columns to drop after filtering :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py index 64c80fe37f..46d1dba1aa 100644 --- a/transforms/universal/hap/kfp_ray/hap_wf.py +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -79,23 +79,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -115,6 +103,7 @@ def compute_exec_params_func( def hap( # Ray cluster ray_name: str = "hap-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -149,6 +138,7 @@ def hap( """ Pipeline to execute hap transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -187,6 +177,16 @@ def hap( :param batch_size - # batch size :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 9ed874f3da..dd535db5cb 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -67,23 +67,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -102,6 +90,7 @@ def compute_exec_params_func( def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -123,6 +112,7 @@ def noop( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -156,6 +146,16 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 5a1ce393ae..0392e9ab57 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -67,23 +67,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" 
- ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -104,6 +92,7 @@ def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed + ray_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -125,6 +114,7 @@ def noop( """ Pipeline to execute noop transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -158,6 +148,16 @@ def noop( :param noop_sleep_sec - noop sleep time :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 7a157c1461..6300f62f88 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -78,6 +78,7 @@ def profiler( # Ray cluster ray_name: str = "profiler-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -102,6 +103,7 @@ def profiler( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -137,6 +139,16 @@ def profiler( :param profiler_n_samples - number of samples for parameters computation :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 6a1403f186..89007c8bea 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -76,23 +76,11 @@ def compute_exec_params_func( # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use # this if/else statement and explicitly call the decorator. if os.getenv("KFPv2", "0") == "1": - # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create - # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to - # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime we use a unique string created at - # compilation time. - import uuid - compute_exec_params_op = dsl.component_decorator.component( func=compute_exec_params_func, base_image=base_kfp_image ) - print( - "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " - + "same version of the same pipeline !!!" - ) - run_id = uuid.uuid4().hex else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) - run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") @@ -111,6 +99,7 @@ def compute_exec_params_func( def resize( # Ray cluster ray_name: str = "resize-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -137,6 +126,7 @@ def resize( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -172,6 +162,16 @@ def resize( :param resize_size_type - size type - disk/memory :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. 
+ if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 82fc55ae2c..15958665b1 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -116,6 +116,7 @@ def compute_exec_params_func( def tokenization( # Ray cluster ray_name: str = "tkn-kfp-ray", # name of Ray cluster + ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -149,6 +150,7 @@ def tokenization( """ Pipeline to execute tokenization transform :param ray_name: name of the Ray cluster + :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -187,6 +189,16 @@ def tokenization( :param tkn_chunk_size - Specify >0 value to tokenize each row/text in chunks of characters (rounded in words) :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op( ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params From 070a9d8c8435052059a9b717d40916b820139d20 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 20 Jan 2025 06:00:16 -0600 Subject: [PATCH 02/17] Set default value for run id. Signed-off-by: Revital Sur --- transforms/.make.workflows | 3 +++ 1 file changed, 3 insertions(+) diff --git a/transforms/.make.workflows b/transforms/.make.workflows index a1e5acccea..16c6f38fb0 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -46,6 +46,9 @@ FORCE: ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) kind-load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) kind-load-image +endif +ifeq ($(KFPv2), 1) + yq -i '.root.inputDefinitions.parameters.ray_id_KFPv2.defaultValue = "123"' ${CURDIR}/${PIPELINE_FILE} endif . 
${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} -e ${KFP_ENDPOINT} From 31db4cc150efd5fb931e70c1b9d58f53f16ab183 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 20 Jan 2025 22:46:54 -0600 Subject: [PATCH 03/17] Add _set_run_id function. Signed-off-by: Revital Sur --- .../pipeline_utils/pipelines_tests_utils.py | 21 +++++++++++++++++++ transforms/.make.workflows | 3 --- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 6b23067f95..05f7d8c040 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -52,6 +52,26 @@ def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/ logger.info(f"Pipeline {pipeline_name} successfully completed") return pipeline_name +def _set_run_id(pipeline_package_path: str): + """ + Assign a dummy run ID value for testing purposes. By default, this value + is empty and is set by the user during runtime. + + :param pipeline_package_path: Local path to the pipeline package. + """ + import yaml + + try: + stream = open(pipeline_package_path, "r") + docs = list(yaml.load_all(stream, yaml.FullLoader)) + for doc in docs: + if "root" in doc: + doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = "123" + with open(pipeline_package_path, "w") as outfile: + yaml.dump_all(docs, outfile) + except Exception as e: + logger.error(f"Failed to update run id value, exception {e}") + sys.exit(1) if __name__ == "__main__": import argparse @@ -74,6 +94,7 @@ def run_test(pipeline_package_path: str, endpoint: str = "http://localhost:8080/ if pipeline is None: sys.exit(1) case "sanity-test": + _set_run_id(args.pipeline_package_path) run = run_test( endpoint=args.endpoint, pipeline_package_path=args.pipeline_package_path, diff --git a/transforms/.make.workflows b/transforms/.make.workflows index 16c6f38fb0..a1e5acccea 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -46,9 +46,6 @@ FORCE: ifeq ($(USE_DEV_IMAGES), 1) cd ${TRANSFORM_SRC} && $(MAKE) image && $(MAKE) kind-load-image cd ${REPOROOT}/kfp/kfp_ray_components && $(MAKE) image && $(MAKE) kind-load-image -endif -ifeq ($(KFPv2), 1) - yq -i '.root.inputDefinitions.parameters.ray_id_KFPv2.defaultValue = "123"' ${CURDIR}/${PIPELINE_FILE} endif . ${WORKFLOW_VENV_ACTIVATE} && ${PYTHON} -m workflow_support.pipeline_utils.pipelines_tests_utils -c "sanity-test" -p ${CURDIR}/${PIPELINE_FILE} -e ${KFP_ENDPOINT} From 5a8c36fb19eda482d8a235ed7c5a8e36e87bb13a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Tue, 21 Jan 2025 02:39:15 -0600 Subject: [PATCH 04/17] Minor fix. 
Signed-off-by: Revital Sur --- .../code/header_cleanser/kfp_ray/header_cleanser_wf.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 107795463d..a3b02c7c86 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -177,6 +177,16 @@ def header_cleanser( :param skip_timeout - Hold value true or false to skip removing copyright/header or not when scaning timeout. :return: None """ + # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create + # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to + # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert + # a unique string created at compilation time. + if os.getenv("KFPv2", "0") == "1": + print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " + "same version of the same pipeline !!!") + run_id = ray_id_KFPv2 + else: + run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) From 76620d4496f75d74ec9cf6a8db9665ca278a3d2e Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 22 Jan 2025 10:19:28 +0200 Subject: [PATCH 05/17] Add missing --chmod=775 --chown=ray:root in dockerfiles. Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 2 +- transforms/code/code2parquet/ray/Dockerfile | 4 ++-- transforms/code/code_quality/ray/Dockerfile | 8 ++++---- transforms/code/header_cleanser/ray/Dockerfile | 2 +- transforms/code/license_select/ray/Dockerfile | 4 ++-- transforms/code/malware/ray/Dockerfile | 4 ++-- transforms/universal/noop/ray/Dockerfile | 8 ++++---- transforms/universal/profiler/ray/Dockerfile | 4 ++-- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 6f6bf323df..5b51b0d2af 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -30,7 +30,7 @@ RUN pip install --no-cache-dir pydantic==2.6.3 # remove credentials-containing file RUN rm requirements.txt # components -COPY ./src /pipelines/component/src +COPY --chmod=775 --chown=ray:root ./src /pipelines/component/src # Set environment ENV KFP_v2=$KFP_v2 diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index 1309416ea1..cf363def4c 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -28,10 +28,10 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY src/code2parquet_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py . 
# copy some of the samples in -COPY src/code2parquet_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile index 54630e9d96..2127cfd818 100644 --- a/transforms/code/code_quality/ray/Dockerfile +++ b/transforms/code/code_quality/ray/Dockerfile @@ -33,14 +33,14 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/code_quality_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/code_quality_transform_ray.py . # copy some of the samples in -COPY ./src/code_quality_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/code_quality_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 1b21cef437..b5fc809aae 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -31,7 +31,7 @@ User ray # copy source data COPY ./src/header_cleanser_transform_ray.py . -COPY src/header_cleanser_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index 6c8301c85a..184747ff59 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -27,8 +27,8 @@ COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy source data -COPY src/license_select_transform_ray.py . -COPY src/license_select_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/ # copy test COPY test/ test/ diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index 24f43d053c..f06c2005c5 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -56,10 +56,10 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY src/malware_transform_ray.py ./ +COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./ # copy some of the samples in -COPY src/malware_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/ COPY test/ test/ COPY test-data/ test-data/ diff --git a/transforms/universal/noop/ray/Dockerfile b/transforms/universal/noop/ray/Dockerfile index 796a9559fb..bfca6fab4c 100644 --- a/transforms/universal/noop/ray/Dockerfile +++ b/transforms/universal/noop/ray/Dockerfile @@ -29,14 +29,14 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY ./src/noop_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/noop_transform_ray.py . 
# copy some of the samples in -COPY ./src/noop_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/noop_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/profiler/ray/Dockerfile b/transforms/universal/profiler/ray/Dockerfile index 131229d1f2..9fdfa45940 100644 --- a/transforms/universal/profiler/ray/Dockerfile +++ b/transforms/universal/profiler/ray/Dockerfile @@ -30,10 +30,10 @@ COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy the main() entry point to the image -COPY src/profiler_transform_ray.py . +COPY --chmod=775 --chown=ray:root src/profiler_transform_ray.py . # copy some of the samples in -COPY src/profiler_local_ray.py local/ +COPY --chmod=775 --chown=ray:root src/profiler_local_ray.py local/ # copy test COPY test/ test/ From 84d48a06f03e6e68352cd308d60865b90f60cb49 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 22 Jan 2025 10:22:08 +0200 Subject: [PATCH 06/17] Minor fix. Signed-off-by: Revital Sur --- transforms/code/malware/ray/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index f06c2005c5..1c231275f4 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -61,8 +61,8 @@ COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./ # copy some of the samples in COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/ -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ ENV PYTHONPATH /home/ray From 3cec0ab2881e65f9b4ddbfdec440064a975540fd Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 22 Jan 2025 12:25:20 +0200 Subject: [PATCH 07/17] Address review comments. 
Signed-off-by: Revital Sur --- kfp/doc/simple_transform_pipeline.md | 11 ++++++----- .../pipeline_utils/pipelines_tests_utils.py | 3 ++- .../single-pipeline/templates/simple_pipeline.py | 2 +- .../code/code2parquet/kfp_ray/code2parquet_wf.py | 2 +- .../code/code_quality/kfp_ray/code_quality_wf.py | 2 +- .../header_cleanser/kfp_ray/header_cleanser_wf.py | 2 +- .../code/license_select/kfp_ray/license_select_wf.py | 2 +- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- .../proglang_select/kfp_ray/proglang_select_wf.py | 2 +- .../kfp_ray/repo_level_order_wf.py | 2 +- .../doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 2 +- transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../language/html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../language/lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- transforms/language/lang_id/kfp_ray/lang_id_wf.py | 2 +- .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../language/pii_redactor/kfp_ray/pii_redactor_wf.py | 2 +- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 2 +- .../language/text_encoder/kfp_ray/text_encoder_wf.py | 2 +- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- transforms/universal/ededup/kfp_ray/ededup_wf.py | 2 +- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- transforms/universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/hap/kfp_ray/hap_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- transforms/universal/profiler/kfp_ray/profiler_wf.py | 2 +- transforms/universal/resize/kfp_ray/resize_wf.py | 2 +- .../universal/tokenization/kfp_ray/tokenization_wf.py | 2 +- 30 files changed, 36 insertions(+), 34 deletions(-) diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index e49eef6252..ccb4d16a75 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -112,6 +112,7 @@ The input parameters section defines all the parameters required for the pipelin The parameters used here are as follows: * ray_name: name of the Ray cluster +* ray_id_KFPv2: Ray cluster unique ID used only in KFP v2 * ray_head_options: head node options, containing the following: * cpu - number of cpus * memory - memory @@ -156,7 +157,7 @@ component execution and parameters submitted to every component. # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") @@ -164,7 +165,7 @@ component execution and parameters submitted to every component. 
else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=dsl.RUN_ID_PLACEHOLDER, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): @@ -177,7 +178,7 @@ component execution and parameters submitted to every component. # start Ray cluster ray_cluster = create_ray_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, ray_head_options=ray_head_options, ray_worker_options=ray_worker_options, server_url=server_url, @@ -188,7 +189,7 @@ component execution and parameters submitted to every component. # Execute job execute_job = execute_ray_jobs_op( ray_name=ray_name, - run_id=dsl.RUN_ID_PLACEHOLDER, + run_id=run_id, additional_params=additional_params, # note that the parameters below are specific for NOOP transform exec_params={ @@ -198,7 +199,7 @@ component execution and parameters submitted to every component. "num_workers": compute_exec_params.output, "worker_options": actor_options, "pipeline_id": pipeline_id, - "job_id": dsl.RUN_ID_PLACEHOLDER, + "job_id": run_id, "code_location": code_location, "noop_sleep_sec": noop_sleep_sec, }, diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 05f7d8c040..41a392e264 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -60,13 +60,14 @@ def _set_run_id(pipeline_package_path: str): :param pipeline_package_path: Local path to the pipeline package. """ import yaml + import uuid try: stream = open(pipeline_package_path, "r") docs = list(yaml.load_all(stream, yaml.FullLoader)) for doc in docs: if "root" in doc: - doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = "123" + doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = uuid.uuid4().hex with open(pipeline_package_path, "w") as outfile: yaml.dump_all(docs, outfile) except Exception as e: diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py index 2022e8359c..4191f5d462 100644 --- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py +++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py @@ -171,7 +171,7 @@ def {{ pipeline_name }}( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
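[Editor's note] With the template change, every generated pipeline declares the new input, so a compiled spec can be inspected the same way `_set_run_id` above locates it. A quick illustrative check; the file name is a placeholder, and compiled specs may contain several YAML documents:

```python
# Sketch: confirm a compiled workflow spec exposes ray_id_KFPv2.
import yaml

with open("noop_wf.yaml") as f:
    docs = list(yaml.safe_load_all(f))

root = next(doc for doc in docs if "root" in doc)
params = root["root"]["inputDefinitions"]["parameters"]
assert "ray_id_KFPv2" in params
print(params["ray_id_KFPv2"].get("defaultValue", ""))  # empty until the user sets it
```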
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index 8afde87d49..c78afc4a91 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -171,7 +171,7 @@ def code2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index ba2d8e53f7..466d15f638 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -164,7 +164,7 @@ def code_quality( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index a3b02c7c86..fc02f04ea0 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -180,7 +180,7 @@ def header_cleanser( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index b92cb64980..25bcc29f64 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -159,7 +159,7 @@ def license_select( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index ad1bf4aaf8..ef5a290ab4 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -153,7 +153,7 @@ def malware( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 3ba7d89269..b4ad500164 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -158,7 +158,7 @@ def lang_select( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 38099a1928..b882cf82bb 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -194,7 +194,7 @@ def repo_level_order( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 5518f0ba1d..4b33493897 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -168,7 +168,7 @@ def doc_chunk( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index e671177a92..8355147931 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -162,7 +162,7 @@ def doc_chunk( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index b75064e795..ce70c27b11 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -164,7 +164,7 @@ def html2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 480f1a7384..00c7e490d4 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -176,7 +176,7 @@ def lang_id( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index b162437625..66dade14c3 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -177,7 +177,7 @@ def lang_id( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index f1796ee9f7..b5e61e67f0 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -172,7 +172,7 @@ def pdf2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index a6f308ea73..3dce876aaa 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -176,7 +176,7 @@ def pdf2parquet( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index fb70f789a0..133a595d4f 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -156,7 +156,7 @@ def pii_redactor( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index f746f4aefd..fde0c89964 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -157,7 +157,7 @@ def text_encoder( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 5e7421490e..223f0434cf 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -158,7 +158,7 @@ def text_encoder( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 985139c92e..9e0a98af56 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -178,7 +178,7 @@ def doc_id( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index 62db57fea6..c11cfc0504 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -149,7 +149,7 @@ def ededup( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index bf45ac1975..b2713c851c 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -209,7 +209,7 @@ def fuzzydedup( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 26ae444892..59cb52cc2d 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -158,7 +158,7 @@ def filtering( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. 
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py index 46d1dba1aa..0880349357 100644 --- a/transforms/universal/hap/kfp_ray/hap_wf.py +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -180,7 +180,7 @@ def hap( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index dd535db5cb..9b59ebbae5 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -149,7 +149,7 @@ def noop( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 0392e9ab57..8fbcde3008 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -151,7 +151,7 @@ def noop( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 6300f62f88..e39fe8c889 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -142,7 +142,7 @@ def profiler( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. 
Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 89007c8bea..64b27c231f 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -165,7 +165,7 @@ def resize( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index 15958665b1..c9e2c5f495 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -192,7 +192,7 @@ def tokenization( # In KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. On another hand we cannot create # a unique string in a component (at runtime) and pass it to the `clean_up_task` of `ExitHandler`, due to # https://github.com/kubeflow/pipelines/issues/10187. Therefore, meantime the user is requested to insert - # a unique string created at compilation time. + # a unique string created at run creation time. if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") From b420d1dfe66eab1ed0241ddec5ed7ea02616c73e Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Wed, 22 Jan 2025 12:49:41 +0200 Subject: [PATCH 08/17] Rename ray_id_KFPv2 pipeline parameter to ray_run_id_KFPv2. 
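Rename the ray_id_KFPv2 pipeline parameter to ray_run_id_KFPv2 in the documentation, the pipeline template, the pipeline test utilities, and all transform workflows, so that the parameter name reflects the run id it supplies. As a minimal sketch of how a caller might pass this parameter when creating a KFP v2 run (the endpoint URL and the import of the noop pipeline function are illustrative assumptions, not part of this patch):

```python
# Illustrative only: create a KFP v2 run with a caller-generated unique id,
# passed through the renamed ray_run_id_KFPv2 parameter.
import uuid

from kfp.client import Client

from noop_wf import noop  # hypothetical import of a pipeline function

client = Client(host="http://localhost:8080")  # assumed KFP endpoint
client.create_run_from_pipeline_func(
    noop,
    arguments={"ray_run_id_KFPv2": uuid.uuid4().hex},  # unique per run
)
```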
Signed-off-by: Revital Sur --- kfp/doc/simple_transform_pipeline.md | 4 ++-- .../pipeline_utils/pipelines_tests_utils.py | 2 +- .../single-pipeline/templates/simple_pipeline.py | 8 ++++---- transforms/code/code2parquet/kfp_ray/code2parquet_wf.py | 6 +++--- transforms/code/code_quality/kfp_ray/code_quality_wf.py | 6 +++--- .../code/header_cleanser/kfp_ray/header_cleanser_wf.py | 6 +++--- .../code/license_select/kfp_ray/license_select_wf.py | 6 +++--- transforms/code/malware/kfp_ray/malware_wf.py | 6 +++--- .../code/proglang_select/kfp_ray/proglang_select_wf.py | 6 +++--- .../repo_level_ordering/kfp_ray/repo_level_order_wf.py | 6 +++--- .../language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py | 6 +++--- transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py | 6 +++--- .../doc_quality/kfp_ray/doc_quality_multiple_wf.py | 6 +++--- transforms/language/doc_quality/kfp_ray/doc_quality_wf.py | 6 +++--- .../language/html2parquet/kfp_ray/html2parquet_wf.py | 6 +++--- .../language/lang_id/kfp_ray/lang_id_multiple_wf.py | 6 +++--- transforms/language/lang_id/kfp_ray/lang_id_wf.py | 6 +++--- .../pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py | 6 +++--- transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py | 6 +++--- .../language/pii_redactor/kfp_ray/pii_redactor_wf.py | 6 +++--- .../text_encoder/kfp_ray/text_encoder_multiple_wf.py | 6 +++--- .../language/text_encoder/kfp_ray/text_encoder_wf.py | 6 +++--- transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 6 +++--- transforms/universal/ededup/kfp_ray/ededup_wf.py | 6 +++--- transforms/universal/fdedup/kfp_ray/fdedup_wf.py | 6 +++--- transforms/universal/filter/kfp_ray/filter_wf.py | 6 +++--- transforms/universal/hap/kfp_ray/hap_wf.py | 6 +++--- transforms/universal/noop/kfp_ray/noop_multiple_wf.py | 6 +++--- transforms/universal/noop/kfp_ray/noop_wf.py | 6 +++--- transforms/universal/profiler/kfp_ray/profiler_wf.py | 6 +++--- transforms/universal/resize/kfp_ray/resize_wf.py | 6 +++--- .../universal/tokenization/kfp_ray/tokenization_wf.py | 6 +++--- 32 files changed, 94 insertions(+), 94 deletions(-) diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index ccb4d16a75..00cd9e2042 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -112,7 +112,7 @@ The input parameters section defines all the parameters required for the pipelin The parameters used here are as follows: * ray_name: name of the Ray cluster -* ray_id_KFPv2: Ray cluster unique ID used only in KFP v2 +* ray_run_id_KFPv2: Ray cluster unique ID used only in KFP v2 * ray_head_options: head node options, containing the following: * cpu - number of cpus * memory - memory @@ -161,7 +161,7 @@ component execution and parameters submitted to every component. 
if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py index 41a392e264..00530406f7 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/pipeline_utils/pipelines_tests_utils.py @@ -67,7 +67,7 @@ def _set_run_id(pipeline_package_path: str): docs = list(yaml.load_all(stream, yaml.FullLoader)) for doc in docs: if "root" in doc: - doc["root"]["inputDefinitions"]["parameters"]["ray_id_KFPv2"]["defaultValue"] = uuid.uuid4().hex + doc["root"]["inputDefinitions"]["parameters"]["ray_run_id_KFPv2"]["defaultValue"] = uuid.uuid4().hex with open(pipeline_package_path, "w") as outfile: yaml.dump_all(docs, outfile) except Exception as e: diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py index 4191f5d462..6a682a0f2e 100644 --- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py +++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py @@ -99,11 +99,11 @@ def {{ pipeline_name }}( ray_name: str = "{{ pipeline_name }}-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed {%- if image_pull_secret != "" %} - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image}, {%- else %} - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, {%- endif %} @@ -132,7 +132,7 @@ def {{ pipeline_name }}( """ Pipeline to execute {{ pipeline_name }} transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -175,7 +175,7 @@ def {{ pipeline_name }}( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index c78afc4a91..7dd7111ceb 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -101,7 +101,7 @@ def compute_exec_params_func( ) def 
code2parquet( ray_name: str = "code2parquet-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -128,7 +128,7 @@ def code2parquet( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -175,7 +175,7 @@ def code2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 466d15f638..58a571e9bc 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -100,7 +100,7 @@ def compute_exec_params_func( def code_quality( # Ray cluster ray_name: str = "code_quality-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -125,7 +125,7 @@ def code_quality( """ Pipeline to execute Code Quality transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -168,7 +168,7 @@ def code_quality( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index fc02f04ea0..b6d15934bf 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -108,7 +108,7 @@ def compute_exec_params_func( def header_cleanser( # Ray cluster ray_name: str = "header_cleanser-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, 
"memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -137,7 +137,7 @@ def header_cleanser( """ Pipeline to execute Header Cleanser transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -184,7 +184,7 @@ def header_cleanser( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 25bcc29f64..55a176a0df 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -94,7 +94,7 @@ def compute_exec_params_func( ) def license_select( ray_name: str = "license_select-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -124,7 +124,7 @@ def license_select( """ Pipeline to execute License Select transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -163,7 +163,7 @@ def license_select( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index ef5a290ab4..9c18c5e301 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -92,7 +92,7 @@ def compute_exec_params_func( ) def malware( ray_name: str = "malware-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -115,7 +115,7 @@ def malware( """ Pipeline to execute malware transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: 
head node options, containing the following: cpu - number of cpus memory - memory @@ -157,7 +157,7 @@ def malware( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index b4ad500164..31bad37987 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -94,7 +94,7 @@ def compute_exec_params_func( ) def lang_select( ray_name: str = "proglang-match-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -118,7 +118,7 @@ def lang_select( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -162,7 +162,7 @@ def lang_select( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index b882cf82bb..8fa1692091 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -111,7 +111,7 @@ def compute_exec_params_func( def repo_level_order( # Ray cluster ray_name: str = "repo_level_order-kfp-ray", - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { "replicas": 2, @@ -148,7 +148,7 @@ def repo_level_order( """ Pipeline to execute repo_level_order transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -198,7 +198,7 @@ def repo_level_order( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py 
b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 4b33493897..7442aabe43 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -96,7 +96,7 @@ def compute_exec_params_func( def doc_chunk( # Ray cluster ray_name: str = "doc-json-chunk-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -128,7 +128,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -172,7 +172,7 @@ def doc_chunk( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 8355147931..975902797e 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -97,7 +97,7 @@ def compute_exec_params_func( def doc_chunk( # Ray cluster ray_name: str = "doc-json-chunk-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -122,7 +122,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -166,7 +166,7 @@ def doc_chunk( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 2830ce32ca..d5a8abc9d6 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def doc_quality( # Ray cluster ray_name: str = "doc_quality-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", + 
ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image, "image_pull_policy": "Always"}, ray_worker_options: dict = { "replicas": 2, @@ -126,7 +126,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -169,7 +169,7 @@ def doc_quality( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index c4d6c7d43c..6ba23c515a 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def doc_quality( # Ray cluster ray_name: str = "doc_quality-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = { "cpu": 1, "memory": 4, @@ -132,7 +132,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -175,7 +175,7 @@ def doc_quality( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index ce70c27b11..2207363d86 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def html2parquet( # Ray cluster ray_name: str = "html2parquet-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -126,7 +126,7 @@ def html2parquet( """ Pipeline to execute html2parquet transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -168,7 +168,7 @@ def html2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute 
simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 00c7e490d4..680e27300a 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -100,7 +100,7 @@ def compute_exec_params_func( def lang_id( # Ray cluster ray_name: str = "lang_id-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -134,7 +134,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -180,7 +180,7 @@ def lang_id( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 66dade14c3..a7cfe45094 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -101,7 +101,7 @@ def compute_exec_params_func( def lang_id( # Ray cluster ray_name: str = "lang_id-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -135,7 +135,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -181,7 +181,7 @@ def lang_id( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index b5e61e67f0..3895489f18 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -98,7 +98,7 @@ def compute_exec_params_func( def 
pdf2parquet( # Ray cluster ray_name: str = "pdf2parquet-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -131,7 +131,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -176,7 +176,7 @@ def pdf2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 3dce876aaa..13a39a1b8e 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -101,7 +101,7 @@ def compute_exec_params_func( def pdf2parquet( # Ray cluster ray_name: str = "pdf2parquet-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -135,7 +135,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -180,7 +180,7 @@ def pdf2parquet( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index 133a595d4f..50ee4c6533 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -90,7 +90,7 @@ def compute_exec_params_func( def pii_redactor( # Ray cluster ray_name: str = "pii-redactor-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -119,7 +119,7 @@ def pii_redactor( """ Pipeline to execute pii_redactor transform :param ray_name: name of the Ray cluster 
- :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -160,7 +160,7 @@ def pii_redactor( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index fde0c89964..06c7a3253b 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -94,7 +94,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -118,7 +118,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -161,7 +161,7 @@ def text_encoder( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 223f0434cf..98011cb15f 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def text_encoder( # Ray cluster ray_name: str = "text-encoder-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -119,7 +119,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - 
memory @@ -162,7 +162,7 @@ def text_encoder( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 9e0a98af56..03cd29b0b9 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -103,7 +103,7 @@ def compute_exec_params_func( def doc_id( # Ray cluster ray_name: str = "doc_id-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -138,7 +138,7 @@ def doc_id( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -182,7 +182,7 @@ def doc_id( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index c11cfc0504..dba40490e9 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -74,7 +74,7 @@ def ededup( # Ray cluster ray_name: str = "ededup-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -108,7 +108,7 @@ def ededup( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -153,7 +153,7 @@ def ededup( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index b2713c851c..33782b07bb 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -110,7 +110,7 @@ def fuzzydedup( # folders used # Ray cluster ray_name: str = 
"fuzzydedup-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = { "cpu": 8, @@ -161,7 +161,7 @@ def fuzzydedup( """ Pipeline to execute FDEDUP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -213,7 +213,7 @@ def fuzzydedup( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 59cb52cc2d..6b2f87f971 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -95,7 +95,7 @@ def compute_exec_params_func( def filtering( # Ray cluster ray_name: str = "filter-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -119,7 +119,7 @@ def filtering( """ Pipeline to execute Filtering transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -162,7 +162,7 @@ def filtering( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py index 0880349357..01f943bbc4 100644 --- a/transforms/universal/hap/kfp_ray/hap_wf.py +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -103,7 +103,7 @@ def compute_exec_params_func( def hap( # Ray cluster ray_name: str = "hap-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -138,7 +138,7 @@ def hap( """ Pipeline to execute hap transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used 
only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -184,7 +184,7 @@ def hap( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 9b59ebbae5..4e25fd17ee 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -90,7 +90,7 @@ def compute_exec_params_func( def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -112,7 +112,7 @@ def noop( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -153,7 +153,7 @@ def noop( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 8fbcde3008..e4057632d6 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -92,7 +92,7 @@ def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed - ray_id_KFPv2: str = "", + ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -114,7 +114,7 @@ def noop( """ Pipeline to execute noop transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -155,7 +155,7 @@ def noop( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git 
a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index e39fe8c889..f5dcefd355 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -78,7 +78,7 @@ def profiler( # Ray cluster ray_name: str = "profiler-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -103,7 +103,7 @@ def profiler( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -146,7 +146,7 @@ def profiler( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 64b27c231f..af917f1f6f 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -99,7 +99,7 @@ def compute_exec_params_func( def resize( # Ray cluster ray_name: str = "resize-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, @@ -126,7 +126,7 @@ def resize( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -169,7 +169,7 @@ def resize( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c9e2c5f495..bbcb4a6a14 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -116,7 +116,7 @@ def compute_exec_params_func( def tokenization( # Ray cluster ray_name: str = "tkn-kfp-ray", # name of Ray cluster - ray_id_KFPv2: str = "", # Ray cluster 
unique ID used only in KFP v2 + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = { @@ -150,7 +150,7 @@ def tokenization( """ Pipeline to execute tokenization transform :param ray_name: name of the Ray cluster - :param ray_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -196,7 +196,7 @@ def tokenization( if os.getenv("KFPv2", "0") == "1": print("WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " "same version of the same pipeline !!!") - run_id = ray_id_KFPv2 + run_id = ray_run_id_KFPv2 else: run_id = dsl.RUN_ID_PLACEHOLDER # create clean_up task From c6af2880a69e1f87c8f14c32aac5d0a991b720da Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 09:25:14 +0200 Subject: [PATCH 09/17] Address review comments. Signed-off-by: Revital Sur --- kfp/doc/simple_transform_pipeline.md | 25 ++++++++++++++----- .../example/pipeline_definitions.yaml | 2 +- .../templates/simple_pipeline.py | 5 ++-- .../code2parquet/kfp_ray/code2parquet_wf.py | 2 +- .../code_quality/kfp_ray/code_quality_wf.py | 2 +- .../kfp_ray/header_cleanser_wf.py | 2 +- .../kfp_ray/license_select_wf.py | 2 +- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- .../kfp_ray/proglang_select_wf.py | 2 +- .../kfp_ray/repo_level_order_wf.py | 2 +- .../kfp_ray/doc_chunk_multiple_wf.py | 2 +- .../doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../kfp_ray/doc_quality_multiple_wf.py | 2 +- .../doc_quality/kfp_ray/doc_quality_wf.py | 2 +- .../html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- .../language/lang_id/kfp_ray/lang_id_wf.py | 2 +- .../kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../pii_redactor/kfp_ray/pii_redactor_wf.py | 2 +- .../kfp_ray/text_encoder_multiple_wf.py | 2 +- .../text_encoder/kfp_ray/text_encoder_wf.py | 2 +- .../universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- .../universal/ededup/kfp_ray/ededup_wf.py | 2 +- .../universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- .../universal/filter/kfp_ray/filter_wf.py | 2 +- transforms/universal/hap/kfp_ray/hap_wf.py | 2 +- .../noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 7 ++++-- .../universal/profiler/kfp_ray/profiler_wf.py | 2 +- .../universal/resize/kfp_ray/resize_wf.py | 2 +- .../tokenization/kfp_ray/tokenization_wf.py | 2 +- 32 files changed, 55 insertions(+), 40 deletions(-) diff --git a/kfp/doc/simple_transform_pipeline.md b/kfp/doc/simple_transform_pipeline.md index 00cd9e2042..633c820593 100644 --- a/kfp/doc/simple_transform_pipeline.md +++ b/kfp/doc/simple_transform_pipeline.md @@ -41,7 +41,9 @@ Note: the project and the explanation below are based on [KFPv1](https://www.kub import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl +import os from kfp_support.workflow_support.runtime_utils import ( + DEFAULT_KFP_COMPONENT_SPEC_PATH, ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils, @@ -56,7 +58,8 @@ Ray cluster. 
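The `DEFAULT_KFP_COMPONENT_SPEC_PATH` import added here feeds the `KFP_COMPONENT_SPEC_PATH` lookup introduced in the components section below, replacing the hard-coded relative paths to the component YAML files. Because the pipeline joins that path to the file names by plain string concatenation, any override has to end with a trailing slash; a minimal sketch (the local path is an example, not part of this patch):

```python
import os

# Must end with "/": the pipeline builds paths by concatenation, e.g.
# component_spec_path + "createRayClusterComponent.yaml".
os.environ["KFP_COMPONENT_SPEC_PATH"] = "../../kfp/kfp_ray_components/"
```

The variable is read when the workflow module is loaded, so it should be set before importing or compiling the pipeline.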
For each step we have to define a component that will execute them: ```python # components - base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.0.2" + base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + component_spec_path = os.getenv("KFP_COMPONENT_SPEC_PATH", DEFAULT_KFP_COMPONENT_SPEC_PATH) # KFPv1 and KFP2 uses different methods to create a component from a function. KFPv1 uses the # `create_component_from_func` function, but it is deprecated by KFPv2 and so has a different import path. # KFPv2 recommends using the `@dsl.component` decorator, which doesn't exist in KFPv1. Therefore, here we use @@ -68,11 +71,11 @@ Ray cluster. For each step we have to define a component that will execute them: else: compute_exec_params_op = comp.create_component_from_func(func=compute_exec_params_func, base_image=base_kfp_image) # create Ray cluster - create_ray_op = comp.load_component_from_file("../../../kfp_ray_components/createRayComponent.yaml") + create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") # execute job - execute_ray_jobs_op = comp.load_component_from_file("../../../kfp_ray_components/executeRayJobComponent.yaml") + execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") # clean up Ray - cleanup_ray_op = comp.load_component_from_file("../../../kfp_ray_components/cleanupRayComponent.yaml") + cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. TASK_NAME: str = "noop" ``` @@ -89,6 +92,7 @@ The input parameters section defines all the parameters required for the pipelin ```python # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster + ray_run_id_KFPv2: str = "", ray_head_options: str = '{"cpu": 1, "memory": 4, \ "image": "' + task_image + '" }', ray_worker_options: str = '{"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, \ @@ -99,6 +103,7 @@ The input parameters section defines all the parameters required for the pipelin data_s3_access_secret: str = "s3-secret", data_max_files: int = -1, data_num_samples: int = -1, + data_checkpointing: bool = False, # orchestrator actor_options: str = "{'num_cpus': 0.8}", pipeline_id: str = "pipeline_id", @@ -171,8 +176,16 @@ component execution and parameters submitted to every component. 
with dsl.ExitHandler(clean_up_task): # compute execution params compute_exec_params = compute_exec_params_op( - worker_options=ray_worker_options, - actor_options=actor_options, + worker_options=ray_worker_options, + actor_options=runtime_actor_options, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + data_checkpointing=data_checkpointing, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + noop_sleep_sec=noop_sleep_sec, ) ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) # start Ray cluster diff --git a/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml b/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml index d703d36ca0..4f8088b19c 100644 --- a/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml +++ b/kfp/pipeline_generator/single-pipeline/example/pipeline_definitions.yaml @@ -1,7 +1,7 @@ pipeline_parameters: name: "noop" description: "Pipeline for noop task" - script_name: "noop_transform.py" + script_name: "-m dpk_noop.ray.runtime" prefix: "" multi_s3: False compute_func_name: "" diff --git a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py index 6a682a0f2e..8d8207fc88 100644 --- a/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py +++ b/kfp/pipeline_generator/single-pipeline/templates/simple_pipeline.py @@ -97,13 +97,12 @@ def compute_exec_params_func( def {{ pipeline_name }}( # Ray cluster ray_name: str = "{{ pipeline_name }}-kfp-ray", # name of Ray cluster + ray_run_id_KFPv2: str = "", # Ray cluster unique ID used only in KFP v2 # Add image_pull_secret and image_pull_policy to ray workers if needed {%- if image_pull_secret != "" %} - ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image_pull_secret": "{{ image_pull_secret }}", "image": task_image}, {%- else %} - ray_run_id_KFPv2: str = "", ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, {%- endif %} @@ -132,7 +131,7 @@ def {{ pipeline_name }}( """ Pipeline to execute {{ pipeline_name }} transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index 7dd7111ceb..c5cba02308 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -128,7 +128,7 @@ def code2parquet( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
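KFP v2 cannot mint the run id inside the pipeline, so `ray_run_id_KFPv2` has to be supplied by whoever compiles or submits it, and the value must differ between concurrent runs, as the WARNING print above stresses. One hedged way to do that from the KFP SDK (the client host and import path are placeholders; `create_run_from_pipeline_func` is the standard client API, shown here with the noop pipeline as an example):

```python
import uuid

import kfp

from noop_wf import noop  # assumed import path for the pipeline function

client = kfp.Client(host="http://localhost:8080")  # placeholder endpoint
client.create_run_from_pipeline_func(
    noop,  # any of the *_wf pipeline functions changed in this patch
    arguments={"ray_run_id_KFPv2": uuid.uuid4().hex},  # fresh id per run
)
```

Reusing one id for two simultaneous runs recreates exactly the cluster-name collision the warning guards against.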
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 58a571e9bc..6aa4dc82c2 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -125,7 +125,7 @@ def code_quality( """ Pipeline to execute Code Quality transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index b6d15934bf..0f64bd4b0c 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -137,7 +137,7 @@ def header_cleanser( """ Pipeline to execute Header Cleanser transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 55a176a0df..f29f1c839c 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -124,7 +124,7 @@ def license_select( """ Pipeline to execute License Select transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index 9c18c5e301..77f5b56b65 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -115,7 +115,7 @@ def malware( """ Pipeline to execute malware transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 31bad37987..5a6d1d20cd 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -118,7 +118,7 @@ def lang_select( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 8fa1692091..4e753ecefb 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -148,7 +148,7 @@ def repo_level_order( """ Pipeline to execute repo_level_order transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 7442aabe43..f0408e2851 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -128,7 +128,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 975902797e..f6670ebef0 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -122,7 +122,7 @@ def doc_chunk( """ Pipeline to execute chunk documents transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index d5a8abc9d6..d85430259b 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -126,7 +126,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index 6ba23c515a..bd3a358944 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -132,7 +132,7 @@ def doc_quality( """ Pipeline to execute Document Quality transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 2207363d86..855eac46d7 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -126,7 +126,7 @@ def html2parquet( """ Pipeline to execute html2parquet transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index 680e27300a..1faaa0d2ed 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -134,7 +134,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index a7cfe45094..a9e39edac2 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -135,7 +135,7 @@ def lang_id( """ Pipeline to execute Language Identification transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 3895489f18..8b8797c3d3 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -131,7 +131,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index 13a39a1b8e..e9ee062380 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -135,7 +135,7 @@ def pdf2parquet( """ Pipeline to execute PDF2PARQUET transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py index 50ee4c6533..a3fd5e1eff 100644 --- a/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py +++ b/transforms/language/pii_redactor/kfp_ray/pii_redactor_wf.py @@ -119,7 +119,7 @@ def pii_redactor( """ Pipeline to execute pii_redactor transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index 06c7a3253b..a515b7c268 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -118,7 +118,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index 98011cb15f..0b51e7f87b 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -119,7 +119,7 @@ def text_encoder( """ Pipeline to execute TextEncoder transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 03cd29b0b9..0b9ccd42d0 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -138,7 +138,7 @@ def doc_id( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index dba40490e9..941678d60e 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -108,7 +108,7 @@ def ededup( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 33782b07bb..4f8e067d9a 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -161,7 +161,7 @@ def fuzzydedup( """ Pipeline to execute FDEDUP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index 6b2f87f971..167b862bc1 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -119,7 +119,7 @@ def filtering( """ Pipeline to execute Filtering transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/hap/kfp_ray/hap_wf.py b/transforms/universal/hap/kfp_ray/hap_wf.py index 01f943bbc4..37e377abd7 100644 --- a/transforms/universal/hap/kfp_ray/hap_wf.py +++ b/transforms/universal/hap/kfp_ray/hap_wf.py @@ -138,7 +138,7 @@ def hap( """ Pipeline to execute hap transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 5b3e2e41fe..ae614a2b26 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -112,7 +112,7 @@ def noop( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index 748ceb52b1..1b65ad85d9 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -42,6 +42,7 @@ def compute_exec_params_func( data_s3_config: str, data_max_files: int, data_num_samples: int, + data_checkpointing: bool, runtime_pipeline_id: str, runtime_job_id: str, runtime_code_location: dict, @@ -53,6 +54,7 @@ def compute_exec_params_func( "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, + "data_checkpointing": data_checkpointing, "runtime_num_workers": KFPUtils.default_compute_execution_params(str(worker_options), str(actor_options)), "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, @@ -91,8 +93,8 @@ def compute_exec_params_func( def noop( # Ray cluster ray_name: str = "noop-kfp-ray", # name of Ray cluster - # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed ray_run_id_KFPv2: str = "", + # Add image_pull_secret, image_pull_policy and tolerations to ray options if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", @@ -114,7 +116,7 @@ def noop( """ Pipeline to execute noop transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory @@ -172,6 +174,7 @@ def noop( data_s3_config=data_s3_config, data_max_files=data_max_files, data_num_samples=data_num_samples, + data_checkpointing=data_checkpointing, runtime_pipeline_id=runtime_pipeline_id, runtime_job_id=run_id, runtime_code_location=runtime_code_location, diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 53c9ab5905..52d2a4dc11 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -103,7 +103,7 @@ def profiler( """ Pipeline to execute EDEDUP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 071d69bdca..bb958688a8 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -126,7 +126,7 @@ def resize( """ Pipeline to execute NOOP transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. 
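The `compute_exec_params_func` changes above thread `data_checkpointing` through to the transform and keep deriving `runtime_num_workers` via `KFPUtils.default_compute_execution_params`. For orientation only, a simplified sketch of that sizing idea; the real helper lives in the shared workflow-support library and may weigh resources differently (the 0.85 headroom below is an assumption):

```python
import json

def approx_num_workers(worker_options: str, actor_options: str) -> str:
    # Sketch: CPUs offered by the worker pool divided by the CPUs each
    # Ray actor requests, leaving some headroom for Ray itself.
    workers = json.loads(worker_options.replace("'", '"'))
    actors = json.loads(actor_options.replace("'", '"'))
    cluster_cpus = workers["replicas"] * workers["cpu"]
    per_actor = actors.get("num_cpus", 1)
    return str(max(1, int(cluster_cpus * 0.85 / per_actor)))

print(approx_num_workers('{"replicas": 2, "cpu": 2}',   # subset of ray_worker_options
                         "{'num_cpus': 0.8}"))          # default actor_options -> "4"
```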
:param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index bbcb4a6a14..5e3c23ddee 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -150,7 +150,7 @@ def tokenization( """ Pipeline to execute tokenization transform :param ray_name: name of the Ray cluster - :param ray_run_id_KFPv2: string holding the id used for the Ray cluster used only in KFP v2 + :param ray_run_id_KFPv2: a unique string id used for the Ray cluster, applicable only in KFP v2. :param ray_head_options: head node options, containing the following: cpu - number of cpus memory - memory From d7fa55d8110b093a52f16dbf1ed6dad9ce467813 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 14:00:38 +0200 Subject: [PATCH 10/17] Address review comments. Signed-off-by: Revital Sur --- kfp/kfp_ray_components/Dockerfile | 2 +- tools/ingest2parquet/Dockerfile | 2 +- transforms/Dockerfile.ray.template | 2 +- transforms/code/code2parquet/ray/Dockerfile | 2 +- transforms/code/code_profiler/Dockerfile.ray | 2 +- transforms/code/code_quality/ray/Dockerfile | 2 +- transforms/code/header_cleanser/ray/Dockerfile | 2 +- transforms/code/license_select/ray/Dockerfile | 2 +- transforms/code/malware/ray/Dockerfile | 2 +- transforms/code/proglang_select/ray/Dockerfile | 2 +- transforms/code/repo_level_ordering/ray/Dockerfile | 2 +- transforms/language/doc_chunk/Dockerfile.ray | 2 +- transforms/language/doc_quality/Dockerfile.ray | 2 +- transforms/language/html2parquet/Dockerfile.ray | 2 +- transforms/language/lang_id/Dockerfile.ray | 2 +- transforms/language/pdf2parquet/Dockerfile.ray | 2 +- transforms/language/pii_redactor/Dockerfile.ray | 2 +- transforms/language/text_encoder/Dockerfile.ray | 2 +- transforms/universal/doc_id/Dockerfile.ray | 2 +- transforms/universal/ededup/Dockerfile.ray | 2 +- transforms/universal/fdedup/Dockerfile.ray | 2 +- transforms/universal/filter/Dockerfile.ray | 2 +- transforms/universal/hap/Dockerfile.ray | 2 +- transforms/universal/profiler/Dockerfile.ray | 2 +- transforms/universal/resize/Dockerfile.ray | 2 +- transforms/universal/tokenization/Dockerfile.ray | 2 +- 26 files changed, 26 insertions(+), 26 deletions(-) diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile index 5b51b0d2af..dd3670c521 100644 --- a/kfp/kfp_ray_components/Dockerfile +++ b/kfp/kfp_ray_components/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install libraries diff --git a/tools/ingest2parquet/Dockerfile b/tools/ingest2parquet/Dockerfile index a4319c1052..02bdebb847 100644 --- a/tools/ingest2parquet/Dockerfile +++ b/tools/ingest2parquet/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install pytest diff --git a/transforms/Dockerfile.ray.template b/transforms/Dockerfile.ray.template index b8e52425b0..30b1da9594 100644 --- 
a/transforms/Dockerfile.ray.template +++ b/transforms/Dockerfile.ray.template @@ -3,7 +3,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index cf363def4c..1f683ed041 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/code_profiler/Dockerfile.ray b/transforms/code/code_profiler/Dockerfile.ray index c308c284c5..440b7f9779 100644 --- a/transforms/code/code_profiler/Dockerfile.ray +++ b/transforms/code/code_profiler/Dockerfile.ray @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile index 2127cfd818..e06ee8c7ac 100644 --- a/transforms/code/code_quality/ray/Dockerfile +++ b/transforms/code/code_quality/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index b5fc809aae..21bd02b2b0 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -2,7 +2,7 @@ FROM docker.io/rayproject/ray:2.24.0-py310 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray # install pytest diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index 184747ff59..49ada2fda8 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images USER root -RUN chown ray:root /home/ray && chmod 775 /home/ray +RUN chown ray:root /home/ray && chmod -R g=u /home/ray USER ray RUN pip install --upgrade --no-cache-dir pip diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile index 1c231275f4..284c9fac4b 100644 --- a/transforms/code/malware/ray/Dockerfile +++ b/transforms/code/malware/ray/Dockerfile @@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} AS base # see 
https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile
index f13ea34442..f7ef64f170 100644
--- a/transforms/code/proglang_select/ray/Dockerfile
+++ b/transforms/code/proglang_select/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile
index 79806dd73b..6b308fdbf7 100644
--- a/transforms/code/repo_level_ordering/ray/Dockerfile
+++ b/transforms/code/repo_level_ordering/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/language/doc_chunk/Dockerfile.ray b/transforms/language/doc_chunk/Dockerfile.ray
index 63f2981c3d..f0514943b4 100644
--- a/transforms/language/doc_chunk/Dockerfile.ray
+++ b/transforms/language/doc_chunk/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 # install pytest
diff --git a/transforms/language/doc_quality/Dockerfile.ray b/transforms/language/doc_quality/Dockerfile.ray
index 6b4ce18a15..fe0fe5b3b2 100644
--- a/transforms/language/doc_quality/Dockerfile.ray
+++ b/transforms/language/doc_quality/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/language/html2parquet/Dockerfile.ray b/transforms/language/html2parquet/Dockerfile.ray
index f246116f47..43ddfa97d3 100644
--- a/transforms/language/html2parquet/Dockerfile.ray
+++ b/transforms/language/html2parquet/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/language/lang_id/Dockerfile.ray b/transforms/language/lang_id/Dockerfile.ray
index ce81c320fc..8e2de45ba8 100644
--- a/transforms/language/lang_id/Dockerfile.ray
+++ b/transforms/language/lang_id/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/language/pdf2parquet/Dockerfile.ray b/transforms/language/pdf2parquet/Dockerfile.ray
index e295a9e7fe..3a11b7ee69 100644
--- a/transforms/language/pdf2parquet/Dockerfile.ray
+++ b/transforms/language/pdf2parquet/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/language/pii_redactor/Dockerfile.ray b/transforms/language/pii_redactor/Dockerfile.ray
index a95ce7cbe7..93b6bf4202 100644
--- a/transforms/language/pii_redactor/Dockerfile.ray
+++ b/transforms/language/pii_redactor/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/language/text_encoder/Dockerfile.ray b/transforms/language/text_encoder/Dockerfile.ray
index 4b6bee7917..9a6fa04d8a 100644
--- a/transforms/language/text_encoder/Dockerfile.ray
+++ b/transforms/language/text_encoder/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 # install pytest
diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray
index b8e52425b0..30b1da9594 100644
--- a/transforms/universal/doc_id/Dockerfile.ray
+++ b/transforms/universal/doc_id/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/universal/ededup/Dockerfile.ray b/transforms/universal/ededup/Dockerfile.ray
index 2584f79793..10faaf2978 100644
--- a/transforms/universal/ededup/Dockerfile.ray
+++ b/transforms/universal/ededup/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray
index da1c668f1e..8d36e6a35f 100644
--- a/transforms/universal/fdedup/Dockerfile.ray
+++ b/transforms/universal/fdedup/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/universal/filter/Dockerfile.ray b/transforms/universal/filter/Dockerfile.ray
index b8e52425b0..30b1da9594 100644
--- a/transforms/universal/filter/Dockerfile.ray
+++ b/transforms/universal/filter/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/universal/hap/Dockerfile.ray b/transforms/universal/hap/Dockerfile.ray
index b8e52425b0..30b1da9594 100644
--- a/transforms/universal/hap/Dockerfile.ray
+++ b/transforms/universal/hap/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/universal/profiler/Dockerfile.ray b/transforms/universal/profiler/Dockerfile.ray
index b8e52425b0..30b1da9594 100644
--- a/transforms/universal/profiler/Dockerfile.ray
+++ b/transforms/universal/profiler/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/universal/resize/Dockerfile.ray b/transforms/universal/resize/Dockerfile.ray
index b8e52425b0..30b1da9594 100644
--- a/transforms/universal/resize/Dockerfile.ray
+++ b/transforms/universal/resize/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
diff --git a/transforms/universal/tokenization/Dockerfile.ray b/transforms/universal/tokenization/Dockerfile.ray
index 50e6ff7a30..0469e7d9ba 100644
--- a/transforms/universal/tokenization/Dockerfile.ray
+++ b/transforms/universal/tokenization/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod 775 /home/ray
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip

From 1fdb7fa6a7089cff8c935cad197d0c64571fe4c9 Mon Sep 17 00:00:00 2001
From: Revital Sur
Date: Sun, 26 Jan 2025 14:18:49 +0200
Subject: [PATCH 11/17] Minor fix.

Signed-off-by: Revital Sur
---
 transforms/universal/noop/Dockerfile.ray | 33 ++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 transforms/universal/noop/Dockerfile.ray

diff --git a/transforms/universal/noop/Dockerfile.ray b/transforms/universal/noop/Dockerfile.ray
new file mode 100644
index 0000000000..30b1da9594
--- /dev/null
+++ b/transforms/universal/noop/Dockerfile.ray
@@ -0,0 +1,33 @@
+ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310
+FROM ${BASE_IMAGE}
+
+# see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
+USER root
+RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+USER ray
+
+RUN pip install --upgrade --no-cache-dir pip
+
+# install pytest
+RUN pip install --no-cache-dir pytest
+ARG DPK_WHEEL_FILE_NAME
+ARG TRANSFORM_NAME
+
+# Copy and install data processing libraries
+# These are expected to be placed in the docker context before this is run (see the make image).
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
+
+
+COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Set environment
+ENV PYTHONPATH /home/ray
+
+# Put these at the end since they seem to upset the docker cache.
+ARG BUILD_DATE
+ARG GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+LABEL git-commit=$GIT_COMMIT

From 535048176057f7ede9358431225fa5a8971aa0a0 Mon Sep 17 00:00:00 2001
From: Revital Sur
Date: Sun, 26 Jan 2025 16:49:21 +0200
Subject: [PATCH 12/17] Address review comments.

Signed-off-by: Revital Sur
---
 kfp/kfp_ray_components/Dockerfile                | 10 +++++-----
 tools/ingest2parquet/Dockerfile                  | 10 +++++-----
 transforms/Dockerfile.ray.template               |  8 ++++----
 transforms/code/code2parquet/ray/Dockerfile      | 14 +++++++-------
 transforms/code/code_profiler/Dockerfile.ray     |  8 ++++----
 transforms/code/code_quality/ray/Dockerfile      | 18 +++++++++---------
 transforms/code/header_cleanser/ray/Dockerfile   | 12 ++++++------
 transforms/code/license_select/ray/Dockerfile    | 16 ++++++++--------
 transforms/code/malware/ray/Dockerfile           | 18 +++++++++---------
 transforms/code/proglang_select/ray/Dockerfile   | 10 +++++-----
 .../code/repo_level_ordering/ray/Dockerfile      | 10 +++++-----
 transforms/language/doc_chunk/Dockerfile.ray     |  8 ++++----
 transforms/language/doc_quality/Dockerfile.ray   |  8 ++++----
 .../language/html2parquet/Dockerfile.ray         |  8 ++++----
 transforms/language/lang_id/Dockerfile.ray       |  8 ++++----
 transforms/language/pdf2parquet/Dockerfile.ray   |  8 ++++----
 .../language/pii_redactor/Dockerfile.ray         |  4 ++--
 .../language/text_encoder/Dockerfile.ray         |  8 ++++----
 transforms/universal/doc_id/Dockerfile.ray       |  8 ++++----
 transforms/universal/ededup/Dockerfile.ray       | 10 +++++-----
 transforms/universal/fdedup/Dockerfile.ray       |  8 ++++----
 transforms/universal/filter/Dockerfile.ray       |  8 ++++----
 transforms/universal/hap/Dockerfile.ray          |  8 ++++----
 transforms/universal/noop/Dockerfile.ray         |  8 ++++----
 transforms/universal/profiler/Dockerfile.ray     |  8 ++++----
 transforms/universal/resize/Dockerfile.ray       |  8 ++++----
 .../universal/tokenization/Dockerfile.ray        |  8 ++++----
 27 files changed, 130 insertions(+), 130 deletions(-)

diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile
index dd3670c521..f33c415f65 100644
--- a/kfp/kfp_ray_components/Dockerfile
+++ b/kfp/kfp_ray_components/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 # install libraries
@@ -15,13 +15,13 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/
+COPY --chmod=g=u --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/
 RUN cd shared_workflow_support_lib && pip install --no-cache-dir -e .
 
-COPY --chmod=775 --chown=ray:root workflow_support_lib workflow_support_lib/
+COPY --chmod=g=u --chown=ray:root workflow_support_lib workflow_support_lib/
 RUN cd workflow_support_lib && pip install --no-cache-dir -e .
 
 # overwriting the installation of old versions of pydantic
@@ -30,7 +30,7 @@ RUN pip install --no-cache-dir pydantic==2.6.3
 # remove credentials-containing file
 RUN rm requirements.txt
 # components
-COPY --chmod=775 --chown=ray:root ./src /pipelines/component/src
+COPY --chmod=g=u --chown=ray:root ./src /pipelines/component/src
 
 # Set environment
 ENV KFP_v2=$KFP_v2
diff --git a/tools/ingest2parquet/Dockerfile b/tools/ingest2parquet/Dockerfile
index 02bdebb847..6809535d25 100644
--- a/tools/ingest2parquet/Dockerfile
+++ b/tools/ingest2parquet/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 # install pytest
@@ -13,7 +13,7 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 COPY requirements.txt requirements.txt
@@ -21,9 +21,9 @@ RUN pip install --no-cache-dir -r requirements.txt
 RUN rm requirements.txt
 # copy source
-COPY --chmod=775 --chown=ray:root ./src .
+COPY --chmod=g=u --chown=ray:root ./src .
 # copy test
-COPY --chmod=775 --chown=ray:root test/ test/
-COPY --chmod=775 --chown=ray:root test-data/ test-data/
+COPY --chmod=g=u --chown=ray:root test/ test/
+COPY --chmod=g=u --chown=ray:root test-data/ test-data/
 
 # Set environment
 ENV PYTHONPATH /home/ray
diff --git a/transforms/Dockerfile.ray.template b/transforms/Dockerfile.ray.template
index 30b1da9594..07a22fac7e 100644
--- a/transforms/Dockerfile.ray.template
+++ b/transforms/Dockerfile.ray.template
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile
index 1f683ed041..f3e091c62b 100644
--- a/transforms/code/code2parquet/ray/Dockerfile
+++ b/transforms/code/code2parquet/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -16,22 +16,22 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
+COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
 # Install ray project source
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=g=u --chown=ray:root src/ src/
+COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
-COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py .
+COPY --chmod=g=u --chown=ray:root src/code2parquet_transform_ray.py .
 
 # copy some of the samples in
-COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/
+COPY --chmod=g=u --chown=ray:root src/code2parquet_local_ray.py local/
 
 # copy test
 COPY test/ test/
diff --git a/transforms/code/code_profiler/Dockerfile.ray b/transforms/code/code_profiler/Dockerfile.ray
index 440b7f9779..0d501f5477 100644
--- a/transforms/code/code_profiler/Dockerfile.ray
+++ b/transforms/code/code_profiler/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=775 --chown=ray:root dpk_code_profiler/ dpk_code_profiler/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_code_profiler/ dpk_code_profiler/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
 # Set environment
diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile
index e06ee8c7ac..f34572b272 100644
--- a/transforms/code/code_quality/ray/Dockerfile
+++ b/transforms/code/code_quality/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -19,28 +19,28 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
+COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
 #COPY requirements.txt requirements.txt
 #RUN pip install --no-cache-dir -r requirements.txt
 
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=g=u --chown=ray:root src/ src/
+COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
-COPY --chmod=775 --chown=ray:root ./src/code_quality_transform_ray.py .
+COPY --chmod=g=u --chown=ray:root ./src/code_quality_transform_ray.py .
 
 # copy some of the samples in
-COPY --chmod=775 --chown=ray:root ./src/code_quality_local_ray.py local/
+COPY --chmod=g=u --chown=ray:root ./src/code_quality_local_ray.py local/
 
 # copy test
-COPY --chmod=775 --chown=ray:root test/ test/
-COPY --chmod=775 --chown=ray:root test-data/ test-data/
+COPY --chmod=g=u --chown=ray:root test/ test/
+COPY --chmod=g=u --chown=ray:root test-data/ test-data/
 
 # Set environment
 ENV PYTHONPATH /home/ray
diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile
index 21bd02b2b0..465b8c7a91 100644
--- a/transforms/code/header_cleanser/ray/Dockerfile
+++ b/transforms/code/header_cleanser/ray/Dockerfile
@@ -2,7 +2,7 @@ FROM docker.io/rayproject/ray:2.24.0-py310
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 # install pytest
@@ -12,14 +12,14 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform
+COPY --chmod=g=u --chown=ray:root python-transform/ python-transform
 RUN cd python-transform && pip install --no-cache-dir -e .
 
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=g=u --chown=ray:root src/ src/
+COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # Install system dependencies, including libgomp1
@@ -31,7 +31,7 @@ User ray
 
 # copy source data
 COPY ./src/header_cleanser_transform_ray.py .
-COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/
+COPY --chmod=g=u --chown=ray:root src/header_cleanser_local_ray.py local/
 
 # copy test
 COPY test/ test/
diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile
index 49ada2fda8..c4604d2758 100644
--- a/transforms/code/license_select/ray/Dockerfile
+++ b/transforms/code/license_select/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,20 +15,20 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
+COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
-COPY --chmod=775 --chown=ray:root README.md README.md
+COPY --chmod=g=u --chown=ray:root src/ src/
+COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=g=u --chown=ray:root README.md README.md
 RUN pip install --no-cache-dir -e .
 
 # copy source data
-COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py .
-COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/
+COPY --chmod=g=u --chown=ray:root src/license_select_transform_ray.py .
+COPY --chmod=g=u --chown=ray:root src/license_select_local_ray.py local/
 
 # copy test
 COPY test/ test/
diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile
index 284c9fac4b..2d2dd5e101 100644
--- a/transforms/code/malware/ray/Dockerfile
+++ b/transforms/code/malware/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE} AS base
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -45,24 +45,24 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
+COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=g=u --chown=ray:root src/ src/
+COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
-COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./
+COPY --chmod=g=u --chown=ray:root src/malware_transform_ray.py ./
 
 # copy some of the samples in
-COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/
+COPY --chmod=g=u --chown=ray:root src/malware_local_ray.py local/
 
-COPY --chmod=775 --chown=ray:root test/ test/
-COPY --chmod=775 --chown=ray:root test-data/ test-data/
+COPY --chmod=g=u --chown=ray:root test/ test/
+COPY --chmod=g=u --chown=ray:root test-data/ test-data/
 
 
 ENV PYTHONPATH /home/ray
diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile
index f7ef64f170..65ff9b15e4 100644
--- a/transforms/code/proglang_select/ray/Dockerfile
+++ b/transforms/code/proglang_select/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,17 +15,17 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
+COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
 #COPY requirements.txt requirements.txt
 #RUN pip install --no-cache-dir -r requirements.txt
 
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=g=u --chown=ray:root src/ src/
+COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile
index 6b308fdbf7..69bd33f098 100644
--- a/transforms/code/repo_level_ordering/ray/Dockerfile
+++ b/transforms/code/repo_level_ordering/ray/Dockerfile
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root src/ src/
-COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
-COPY --chmod=775 --chown=ray:root README.md README.md
+COPY --chmod=g=u --chown=ray:root src/ src/
+COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=g=u --chown=ray:root README.md README.md
 RUN pip install --no-cache-dir -e .
 
 # copy source data
diff --git a/transforms/language/doc_chunk/Dockerfile.ray b/transforms/language/doc_chunk/Dockerfile.ray
index f0514943b4..d2b4d464c6 100644
--- a/transforms/language/doc_chunk/Dockerfile.ray
+++ b/transforms/language/doc_chunk/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 # install pytest
@@ -14,12 +14,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/language/doc_quality/Dockerfile.ray b/transforms/language/doc_quality/Dockerfile.ray
index fe0fe5b3b2..f1f6858b2c 100644
--- a/transforms/language/doc_quality/Dockerfile.ray
+++ b/transforms/language/doc_quality/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=775 --chown=ray:root dpk_doc_quality/ dpk_doc_quality/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_doc_quality/ dpk_doc_quality/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
 # Set environment
diff --git a/transforms/language/html2parquet/Dockerfile.ray b/transforms/language/html2parquet/Dockerfile.ray
index 43ddfa97d3..9ed6c8e506 100644
--- a/transforms/language/html2parquet/Dockerfile.ray
+++ b/transforms/language/html2parquet/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=775 --chown=ray:root dpk_html2parquet/ dpk_html2parquet/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_html2parquet/ dpk_html2parquet/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
 # Set environment
diff --git a/transforms/language/lang_id/Dockerfile.ray b/transforms/language/lang_id/Dockerfile.ray
index 8e2de45ba8..91b05d6ee0 100644
--- a/transforms/language/lang_id/Dockerfile.ray
+++ b/transforms/language/lang_id/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -21,12 +21,12 @@ USER ray
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root dpk_lang_id/ dpk_lang_id/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_lang_id/ dpk_lang_id/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 
 # clean up apt
diff --git a/transforms/language/pdf2parquet/Dockerfile.ray b/transforms/language/pdf2parquet/Dockerfile.ray
index 3a11b7ee69..f3b03f596e 100644
--- a/transforms/language/pdf2parquet/Dockerfile.ray
+++ b/transforms/language/pdf2parquet/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -23,13 +23,13 @@ RUN \
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=775 --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt
 
 
 
diff --git a/transforms/language/pii_redactor/Dockerfile.ray b/transforms/language/pii_redactor/Dockerfile.ray
index 93b6bf4202..40d6f8e6a6 100644
--- a/transforms/language/pii_redactor/Dockerfile.ray
+++ b/transforms/language/pii_redactor/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,7 +15,7 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
diff --git a/transforms/language/text_encoder/Dockerfile.ray b/transforms/language/text_encoder/Dockerfile.ray
index 9a6fa04d8a..638a02e9f7 100644
--- a/transforms/language/text_encoder/Dockerfile.ray
+++ b/transforms/language/text_encoder/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 # install pytest
@@ -13,11 +13,11 @@ ARG PIP_INSTALL_EXTRA_ARGS
 ARG DPK_WHEEL_FILE_NAME
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root dpk_text_encoder/ dpk_text_encoder/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_text_encoder/ dpk_text_encoder/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray
index 30b1da9594..07a22fac7e 100644
--- a/transforms/universal/doc_id/Dockerfile.ray
+++ b/transforms/universal/doc_id/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/ededup/Dockerfile.ray b/transforms/universal/ededup/Dockerfile.ray
index 10faaf2978..01d60d3b95 100644
--- a/transforms/universal/ededup/Dockerfile.ray
+++ b/transforms/universal/ededup/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,14 +15,14 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 # Install ray project source
-COPY --chmod=775 --chown=ray:root dpk_ededup/ dpk_ededup/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
-COPY --chmod=775 --chown=ray:root README.md README.md
+COPY --chmod=g=u --chown=ray:root dpk_ededup/ dpk_ededup/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root README.md README.md
 RUN pip install --no-cache-dir -r requirements.txt
 
 
 # Set environment
diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray
index 8d36e6a35f..bae6cd9ef1 100644
--- a/transforms/universal/fdedup/Dockerfile.ray
+++ b/transforms/universal/fdedup/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=775 --chown=ray:root dpk_fdedup/ dpk_fdedup/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_fdedup/ dpk_fdedup/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/filter/Dockerfile.ray b/transforms/universal/filter/Dockerfile.ray
index 30b1da9594..07a22fac7e 100644
--- a/transforms/universal/filter/Dockerfile.ray
+++ b/transforms/universal/filter/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/hap/Dockerfile.ray b/transforms/universal/hap/Dockerfile.ray
index 30b1da9594..07a22fac7e 100644
--- a/transforms/universal/hap/Dockerfile.ray
+++ b/transforms/universal/hap/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/noop/Dockerfile.ray b/transforms/universal/noop/Dockerfile.ray
index 30b1da9594..07a22fac7e 100644
--- a/transforms/universal/noop/Dockerfile.ray
+++ b/transforms/universal/noop/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/profiler/Dockerfile.ray b/transforms/universal/profiler/Dockerfile.ray
index 30b1da9594..07a22fac7e 100644
--- a/transforms/universal/profiler/Dockerfile.ray
+++ b/transforms/universal/profiler/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/resize/Dockerfile.ray b/transforms/universal/resize/Dockerfile.ray
index 30b1da9594..07a22fac7e 100644
--- a/transforms/universal/resize/Dockerfile.ray
+++ b/transforms/universal/resize/Dockerfile.ray
@@ -3,7 +3,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/universal/tokenization/Dockerfile.ray b/transforms/universal/tokenization/Dockerfile.ray
index 0469e7d9ba..2988d89388 100644
--- a/transforms/universal/tokenization/Dockerfile.ray
+++ b/transforms/universal/tokenization/Dockerfile.ray
@@ -4,7 +4,7 @@ FROM ${BASE_IMAGE}
 
 # see https://docs.openshift.com/container-platform/4.17/openshift_images/create-images.html#use-uid_create-images
 USER root
-RUN chown ray:root /home/ray && chmod -R g=u /home/ray
+RUN chown ray:root /home/ray && chmod g=u /home/ray
 USER ray
 
 RUN pip install --upgrade --no-cache-dir pip
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=775 --chown=ray:root dpk_tokenization/ dpk_tokenization/
-COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=g=u --chown=ray:root dpk_tokenization/ dpk_tokenization/
+COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment

From 6253bacd32f59249f6485b03a20fe0b05b53f7c4 Mon Sep 17 00:00:00 2001
From: Revital Sur
Date: Sun, 26 Jan 2025 17:36:32 +0200
Subject: [PATCH 13/17] A fix.

Signed-off-by: Revital Sur
---
 kfp/kfp_ray_components/Dockerfile                |  8 ++++----
 tools/ingest2parquet/Dockerfile                  |  8 ++++----
 transforms/Dockerfile.ray.template               |  6 +++---
 transforms/code/code2parquet/ray/Dockerfile      | 12 ++++++------
 transforms/code/code_profiler/Dockerfile.ray     |  6 +++---
 transforms/code/code_quality/ray/Dockerfile      | 16 ++++++++--------
 transforms/code/header_cleanser/ray/Dockerfile   | 10 +++++-----
 transforms/code/license_select/ray/Dockerfile    | 14 +++++++-------
 transforms/code/malware/ray/Dockerfile           | 16 ++++++++--------
 transforms/code/proglang_select/ray/Dockerfile   |  8 ++++----
 .../code/repo_level_ordering/ray/Dockerfile      |  8 ++++----
 transforms/language/doc_chunk/Dockerfile.ray     |  6 +++---
 transforms/language/doc_quality/Dockerfile.ray   |  6 +++---
 transforms/language/html2parquet/Dockerfile.ray  |  6 +++---
 transforms/language/lang_id/Dockerfile.ray       |  6 +++---
 transforms/language/pdf2parquet/Dockerfile.ray   |  6 +++---
 transforms/language/pii_redactor/Dockerfile.ray  |  2 +-
 transforms/language/text_encoder/Dockerfile.ray  |  6 +++---
 transforms/universal/doc_id/Dockerfile.ray       |  6 +++---
 transforms/universal/ededup/Dockerfile.ray       |  8 ++++----
 transforms/universal/fdedup/Dockerfile.ray       |  6 +++---
 transforms/universal/filter/Dockerfile.ray       |  6 +++---
 transforms/universal/hap/Dockerfile.ray          |  6 +++---
 transforms/universal/noop/Dockerfile.ray         |  6 +++---
 transforms/universal/profiler/Dockerfile.ray     |  6 +++---
 transforms/universal/resize/Dockerfile.ray       |  6 +++---
 transforms/universal/tokenization/Dockerfile.ray |  6 +++---
 27 files changed, 103 insertions(+), 103 deletions(-)

diff --git a/kfp/kfp_ray_components/Dockerfile b/kfp/kfp_ray_components/Dockerfile
index f33c415f65..5adbd533ae 100644
--- a/kfp/kfp_ray_components/Dockerfile
+++ b/kfp/kfp_ray_components/Dockerfile
@@ -15,13 +15,13 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/
+COPY --chmod=775 --chown=ray:root shared_workflow_support_lib shared_workflow_support_lib/
 RUN cd shared_workflow_support_lib && pip install --no-cache-dir -e .
 
-COPY --chmod=g=u --chown=ray:root workflow_support_lib workflow_support_lib/
+COPY --chmod=775 --chown=ray:root workflow_support_lib workflow_support_lib/
 RUN cd workflow_support_lib && pip install --no-cache-dir -e .
 
 # overwriting the installation of old versions of pydantic
@@ -30,7 +30,7 @@ RUN pip install --no-cache-dir pydantic==2.6.3
 # remove credentials-containing file
 RUN rm requirements.txt
 # components
-COPY --chmod=g=u --chown=ray:root ./src /pipelines/component/src
+COPY --chmod=775 --chown=ray:root ./src /pipelines/component/src
 
 # Set environment
 ENV KFP_v2=$KFP_v2
diff --git a/tools/ingest2parquet/Dockerfile b/tools/ingest2parquet/Dockerfile
index 6809535d25..c37739a6aa 100644
--- a/tools/ingest2parquet/Dockerfile
+++ b/tools/ingest2parquet/Dockerfile
@@ -13,7 +13,7 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 COPY requirements.txt requirements.txt
@@ -21,9 +21,9 @@ RUN pip install --no-cache-dir -r requirements.txt
 RUN rm requirements.txt
 # copy source
-COPY --chmod=g=u --chown=ray:root ./src .
+COPY --chmod=775 --chown=ray:root ./src .
 # copy test
-COPY --chmod=g=u --chown=ray:root test/ test/
-COPY --chmod=g=u --chown=ray:root test-data/ test-data/
+COPY --chmod=775 --chown=ray:root test/ test/
+COPY --chmod=775 --chown=ray:root test-data/ test-data/
 
 # Set environment
 ENV PYTHONPATH /home/ray
diff --git a/transforms/Dockerfile.ray.template b/transforms/Dockerfile.ray.template
index 07a22fac7e..837a3ffda3 100644
--- a/transforms/Dockerfile.ray.template
+++ b/transforms/Dockerfile.ray.template
@@ -15,12 +15,12 @@ ARG TRANSFORM_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 
-COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
-COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile
index f3e091c62b..74e6577ed7 100644
--- a/transforms/code/code2parquet/ray/Dockerfile
+++ b/transforms/code/code2parquet/ray/Dockerfile
@@ -16,22 +16,22 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
+COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
 # Install ray project source
-COPY --chmod=g=u --chown=ray:root src/ src/
-COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=775 --chown=ray:root src/ src/
+COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
-COPY --chmod=g=u --chown=ray:root src/code2parquet_transform_ray.py .
+COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py .
 
 # copy some of the samples in
-COPY --chmod=g=u --chown=ray:root src/code2parquet_local_ray.py local/
+COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/
 
 # copy test
 COPY test/ test/
diff --git a/transforms/code/code_profiler/Dockerfile.ray b/transforms/code/code_profiler/Dockerfile.ray
index 0d501f5477..9a63ee9fb3 100644
--- a/transforms/code/code_profiler/Dockerfile.ray
+++ b/transforms/code/code_profiler/Dockerfile.ray
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=g=u --chown=ray:root dpk_code_profiler/ dpk_code_profiler/
-COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=775 --chown=ray:root dpk_code_profiler/ dpk_code_profiler/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
 # Set environment
diff --git a/transforms/code/code_quality/ray/Dockerfile b/transforms/code/code_quality/ray/Dockerfile
index f34572b272..9906a50b7b 100644
--- a/transforms/code/code_quality/ray/Dockerfile
+++ b/transforms/code/code_quality/ray/Dockerfile
@@ -19,28 +19,28 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
+COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
 #COPY requirements.txt requirements.txt
 #RUN pip install --no-cache-dir -r requirements.txt
 
-COPY --chmod=g=u --chown=ray:root src/ src/
-COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=775 --chown=ray:root src/ src/
+COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
-COPY --chmod=g=u --chown=ray:root ./src/code_quality_transform_ray.py .
+COPY --chmod=775 --chown=ray:root ./src/code_quality_transform_ray.py .
 
 # copy some of the samples in
-COPY --chmod=g=u --chown=ray:root ./src/code_quality_local_ray.py local/
+COPY --chmod=775 --chown=ray:root ./src/code_quality_local_ray.py local/
 
 # copy test
-COPY --chmod=g=u --chown=ray:root test/ test/
-COPY --chmod=g=u --chown=ray:root test-data/ test-data/
+COPY --chmod=775 --chown=ray:root test/ test/
+COPY --chmod=775 --chown=ray:root test-data/ test-data/
 
 # Set environment
 ENV PYTHONPATH /home/ray
diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile
index 465b8c7a91..056f64c0ac 100644
--- a/transforms/code/header_cleanser/ray/Dockerfile
+++ b/transforms/code/header_cleanser/ray/Dockerfile
@@ -12,14 +12,14 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root python-transform/ python-transform
+COPY --chmod=775 --chown=ray:root python-transform/ python-transform
 RUN cd python-transform && pip install --no-cache-dir -e .
 
-COPY --chmod=g=u --chown=ray:root src/ src/
-COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=775 --chown=ray:root src/ src/
+COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # Install system dependencies, including libgomp1
@@ -31,7 +31,7 @@ User ray
 
 # copy source data
 COPY ./src/header_cleanser_transform_ray.py .
-COPY --chmod=g=u --chown=ray:root src/header_cleanser_local_ray.py local/
+COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/
 
 # copy test
 COPY test/ test/
diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile
index c4604d2758..8d1c457e55 100644
--- a/transforms/code/license_select/ray/Dockerfile
+++ b/transforms/code/license_select/ray/Dockerfile
@@ -15,20 +15,20 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
+COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
-COPY --chmod=g=u --chown=ray:root src/ src/
-COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
-COPY --chmod=g=u --chown=ray:root README.md README.md
+COPY --chmod=775 --chown=ray:root src/ src/
+COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=775 --chown=ray:root README.md README.md
 RUN pip install --no-cache-dir -e .
 
 # copy source data
-COPY --chmod=g=u --chown=ray:root src/license_select_transform_ray.py .
-COPY --chmod=g=u --chown=ray:root src/license_select_local_ray.py local/
+COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py .
+COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/
 
 # copy test
 COPY test/ test/
diff --git a/transforms/code/malware/ray/Dockerfile b/transforms/code/malware/ray/Dockerfile
index 2d2dd5e101..56825ffa62 100644
--- a/transforms/code/malware/ray/Dockerfile
+++ b/transforms/code/malware/ray/Dockerfile
@@ -45,24 +45,24 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
+COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
-COPY --chmod=g=u --chown=ray:root src/ src/
-COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=775 --chown=ray:root src/ src/
+COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
-COPY --chmod=g=u --chown=ray:root src/malware_transform_ray.py ./
+COPY --chmod=775 --chown=ray:root src/malware_transform_ray.py ./
 
 # copy some of the samples in
-COPY --chmod=g=u --chown=ray:root src/malware_local_ray.py local/
+COPY --chmod=775 --chown=ray:root src/malware_local_ray.py local/
 
-COPY --chmod=g=u --chown=ray:root test/ test/
-COPY --chmod=g=u --chown=ray:root test-data/ test-data/
+COPY --chmod=775 --chown=ray:root test/ test/
+COPY --chmod=775 --chown=ray:root test-data/ test-data/
 
 
 ENV PYTHONPATH /home/ray
diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile
index 65ff9b15e4..f7ccd1ca20 100644
--- a/transforms/code/proglang_select/ray/Dockerfile
+++ b/transforms/code/proglang_select/ray/Dockerfile
@@ -15,17 +15,17 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root python-transform/ python-transform/
+COPY --chmod=775 --chown=ray:root python-transform/ python-transform/
 RUN cd python-transform && pip install --no-cache-dir -e .
 
 #COPY requirements.txt requirements.txt
 #RUN pip install --no-cache-dir -r requirements.txt
 
-COPY --chmod=g=u --chown=ray:root src/ src/
-COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=775 --chown=ray:root src/ src/
+COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy the main() entry point to the image
diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile
index 69bd33f098..5284397227 100644
--- a/transforms/code/repo_level_ordering/ray/Dockerfile
+++ b/transforms/code/repo_level_ordering/ray/Dockerfile
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root src/ src/
-COPY --chmod=g=u --chown=ray:root pyproject.toml pyproject.toml
-COPY --chmod=g=u --chown=ray:root README.md README.md
+COPY --chmod=775 --chown=ray:root src/ src/
+COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml
+COPY --chmod=775 --chown=ray:root README.md README.md
 RUN pip install --no-cache-dir -e .
 
 # copy source data
diff --git a/transforms/language/doc_chunk/Dockerfile.ray b/transforms/language/doc_chunk/Dockerfile.ray
index d2b4d464c6..3a541de1b8 100644
--- a/transforms/language/doc_chunk/Dockerfile.ray
+++ b/transforms/language/doc_chunk/Dockerfile.ray
@@ -14,12 +14,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-COPY --chmod=g=u --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/
-COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=775 --chown=ray:root dpk_doc_chunk/ dpk_doc_chunk/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
 RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt
 
 # Set environment
diff --git a/transforms/language/doc_quality/Dockerfile.ray b/transforms/language/doc_quality/Dockerfile.ray
index f1f6858b2c..7ba61b5441 100644
--- a/transforms/language/doc_quality/Dockerfile.ray
+++ b/transforms/language/doc_quality/Dockerfile.ray
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=g=u --chown=ray:root dpk_doc_quality/ dpk_doc_quality/
-COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=775 --chown=ray:root dpk_doc_quality/ dpk_doc_quality/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
 # Set environment
diff --git a/transforms/language/html2parquet/Dockerfile.ray b/transforms/language/html2parquet/Dockerfile.ray
index 9ed6c8e506..742cea06a2 100644
--- a/transforms/language/html2parquet/Dockerfile.ray
+++ b/transforms/language/html2parquet/Dockerfile.ray
@@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist
+COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
 ## Copy the python version of the tansform
-COPY --chmod=g=u --chown=ray:root dpk_html2parquet/ dpk_html2parquet/
-COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt
+COPY --chmod=775 --chown=ray:root dpk_html2parquet/ dpk_html2parquet/
+COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt
 RUN pip install -r requirements.txt
 
 # Set environment
diff --git a/transforms/language/lang_id/Dockerfile.ray b/transforms/language/lang_id/Dockerfile.ray
index 91b05d6ee0..93df29982d 100644
--- a/transforms/language/lang_id/Dockerfile.ray
+++ b/transforms/language/lang_id/Dockerfile.ray
@@ -21,12 +21,12 @@ USER ray
 
 # Copy and install data processing libraries
 # These are expected to be placed in the docker context before this is run (see the make image).
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_lang_id/ dpk_lang_id/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_lang_id/ dpk_lang_id/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # clean up apt diff --git a/transforms/language/pdf2parquet/Dockerfile.ray b/transforms/language/pdf2parquet/Dockerfile.ray index f3b03f596e..4dc62538ec 100644 --- a/transforms/language/pdf2parquet/Dockerfile.ray +++ b/transforms/language/pdf2parquet/Dockerfile.ray @@ -23,13 +23,13 @@ RUN \ # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=g=u --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_pdf2parquet/ dpk_pdf2parquet/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt diff --git a/transforms/language/pii_redactor/Dockerfile.ray b/transforms/language/pii_redactor/Dockerfile.ray index 40d6f8e6a6..791cfd2a9a 100644 --- a/transforms/language/pii_redactor/Dockerfile.ray +++ b/transforms/language/pii_redactor/Dockerfile.ray @@ -15,7 +15,7 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform diff --git a/transforms/language/text_encoder/Dockerfile.ray b/transforms/language/text_encoder/Dockerfile.ray index 638a02e9f7..ba0913bad5 100644 --- a/transforms/language/text_encoder/Dockerfile.ray +++ b/transforms/language/text_encoder/Dockerfile.ray @@ -13,11 +13,11 @@ ARG PIP_INSTALL_EXTRA_ARGS ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_text_encoder/ dpk_text_encoder/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_text_encoder/ dpk_text_encoder/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/doc_id/Dockerfile.ray b/transforms/universal/doc_id/Dockerfile.ray index 07a22fac7e..837a3ffda3 100644 --- a/transforms/universal/doc_id/Dockerfile.ray +++ b/transforms/universal/doc_id/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/ededup/Dockerfile.ray b/transforms/universal/ededup/Dockerfile.ray index 01d60d3b95..bb1ffae5dc 100644 --- a/transforms/universal/ededup/Dockerfile.ray +++ b/transforms/universal/ededup/Dockerfile.ray @@ -15,14 +15,14 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] # Install ray project source -COPY --chmod=g=u --chown=ray:root dpk_ededup/ dpk_ededup/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt -COPY --chmod=g=u --chown=ray:root README.md README.md +COPY --chmod=775 --chown=ray:root dpk_ededup/ dpk_ededup/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/fdedup/Dockerfile.ray b/transforms/universal/fdedup/Dockerfile.ray index bae6cd9ef1..09ebced24d 100644 --- a/transforms/universal/fdedup/Dockerfile.ray +++ b/transforms/universal/fdedup/Dockerfile.ray @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform -COPY --chmod=g=u --chown=ray:root dpk_fdedup/ dpk_fdedup/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_fdedup/ dpk_fdedup/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install -r requirements.txt # Set environment diff --git a/transforms/universal/filter/Dockerfile.ray b/transforms/universal/filter/Dockerfile.ray index 07a22fac7e..837a3ffda3 100644 --- a/transforms/universal/filter/Dockerfile.ray +++ b/transforms/universal/filter/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/hap/Dockerfile.ray b/transforms/universal/hap/Dockerfile.ray index 07a22fac7e..837a3ffda3 100644 --- a/transforms/universal/hap/Dockerfile.ray +++ b/transforms/universal/hap/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/noop/Dockerfile.ray b/transforms/universal/noop/Dockerfile.ray index 07a22fac7e..837a3ffda3 100644 --- a/transforms/universal/noop/Dockerfile.ray +++ b/transforms/universal/noop/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/profiler/Dockerfile.ray b/transforms/universal/profiler/Dockerfile.ray index 07a22fac7e..837a3ffda3 100644 --- a/transforms/universal/profiler/Dockerfile.ray +++ b/transforms/universal/profiler/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/resize/Dockerfile.ray b/transforms/universal/resize/Dockerfile.ray index 07a22fac7e..837a3ffda3 100644 --- a/transforms/universal/resize/Dockerfile.ray +++ b/transforms/universal/resize/Dockerfile.ray @@ -15,12 +15,12 @@ ARG TRANSFORM_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). -COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_${TRANSFORM_NAME}/ dpk_${TRANSFORM_NAME}/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment diff --git a/transforms/universal/tokenization/Dockerfile.ray b/transforms/universal/tokenization/Dockerfile.ray index 2988d89388..26d4a24d6c 100644 --- a/transforms/universal/tokenization/Dockerfile.ray +++ b/transforms/universal/tokenization/Dockerfile.ray @@ -15,12 +15,12 @@ ARG DPK_WHEEL_FILE_NAME # Copy and install data processing libraries # These are expected to be placed in the docker context before this is run (see the make image). 
-COPY --chmod=g=u --chown=ray:root data-processing-dist data-processing-dist +COPY --chmod=775 --chown=ray:root data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -COPY --chmod=g=u --chown=ray:root dpk_tokenization/ dpk_tokenization/ -COPY --chmod=g=u --chown=ray:root requirements.txt requirements.txt +COPY --chmod=775 --chown=ray:root dpk_tokenization/ dpk_tokenization/ +COPY --chmod=775 --chown=ray:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt # Set environment From 40c2889f2d9ae7ddc6c9d034c8a235ab1300b693 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Sun, 26 Jan 2025 18:52:26 +0200 Subject: [PATCH 14/17] More changes. Signed-off-by: Revital Sur --- transforms/code/code2parquet/ray/Dockerfile | 4 ++-- transforms/code/header_cleanser/ray/Dockerfile | 6 +++--- transforms/code/license_select/ray/Dockerfile | 4 ++-- transforms/code/proglang_select/ray/Dockerfile | 8 ++++---- transforms/code/repo_level_ordering/ray/Dockerfile | 10 +++++----- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/transforms/code/code2parquet/ray/Dockerfile b/transforms/code/code2parquet/ray/Dockerfile index 74e6577ed7..6681f09d50 100644 --- a/transforms/code/code2parquet/ray/Dockerfile +++ b/transforms/code/code2parquet/ray/Dockerfile @@ -34,8 +34,8 @@ COPY --chmod=775 --chown=ray:root src/code2parquet_transform_ray.py . COPY --chmod=775 --chown=ray:root src/code2parquet_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/header_cleanser/ray/Dockerfile b/transforms/code/header_cleanser/ray/Dockerfile index 056f64c0ac..0bef909d84 100644 --- a/transforms/code/header_cleanser/ray/Dockerfile +++ b/transforms/code/header_cleanser/ray/Dockerfile @@ -30,12 +30,12 @@ RUN sudo apt-get update && sudo apt-get install -y \ User ray # copy source data -COPY ./src/header_cleanser_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/header_cleanser_transform_ray.py . COPY --chmod=775 --chown=ray:root src/header_cleanser_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/license_select/ray/Dockerfile b/transforms/code/license_select/ray/Dockerfile index 8d1c457e55..d7b3be5f80 100644 --- a/transforms/code/license_select/ray/Dockerfile +++ b/transforms/code/license_select/ray/Dockerfile @@ -31,8 +31,8 @@ COPY --chmod=775 --chown=ray:root src/license_select_transform_ray.py . COPY --chmod=775 --chown=ray:root src/license_select_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Put these at the end since they seem to upset the docker cache. ARG BUILD_DATE diff --git a/transforms/code/proglang_select/ray/Dockerfile b/transforms/code/proglang_select/ray/Dockerfile index f7ccd1ca20..7f457ed4e6 100644 --- a/transforms/code/proglang_select/ray/Dockerfile +++ b/transforms/code/proglang_select/ray/Dockerfile @@ -29,14 +29,14 @@ COPY --chmod=775 --chown=ray:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . 
# copy the main() entry point to the image -COPY ./src/proglang_select_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/proglang_select_transform_ray.py . # copy some of the samples in -COPY ./src/proglang_select_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/proglang_select_local_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/code/repo_level_ordering/ray/Dockerfile b/transforms/code/repo_level_ordering/ray/Dockerfile index 5284397227..a533a281e0 100644 --- a/transforms/code/repo_level_ordering/ray/Dockerfile +++ b/transforms/code/repo_level_ordering/ray/Dockerfile @@ -24,13 +24,13 @@ COPY --chmod=775 --chown=ray:root README.md README.md RUN pip install --no-cache-dir -e . # copy source data -COPY ./src/repo_level_order_transform_ray.py . -COPY ./src/repo_level_order_local_ray.py local/ -COPY ./src/repo_level_order_s3_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/repo_level_order_transform_ray.py . +COPY --chmod=775 --chown=ray:root ./src/repo_level_order_local_ray.py local/ +COPY --chmod=775 --chown=ray:root ./src/repo_level_order_s3_ray.py local/ # copy test -COPY test/ test/ -COPY test-data/ test-data/ +COPY --chmod=775 --chown=ray:root test/ test/ +COPY --chmod=775 --chown=ray:root test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray:/home/ray/src From c5117e54c52e324f29182c07fcbe3a613768e09a Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 27 Jan 2025 06:11:40 +0200 Subject: [PATCH 15/17] Fix super pipeline kfp v2. Signed-off-by: Revital Sur --- examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md | 1 + .../superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py | 2 ++ transforms/universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md index f68c1aaf7e..2a16be57f1 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md @@ -21,6 +21,7 @@ Another useful feature of the KFP v2 is the `Json` editor for the `dict` type in - It creates just one run that includes all the nested transfroms and their sub-tasks. - No need for additional component as `executeSubWorkflowComponent.yaml`. All the implementation in the same pipeline file. - In superpipelines of KFP v1 there exists an option to override the common parameters with specific values for each one of the transforms. This option is missing in the KFP v2 superpipelines. +- In kfp V2 pipelines the user is requested to insert a unique string for the ray cluster created at run creation time (called `ray_run_id_KFPv2`). This is because in KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. 
### How to compile the superpipeline ``` diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py index 434d84ab0d..7c82ab79ad 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py @@ -62,6 +62,7 @@ def super_pipeline( p2_skip: bool = False, p2_noop_sleep_sec: int = 10, p2_ray_name: str = "noop-kfp-ray", + p2_ray_run_id_KFPv2: str = "", p2_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": noop_image}, p2_ray_worker_options: dict = { "replicas": 2, @@ -75,6 +76,7 @@ def super_pipeline( # Document ID step parameters p3_name: str = "doc_id", p3_ray_name: str = "docid-kfp-ray", + p3_ray_run_id_KFPv2: str = "", p3_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": doc_id_image}, p3_ray_worker_options: dict = { "replicas": 2, diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index 0b9ccd42d0..2542a876c4 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -27,7 +27,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "-m dpk_doc_id.ray.transform" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.3" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" # path to kfp component specifications files component_spec_path = os.getenv( From 5bba22cf78f4767add841383478558396ea59348 Mon Sep 17 00:00:00 2001 From: Revital Sur Date: Mon, 27 Jan 2025 06:53:53 -0600 Subject: [PATCH 16/17] Address review comments. Signed-off-by: Revital Sur --- examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md | 2 +- .../ray/kfp_v2/superpipeline_noop_docId_v2_wf.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md index 2a16be57f1..69d315efe5 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/README.md @@ -21,7 +21,7 @@ Another useful feature of the KFP v2 is the `Json` editor for the `dict` type in - It creates just one run that includes all the nested transfroms and their sub-tasks. - No need for additional component as `executeSubWorkflowComponent.yaml`. All the implementation in the same pipeline file. - In superpipelines of KFP v1 there exists an option to override the common parameters with specific values for each one of the transforms. This option is missing in the KFP v2 superpipelines. -- In kfp V2 pipelines the user is requested to insert a unique string for the ray cluster created at run creation time (called `ray_run_id_KFPv2`). This is because in KFPv2 dsl.RUN_ID_PLACEHOLDER is deprecated and cannot be used since SDK 2.5.0. +- In kfp V2 pipelines the user is requested to insert a unique string for the ray cluster created at run creation time (called `ray_run_id_KFPv2`). This is because in KFPv2 `dsl.RUN_ID_PLACEHOLDER` is deprecated and cannot be used since SDK 2.5.0 and we cannot generate a unique string at run-time, see https://github.com/kubeflow/pipelines/issues/10187. 
### How to compile the superpipeline ``` diff --git a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py index 7c82ab79ad..5d3846540a 100644 --- a/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py +++ b/examples/kfp-pipelines/superworkflows/ray/kfp_v2/superpipeline_noop_docId_v2_wf.py @@ -57,12 +57,12 @@ def super_pipeline( p1_pipeline_data_max_files: int = -1, p1_pipeline_data_num_samples: int = -1, p1_pipeline_data_checkpointing: bool = False, + p1_pipeline_ray_run_id_KFPv2: str = "", # noop step parameters p2_name: str = "noop", p2_skip: bool = False, p2_noop_sleep_sec: int = 10, p2_ray_name: str = "noop-kfp-ray", - p2_ray_run_id_KFPv2: str = "", p2_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": noop_image}, p2_ray_worker_options: dict = { "replicas": 2, @@ -76,7 +76,6 @@ def super_pipeline( # Document ID step parameters p3_name: str = "doc_id", p3_ray_name: str = "docid-kfp-ray", - p3_ray_run_id_KFPv2: str = "", p3_ray_head_options: dict = {"cpu": 1, "memory": 4, "image_pull_secret": "", "image": doc_id_image}, p3_ray_worker_options: dict = { "replicas": 2, From a306cbfd21973ed26c713cd0e51a86f74d0dbe02 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 27 Jan 2025 14:02:39 -0500 Subject: [PATCH 17/17] fix dependency issue breaking test-src Signed-off-by: Maroun Touma --- transforms/universal/fdedup/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/universal/fdedup/requirements.txt b/transforms/universal/fdedup/requirements.txt index b28fac8598..42af99d8bf 100644 --- a/transforms/universal/fdedup/requirements.txt +++ b/transforms/universal/fdedup/requirements.txt @@ -6,4 +6,4 @@ disjoint-set>=0.8.0 scipy>=1.12.1, <2.0.0 numpy<1.29.0 sentencepiece>=0.2.0 -mmh3>=4.1.0 +mmh3>=4.1.0, <=5.0.1
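Usage note on the run-ID change above: after PATCH 15/17 and 16/17, a KFP v2 super-pipeline run receives its pre-generated unique Ray run ID as an ordinary pipeline argument. Below is a minimal, illustrative submission sketch, not part of the patches themselves: the compiled package name `superpipeline_noop_docId_v2_wf.yaml` and the endpoint URL are placeholders, while the argument name matches the `p1_pipeline_ray_run_id_KFPv2` parameter added in PATCH 16/17.

```python
import uuid

import kfp

# Generate the unique string at run-creation time, outside the pipeline,
# since dsl.RUN_ID_PLACEHOLDER is deprecated and unusable since KFP SDK 2.5.0
# and a unique string cannot be generated inside a component at runtime
# (see https://github.com/kubeflow/pipelines/issues/10187).
ray_run_id = uuid.uuid4().hex

client = kfp.Client(host="http://localhost:8080")  # placeholder endpoint
client.create_run_from_pipeline_package(
    pipeline_file="superpipeline_noop_docId_v2_wf.yaml",  # assumed compiled package
    arguments={"p1_pipeline_ray_run_id_KFPv2": ray_run_id},
)
```

The string must be unique for each run of the same pipeline, since it is used to derive the Ray cluster name; reusing it across simultaneous runs would produce colliding cluster names.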