Address review comments.
Signed-off-by: Revital Sur <[email protected]>
revit13 committed Feb 10, 2025
1 parent 6a3fa28 commit 293b3e0
Showing 29 changed files with 149 additions and 62 deletions.
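
Every changed workflow file applies the same review fix: the literal "s3-secret" is lifted into a module-level S3_SECRET constant, which is then used both as the default for the *_s3_access_secret pipeline parameters and as the hard-coded secret name in the kubeflow/pipelines#10914 workaround. The sketch below illustrates that workaround pattern in isolation, assuming the kfp-kubernetes extension; the run_job component and the env2key key names are illustrative placeholders, not code taken from this repository (the real pipelines build the mapping via ComponentUtils.set_secret_key_to_env()).

from kfp import dsl, kubernetes

# The secret name containing the s3 credentials (module-level constant,
# mirroring the change made in each *_wf.py file in this commit).
S3_SECRET = "s3-secret"


@dsl.component
def run_job() -> None:
    # Hypothetical stand-in for the Ray job-execution component; at runtime it
    # reads its S3 credentials from the injected environment variables.
    import os

    print("S3 endpoint configured:", "S3_ENDPOINT" in os.environ)


@dsl.pipeline
def example_pipeline() -> None:
    execute_job = run_job()
    # Map secret keys to environment variable names. The key and variable
    # names below are illustrative only.
    env2key = {
        "s3_endpoint": "S3_ENDPOINT",
        "s3_access_key": "S3_ACCESS_KEY",
        "s3_secret_key": "S3_SECRET_KEY",
    }
    # Workaround for kubeflow/pipelines#10914: the secret name cannot be
    # passed as a pipeline argument, so the S3_SECRET constant is used directly.
    kubernetes.use_secret_as_env(
        task=execute_job,
        secret_name=S3_SECRET,
        secret_key_to_env=env2key,
    )
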
9 changes: 6 additions & 3 deletions transforms/code/code2parquet/kfp_ray/code2parquet_wf.py
@@ -23,6 +23,9 @@
 )
 
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "code2parquet_transform_ray.py"
 
@@ -108,7 +111,7 @@ def code2parquet(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/code2parquet/input', 'output_folder': 'test/code2parquet/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     data_files_to_use: str = "['.zip']",
@@ -121,7 +124,7 @@ def code2parquet(
     code2parquet_detect_programming_lang: bool = True,
     code2parquet_domain: str = "code",
     code2parquet_snapshot: str = "github",
-    code2parquet_s3_access_secret: str = "s3-secret",
+    code2parquet_s3_access_secret: str = S3_SECRET,
     # additional parameters
     additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}',
 ) -> None:
@@ -229,7 +232,7 @@ def code2parquet(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env(prefix=PREFIX)
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, code2parquet_s3_access_secret, prefix=PREFIX)
     execute_job.after(ray_cluster)
7 changes: 5 additions & 2 deletions transforms/code/code_quality/kfp_ray/code_quality_wf.py
@@ -22,6 +22,9 @@
 )
 
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "code_quality_transform_ray.py"
 PREFIX: str = ""
@@ -107,7 +110,7 @@ def code_quality(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/code_quality/input/', 'output_folder': 'test/code_quality/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -222,7 +225,7 @@ def code_quality(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
 
7 changes: 5 additions & 2 deletions transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py
@@ -22,6 +22,9 @@
 )
 
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "header_cleanser_transform_ray.py"
 PREFIX: str = ""
@@ -115,7 +118,7 @@ def header_cleanser(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/header_cleanser/input/', 'output_folder': 'test/header_cleanser/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -242,7 +245,7 @@ def header_cleanser(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
 
7 changes: 5 additions & 2 deletions transforms/code/license_select/kfp_ray/license_select_wf.py
@@ -23,6 +23,9 @@
 )
 
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "license_select_transform_ray.py"
 
@@ -108,7 +111,7 @@ def license_select(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/license_select/input/', 'output_folder': 'test/license_select/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -213,7 +216,7 @@ def license_select(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
7 changes: 5 additions & 2 deletions transforms/code/malware/kfp_ray/malware_wf.py
@@ -23,6 +23,9 @@
 )
 
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "malware_transform_ray.py"
 
@@ -99,7 +102,7 @@ def malware(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/malware/input', 'output_folder': 'test/malware/output'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -207,7 +210,7 @@ def malware(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
        # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
9 changes: 6 additions & 3 deletions transforms/code/proglang_select/kfp_ray/proglang_select_wf.py
@@ -23,6 +23,9 @@
 )
 
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "proglang_select_transform_ray.py"
 
@@ -101,7 +104,7 @@ def lang_select(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/proglang_select/input/', 'output_folder': 'test/proglang_select/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -111,7 +114,7 @@ def lang_select(
     # Proglang match parameters
     proglang_select_allowed_langs_file: str = "test/proglang_select/languages/allowed-code-languages.txt",
     proglang_select_language_column: str = "language",
-    proglang_select_s3_access_secret: str = "s3-secret",
+    proglang_select_s3_access_secret: str = S3_SECRET,
     # additional parameters
     additional_params: str = '{"wait_interval": 2, "wait_cluster_ready_tmout": 400, "wait_cluster_up_tmout": 300, "wait_job_ready_tmout": 400, "wait_print_tmout": 30, "http_retries": 5, "delete_cluster_delay_minutes": 0}',
 ) -> None:
@@ -213,7 +216,7 @@ def lang_select(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env(prefix=PREFIX)
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, proglang_select_s3_access_secret, prefix=PREFIX)
     execute_job.after(ray_cluster)
transforms/code/repo_level_order/kfp_ray/repo_level_order_wf.py
@@ -25,6 +25,9 @@
 
 task_image = "quay.io/dataprep1/data-prep-kit/repo_level_order-ray:latest"
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py"
 
@@ -124,7 +127,7 @@ def repo_level_order(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/repo_level_ordering/input', 'output_folder': 'test/repo_level_ordering/output'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -259,7 +262,7 @@ def repo_level_order(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py
@@ -24,6 +24,9 @@
 
 task_image = "quay.io/dataprep1/data-prep-kit/doc_chunk-ray:latest"
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "-m dpk_doc_chunk.ray.transform"
 
@@ -110,7 +113,7 @@ def doc_chunk(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "[{'input_folder': 'test/doc_chunk/input/', 'output_folder': 'test/doc_chunk/output/'}]",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -226,7 +229,7 @@ def doc_chunk(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
7 changes: 5 additions & 2 deletions transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py
@@ -24,6 +24,9 @@
 
 task_image = "quay.io/dataprep1/data-prep-kit/doc_chunk-ray:latest"
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "-m dpk_doc_chunk.ray.transform"
 
@@ -104,7 +107,7 @@ def doc_chunk(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/doc_chunk/input/', 'output_folder': 'test/doc_chunk/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -218,7 +221,7 @@ def doc_chunk(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py
@@ -24,6 +24,9 @@
 
 task_image = "quay.io/dataprep1/data-prep-kit/doc_quality-ray:latest"
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "-m dpk_doc_quality.ray.transform"
 
@@ -109,7 +112,7 @@ def doc_quality(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "[{'input_folder': 'test/doc_quality/input/', 'output_folder': 'test/doc_quality/output/'}]",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -222,7 +225,7 @@ def doc_quality(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
7 changes: 5 additions & 2 deletions transforms/language/doc_quality/kfp_ray/doc_quality_wf.py
@@ -24,6 +24,9 @@
 
 task_image = "quay.io/dataprep1/data-prep-kit/doc_quality-ray:latest"
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "-m dpk_doc_quality.ray.transform"
 
@@ -115,7 +118,7 @@ def doc_quality(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/doc_quality/input/', 'output_folder': 'test/doc_quality/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -229,7 +232,7 @@ def doc_quality(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
7 changes: 5 additions & 2 deletions transforms/language/html2parquet/kfp_ray/html2parquet_wf.py
@@ -24,6 +24,9 @@
 
 task_image = "quay.io/dataprep1/data-prep-kit/html2parquet-ray:latest"
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "-m dpk_html2parquet.ray.transform"
 
@@ -109,7 +112,7 @@ def html2parquet(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "{'input_folder': 'test/html2parquet/input/', 'output_folder': 'test/html2parquet/output/'}",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     data_checkpointing: bool = False,
@@ -222,7 +225,7 @@ def html2parquet(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)
7 changes: 5 additions & 2 deletions transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py
@@ -24,6 +24,9 @@
 
 task_image = "quay.io/dataprep1/data-prep-kit/lang_id-ray:latest"
 
+# The secret name containing the s3 credentials.
+S3_SECRET = "s3-secret"
+
 # the name of the job script
 EXEC_SCRIPT_NAME: str = "-m dpk_lang_id.ray.transform"
 
@@ -114,7 +117,7 @@ def lang_id(
     server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
     # data access
     data_s3_config: str = "[{'input_folder': 'test/lang_id/input/', 'output_folder': 'test/lang_id/output/'}]",
-    data_s3_access_secret: str = "s3-secret",
+    data_s3_access_secret: str = S3_SECRET,
     data_max_files: int = -1,
     data_num_samples: int = -1,
     # orchestrator
@@ -236,7 +239,7 @@ def lang_id(
         # FIXME: Due to kubeflow/pipelines#10914, secret names cannot be provided as pipeline arguments.
         # As a workaround, the secret name is hard coded.
         env2key = ComponentUtils.set_secret_key_to_env()
-        kubernetes.use_secret_as_env(task=execute_job, secret_name="s3-secret", secret_key_to_env=env2key)
+        kubernetes.use_secret_as_env(task=execute_job, secret_name=S3_SECRET, secret_key_to_env=env2key)
     else:
         ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret)
     execute_job.after(ray_cluster)