From dae2276edf28bcf3c7eb570e7a7d6c8a8fd41e38 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 26 Nov 2024 13:45:12 -0500 Subject: [PATCH] build transforms==0.2.3.dev0 release with toolkit==0.2.2 Signed-off-by: Maroun Touma --- .make.versions | 4 +- data-processing-lib/pyproject.toml | 2 +- .../createRayClusterComponent.yaml | 2 +- .../deleteRayClusterComponent.yaml | 2 +- .../executeRayJobComponent.yaml | 2 +- .../executeRayJobComponent_multi_s3.yaml | 2 +- .../executeSubWorkflowComponent.yaml | 2 +- .../kfp_v1_workflow_support/pyproject.toml | 4 +- .../kfp_v2_workflow_support/pyproject.toml | 4 +- .../shared_workflow_support/pyproject.toml | 4 +- transforms/Makefile | 2 +- .../code2parquet/kfp_ray/code2parquet_wf.py | 2 +- .../code/code2parquet/python/pyproject.toml | 2 +- .../code/code2parquet/python/requirements.txt | 2 +- .../code/code2parquet/ray/pyproject.toml | 6 +- .../code/code_profiler/python/pyproject.toml | 2 +- .../code_profiler/python/requirements.txt | 2 +- .../code/code_profiler/ray/pyproject.toml | 6 +- .../code_quality/kfp_ray/code_quality_wf.py | 2 +- .../code/code_quality/python/pyproject.toml | 2 +- .../code/code_quality/python/requirements.txt | 2 +- .../code/code_quality/ray/pyproject.toml | 6 +- .../kfp_ray/header_cleanser_wf.py | 2 +- .../header_cleanser/python/pyproject.toml | 2 +- .../header_cleanser/python/requirements.txt | 2 +- .../code/header_cleanser/ray/pyproject.toml | 6 +- .../kfp_ray/license_select_wf.py | 2 +- .../code/license_select/python/pyproject.toml | 2 +- .../license_select/python/requirements.txt | 2 +- .../code/license_select/ray/pyproject.toml | 6 +- transforms/code/malware/kfp_ray/malware_wf.py | 2 +- transforms/code/malware/python/pyproject.toml | 4 +- transforms/code/malware/ray/pyproject.toml | 6 +- .../kfp_ray/proglang_select_wf.py | 2 +- .../proglang_select/python/pyproject.toml | 2 +- .../proglang_select/python/requirements.txt | 2 +- .../code/proglang_select/ray/pyproject.toml | 6 +- .../kfp_ray/repo_level_order_wf.py | 2 +- .../repo_level_ordering/ray/pyproject.toml | 4 +- .../kfp_ray/doc_chunk_multiple_wf.py | 2 +- .../doc_chunk/kfp_ray/doc_chunk_wf.py | 2 +- .../doc_chunk/python/requirements.txt | 2 +- .../language/doc_chunk/ray/pyproject.toml | 2 +- .../kfp_ray/doc_quality_multiple_wf.py | 2 +- .../doc_quality/kfp_ray/doc_quality_wf.py | 2 +- .../doc_quality/python/pyproject.toml | 2 +- .../doc_quality/python/requirements.txt | 2 +- .../language/doc_quality/ray/pyproject.toml | 6 +- .../html2parquet/kfp_ray/html2parquet_wf.py | 2 +- .../html2parquet/python/pyproject.toml | 2 +- .../html2parquet/python/requirements.txt | 2 +- .../language/html2parquet/ray/pyproject.toml | 2 +- .../html2parquet/ray/requirements.txt | 4 +- .../lang_id/kfp_ray/lang_id_multiple_wf.py | 2 +- .../language/lang_id/kfp_ray/lang_id_wf.py | 2 +- .../language/lang_id/python/pyproject.toml | 2 +- .../language/lang_id/python/requirements.txt | 2 +- .../language/lang_id/ray/pyproject.toml | 6 +- .../kfp_ray/pdf2parquet_multiple_wf.py | 2 +- .../pdf2parquet/kfp_ray/pdf2parquet_wf.py | 2 +- .../pdf2parquet/python/requirements.txt | 2 +- .../language/pdf2parquet/ray/requirements.txt | 2 +- .../pii_redactor/python/pyproject.toml | 2 +- .../pii_redactor/python/requirements.txt | 2 +- .../language/pii_redactor/ray/pyproject.toml | 6 +- .../kfp_ray/text_encoder_multiple_wf.py | 2 +- .../text_encoder/kfp_ray/text_encoder_wf.py | 2 +- .../text_encoder/python/pyproject.toml | 2 +- .../text_encoder/python/requirements.txt | 2 +- .../language/text_encoder/ray/pyproject.toml | 6 +- transforms/requirements-ray.txt | 2 +- transforms/requirements.txt | 2 +- .../universal/doc_id/kfp_ray/doc_id_wf.py | 2 +- .../universal/doc_id/python/pyproject.toml | 2 +- .../universal/doc_id/python/requirements.txt | 2 +- .../universal/doc_id/ray/pyproject.toml | 6 +- .../universal/doc_id/spark/pyproject.toml | 4 +- .../universal/ededup/kfp_ray/ededup_wf.py | 2 +- .../universal/ededup/python/pyproject.toml | 2 +- .../universal/ededup/python/requirements.txt | 2 +- .../universal/ededup/ray/pyproject.toml | 6 +- .../universal/fdedup/fdedup_python.ipynb | 377 +++++++++++++++- transforms/universal/fdedup/fdedup_ray.ipynb | 417 +++++++++++++++++- .../universal/fdedup/kfp_ray/fdedup_wf.py | 2 +- .../universal/fdedup/python/pyproject.toml | 2 +- .../universal/fdedup/python/requirements.txt | 2 +- .../universal/fdedup/ray/pyproject.toml | 2 +- .../universal/fdedup/ray/requirements.txt | 4 +- .../universal/fdedup/spark/pyproject.toml | 2 +- .../universal/fdedup/spark/requirements.txt | 4 +- .../universal/filter/kfp_ray/filter_wf.py | 2 +- .../universal/filter/python/pyproject.toml | 2 +- .../universal/filter/python/requirements.txt | 2 +- .../universal/filter/ray/pyproject.toml | 6 +- .../universal/filter/spark/pyproject.toml | 4 +- .../universal/hap/kfp_ray.disable/hap_wf.py | 2 +- .../universal/hap/python/pyproject.toml | 2 +- .../universal/hap/python/requirements.txt | 2 +- transforms/universal/hap/ray/pyproject.toml | 2 +- transforms/universal/hap/ray/requirements.txt | 4 +- .../noop/kfp_ray/noop_multiple_wf.py | 2 +- transforms/universal/noop/kfp_ray/noop_wf.py | 2 +- .../universal/noop/python/pyproject.toml | 4 +- transforms/universal/noop/ray/pyproject.toml | 6 +- .../universal/noop/spark/pyproject.toml | 6 +- .../universal/profiler/kfp_ray/profiler_wf.py | 2 +- .../universal/profiler/python/pyproject.toml | 2 +- .../profiler/python/requirements.txt | 2 +- .../universal/profiler/ray/pyproject.toml | 6 +- .../universal/profiler/spark/pyproject.toml | 6 +- .../universal/resize/kfp_ray/resize_wf.py | 2 +- .../universal/resize/python/pyproject.toml | 2 +- .../universal/resize/python/requirements.txt | 2 +- .../universal/resize/ray/pyproject.toml | 6 +- .../universal/resize/spark/pyproject.toml | 6 +- .../tokenization/kfp_ray/tokenization_wf.py | 2 +- .../tokenization/python/pyproject.toml | 2 +- .../tokenization/python/requirements.txt | 2 +- .../universal/tokenization/ray/pyproject.toml | 6 +- .../universal/web2parquet/requirements.txt | 2 +- 120 files changed, 938 insertions(+), 202 deletions(-) diff --git a/.make.versions b/.make.versions index 564caa214..ba5e87b0f 100644 --- a/.make.versions +++ b/.make.versions @@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release. DPK_MINOR_VERSION=2 # The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set. -DPK_MICRO_VERSION=3 +DPK_MICRO_VERSION=2 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. -DPK_VERSION_SUFFIX=.dev0 +DPK_VERSION_SUFFIX= DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index a347a14a1..4f5734be1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.3.dev0" +version = "0.2.2" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray and Python" diff --git a/kfp/kfp_ray_components/createRayClusterComponent.yaml b/kfp/kfp_ray_components/createRayClusterComponent.yaml index 30b0b66d8..78976a97c 100644 --- a/kfp/kfp_ray_components/createRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/createRayClusterComponent.yaml @@ -11,7 +11,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml index 44e199c47..c75554d5f 100644 --- a/kfp/kfp_ray_components/deleteRayClusterComponent.yaml +++ b/kfp/kfp_ray_components/deleteRayClusterComponent.yaml @@ -9,7 +9,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent.yaml b/kfp/kfp_ray_components/executeRayJobComponent.yaml index 7ab517bff..2e02c3adf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent.yaml @@ -12,7 +12,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml index 9b98912f0..37c0198bf 100644 --- a/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml +++ b/kfp/kfp_ray_components/executeRayJobComponent_multi_s3.yaml @@ -13,7 +13,7 @@ inputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml index 6b261a003..ec82e9484 100644 --- a/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml +++ b/kfp/kfp_ray_components/executeSubWorkflowComponent.yaml @@ -27,7 +27,7 @@ outputs: implementation: container: - image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" + image: "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # command is a list of strings (command-line arguments). # The YAML language has two syntaxes for lists, and you can use either of them. # Here we use the "flow syntax" - comma-separated strings inside square brackets. diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index f09b2f32a..daa903aaf 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.3.dev0", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 01c5b3e17..61f54663f 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.3.dev0", + "data-prep-toolkit-kfp-shared==0.2.2", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index aa7a6dd3a..17ed57ea9 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/Makefile b/transforms/Makefile index 3e8b9cfde..ed492db4d 100644 --- a/transforms/Makefile +++ b/transforms/Makefile @@ -107,7 +107,7 @@ build-pkg-dist: -rm -fr src mkdir src # Copy all the src folders recursively (not clear if they have subfolders) - for x in $(shell find . | grep '[ray| python]/src$$') ; do \ + for x in $(shell find . | grep '[ray| python | spark]/src$$') ; do \ echo $$x ; \ if [ -d "$$x" ]; then \ cp -r $$x/* src ; \ diff --git a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py index f3f491e4b..3e5f262b9 100644 --- a/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py +++ b/transforms/code/code2parquet/kfp_ray/code2parquet_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index be84b2f20..d4f8c11cf 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index cec7f9c5f..0ce538837 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index d56fed1e8..666551e94 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", - "dpk-code2parquet-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", + "dpk-code2parquet-transform-python==0.2.2", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index 334c86fed..d3c2c2196 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 27706b467..a38213e0f 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 9b760c1c3..773ae353b 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-code-profiler-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/code_quality/kfp_ray/code_quality_wf.py b/transforms/code/code_quality/kfp_ray/code_quality_wf.py index 6a4ccec1b..7f5aa9768 100644 --- a/transforms/code/code_quality/kfp_ray/code_quality_wf.py +++ b/transforms/code/code_quality/kfp_ray/code_quality_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/code_quality-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 17cbce67d..d7b452d6b 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index ef627d39f..10eb1001b 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index eceee32ed..5bf3d2dff 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-code-quality-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py index 9bb315569..5049a9c11 100644 --- a/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py +++ b/transforms/code/header_cleanser/kfp_ray/header_cleanser_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/header_cleanser-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index 3703ec55f..2dadeaf02 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index 915a462dc..9123fc955 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index 5fb1bcf26..179aa7769 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-header-cleanser-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/kfp_ray/license_select_wf.py b/transforms/code/license_select/kfp_ray/license_select_wf.py index 7dba0d9d1..9bdcc6e96 100644 --- a/transforms/code/license_select/kfp_ray/license_select_wf.py +++ b/transforms/code/license_select/kfp_ray/license_select_wf.py @@ -25,7 +25,7 @@ # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 3345d3a5a..b445c6b09 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index ce5979d62..96b293364 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-license-select-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/malware/kfp_ray/malware_wf.py b/transforms/code/malware/kfp_ray/malware_wf.py index bede80b88..89eb9d730 100644 --- a/transforms/code/malware/kfp_ray/malware_wf.py +++ b/transforms/code/malware/kfp_ray/malware_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/malware-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index a1bc05ab4..4dc1a9012 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.3.dev0", + "data-prep-toolkit>=0.2.2", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 659ee62ef..22e7ecc28 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-malware-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py index 11f001bfa..bb114e3d6 100644 --- a/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py +++ b/transforms/code/proglang_select/kfp_ray/proglang_select_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/proglang_select-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index e5736a9c7..e20a62f7c 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index d8288d189..3d330d3cc 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-proglang-select-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py index 38a829fab..fa739bfd0 100644 --- a/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py +++ b/transforms/code/repo_level_ordering/kfp_ray/repo_level_order_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "repo_level_order_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index 9581c8941..602799503 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py index 7e30ee8b8..1fd927356 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py index 387c3bda7..e128df8b0 100644 --- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py +++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index 207ab9249..2d282a8ac 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 docling-core==2.3.0 pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 4fb356038..6694456ce 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py index 436d93ff3..f103b7269 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py index f39fd7e39..0ca4fb865 100644 --- a/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py +++ b/transforms/language/doc_quality/kfp_ray/doc_quality_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "doc_quality_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 23538b8c7..f3abe0337 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index 4aa2d8111..2e29c9cb4 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index ec56ac2c7..62f97e538 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-doc_quality-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py index 4eb8b9de1..4eaef2fea 100644 --- a/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py +++ b/transforms/language/html2parquet/kfp_ray/html2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "html2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index 3a7a6efbc..af6b64763 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt index f21e65774..42e2459b2 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index 5e888748c..859706621 100644 --- a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index 9aa193432..700267692 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,3 +1,3 @@ -dpk-html2parquet-transform-python==0.2.3.dev0 -data-prep-toolkit[ray]==0.2.3.dev0 +dpk-html2parquet-transform-python==0.2.2 +data-prep-toolkit[ray]>=0.2.2 trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py index a89c54ab3..e853c2328 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/kfp_ray/lang_id_wf.py b/transforms/language/lang_id/kfp_ray/lang_id_wf.py index 2ac84645d..5aed719c5 100644 --- a/transforms/language/lang_id/kfp_ray/lang_id_wf.py +++ b/transforms/language/lang_id/kfp_ray/lang_id_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "lang_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index a69724a2d..43650a50a 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index 06bec1ab9..1f90bcd54 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index dba929905..b60a3a5bb 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-lang_id-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py index 8992f1145..56e881b5e 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py index c9cdbf652..395918ac3 100644 --- a/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py +++ b/transforms/language/pdf2parquet/kfp_ray/pdf2parquet_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "pdf2parquet_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 310909164..1d1aa2570 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 34831cde8..40650d1a5 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index 72c1bf783..4a159bba0 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_python" -version = "0.2.2.dev2" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "PII redactor Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 0abcc1d96..51fbd2494 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index 4549851d0..a65aa5913 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_pii_redactor_transform_python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py index e522737a1..bad5e24cd 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py index f88fe9eef..5c762c2a1 100644 --- a/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py +++ b/transforms/language/text_encoder/kfp_ray/text_encoder_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "text_encoder_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index dc15beb6e..62182b27b 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 3ac880bba..0d8160151 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index f1b2c09d5..2f8483e2d 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-text_encoder-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index b0527bdd6..11d0decf5 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 934c95182..7317d33e3 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.3.dev0 +data-prep-toolkit>=0.2.2 diff --git a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py index f41231159..7e1bd0b8e 100644 --- a/transforms/universal/doc_id/kfp_ray/doc_id_wf.py +++ b/transforms/universal/doc_id/kfp_ray/doc_id_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "doc_id_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 1a962662d..a9e69f0bf 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index da34dded3..ee022af54 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk_doc_id_transform_python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 369a1bb72..f50d4f70d 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.3.dev0", + "data-prep-toolkit[spark]==0.2.2", ] [build-system] diff --git a/transforms/universal/ededup/kfp_ray/ededup_wf.py b/transforms/universal/ededup/kfp_ray/ededup_wf.py index ab46daadb..d878bd3e2 100644 --- a/transforms/universal/ededup/kfp_ray/ededup_wf.py +++ b/transforms/universal/ededup/kfp_ray/ededup_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "ededup_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index da28e715f..67fd0f758 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index aa73a106a..9fe419975 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 424e220fd..58b39d7d7 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", - "dpk_ededup_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", + "dpk_ededup_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb index 83f9bd600..684583ffd 100644 --- a/transforms/universal/fdedup/fdedup_python.ipynb +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, "outputs": [], @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -102,10 +102,102 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:29 INFO - Starting SignatureCalculation step\n", + "13:30:29 INFO - Got parameters for SignatureCalculation\n", + "13:30:29 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "13:30:29 INFO - data factory scdata_ is using local configuration without input/output path\n", + "13:30:29 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "13:30:29 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:29 INFO - pipeline id pipeline_id\n", + "13:30:29 INFO - code location None\n", + "13:30:29 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", + "13:30:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:29 INFO - orchestrator minhash started at 2024-11-26 13:30:29\n", + "13:30:29 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", + "13:30:33 INFO - Completed 1 files (50.0%) in 0.074 min\n", + "13:30:33 INFO - Completed 2 files (100.0%) in 0.074 min\n", + "13:30:33 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:30:33 INFO - Starting flush()\n", + "13:30:34 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", + "13:30:34 INFO - done flushing in 0.063 sec\n", + "13:30:34 INFO - Completed execution in 0.075 min, execution result 0\n", + "13:30:34 INFO - SignatureCalculation completed successfully\n", + "13:30:34 INFO - Starting ClusterAnalysis step\n", + "13:30:34 INFO - Got parameters for ClusterAnalysis\n", + "13:30:34 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/docs_to_remove\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator cluster started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of folders is 14\n", + "13:30:34 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "13:30:34 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "13:30:34 INFO - Completed 3 files (21.43%) in 0.001 min\n", + "13:30:34 INFO - Completed 4 files (28.57%) in 0.001 min\n", + "13:30:34 INFO - Completed 5 files (35.71%) in 0.001 min\n", + "13:30:34 INFO - Completed 6 files (42.86%) in 0.001 min\n", + "13:30:34 INFO - Completed 7 files (50.0%) in 0.001 min\n", + "13:30:34 INFO - Completed 8 files (57.14%) in 0.002 min\n", + "13:30:34 INFO - Completed 9 files (64.29%) in 0.002 min\n", + "13:30:34 INFO - Completed 10 files (71.43%) in 0.002 min\n", + "13:30:34 INFO - Completed 11 files (78.57%) in 0.002 min\n", + "13:30:34 INFO - Completed 12 files (85.71%) in 0.002 min\n", + "13:30:34 INFO - Completed 13 files (92.86%) in 0.002 min\n", + "13:30:34 INFO - Completed 14 files (100.0%) in 0.003 min\n", + "13:30:34 INFO - Done processing 14 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.003 min, execution result 0\n", + "13:30:34 INFO - ClusterAnalysis completed successfully\n", + "13:30:34 INFO - Starting GetDuplicateList step\n", + "13:30:34 INFO - Got parameters for GetDuplicateList\n", + "13:30:34 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator fdlist started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of folders is 1\n", + "13:30:34 INFO - Get Duplicate List for folder docs_to_remove\n", + "13:30:34 INFO - 8 documents marked as duplicates\n", + "13:30:34 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "13:30:34 INFO - Done processing 1 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.001 min, execution result 0\n", + "13:30:34 INFO - GetDuplicateList completed successfully\n", + "13:30:34 INFO - Starting DataCleaning step\n", + "13:30:34 INFO - Got parameters for DataCleaning\n", + "13:30:34 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "13:30:34 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "13:30:34 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/cleaned\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator fdclean started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", + "13:30:34 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "13:30:34 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "13:30:34 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.0 min, execution result 0\n", + "13:30:34 INFO - DataCleaning completed successfully\n" + ] + } + ], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -126,10 +218,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/cleaned/metadata.json',\n", + " 'python/output/cleaned/data_1',\n", + " 'python/output/cleaned/data_2']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", "glob.glob(\"python/output/cleaned/*\")" @@ -145,10 +250,167 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (12, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 3 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 5 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ │\n", + "│ 6 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", + "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", + "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", + "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", + "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", + "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", + "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", + "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", + "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", + "│ ┆ another capy capably chewing on cantaloupe │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", + "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 17 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", @@ -169,10 +431,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (4, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", @@ -193,9 +542,9 @@ ], "metadata": { "kernelspec": { - "display_name": "fdedup_ray", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "fdedup_ray" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -207,7 +556,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb index 533ca019f..bb69579a9 100644 --- a/transforms/universal/fdedup/fdedup_ray.ipynb +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -14,7 +14,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", "metadata": {}, "outputs": [], @@ -37,10 +37,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-26 13:30:56,482\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], "source": [ "import ast\n", "import os\n", @@ -73,7 +81,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "e90a853e-412f-45d7-af3d-959e755aeebb", "metadata": {}, "outputs": [], @@ -106,10 +114,126 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "0775e400-7469-49a6-8998-bd4772931459", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:57 INFO - Starting SignatureCalculation step\n", + "13:30:57 INFO - Got parameters for SignatureCalculation\n", + "13:30:57 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "13:30:57 INFO - data factory scdata_ is using local configuration without input/output path\n", + "13:30:57 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "13:30:57 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:57 INFO - pipeline id pipeline_id\n", + "13:30:57 INFO - code location None\n", + "13:30:57 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:30:57 INFO - actor creation delay 0\n", + "13:30:57 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:30:57 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", + "13:30:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:57 INFO - Running locally\n", + "2024-11-26 13:31:08,860\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - orchestrator started at 2024-11-26 13:31:12\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.162438202649355, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed processing 1 files in 0.002 min\n", + "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Starting flush()\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - done flushing in 0.045 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", + "13:31:24 INFO - Completed execution in 0.446 min, execution result 0\n", + "13:31:26 INFO - SignatureCalculation completed successfully\n", + "13:31:26 INFO - Starting ClusterAnalysis step\n", + "13:31:26 INFO - Got parameters for ClusterAnalysis\n", + "13:31:26 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "13:31:26 INFO - pipeline id pipeline_id\n", + "13:31:26 INFO - code location None\n", + "13:31:26 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:26 INFO - actor creation delay 0\n", + "13:31:26 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:26 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/docs_to_remove\n", + "13:31:26 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:26 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:26 INFO - Running locally\n", + "2024-11-26 13:31:28,318\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - orchestrator started at 2024-11-26 13:31:31\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.77626838721335, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 7 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 8 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 9 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 10 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files (78.571%) in 0.001 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed processing 14 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", + "13:31:43 INFO - Completed execution in 0.292 min, execution result 0\n", + "13:31:45 INFO - ClusterAnalysis completed successfully\n", + "13:31:45 INFO - Starting GetDuplicateList step\n", + "13:31:45 INFO - Got parameters for GetDuplicateList\n", + "13:31:45 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "13:31:45 INFO - pipeline id pipeline_id\n", + "13:31:45 INFO - code location None\n", + "13:31:45 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:45 INFO - actor creation delay 0\n", + "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:45 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", + "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:45 INFO - Running locally\n", + "2024-11-26 13:31:47,311\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - orchestrator started at 2024-11-26 13:31:50\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.749520111829042, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - 8 documents marked as duplicates\n", + "13:32:02 INFO - Completed execution in 0.295 min, execution result 0\n", + "13:32:04 INFO - GetDuplicateList completed successfully\n", + "13:32:04 INFO - Starting DataCleaning step\n", + "13:32:04 INFO - Got parameters for DataCleaning\n", + "13:32:04 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "13:32:04 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "13:32:04 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "13:32:04 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:04 INFO - pipeline id pipeline_id\n", + "13:32:04 INFO - code location None\n", + "13:32:04 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:04 INFO - actor creation delay 0\n", + "13:32:04 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:04 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/cleaned\n", + "13:32:04 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:04 INFO - Running locally\n", + "2024-11-26 13:32:07,526\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - orchestrator started at 2024-11-26 13:32:10\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.738976669497788, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed processing 1 files in 0.002 min\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - done flushing in 0.003 sec\n", + "13:32:23 INFO - Completed execution in 0.313 min, execution result 0\n", + "13:32:24 INFO - DataCleaning completed successfully\n" + ] + } + ], "source": [ "\n", "sys.argv = ParamsUtils.dict_to_req(d=params)\n", @@ -130,10 +254,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "7276fe84-6512-4605-ab65-747351e13a7c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['ray/output/cleaned/metadata.json', 'ray/output/cleaned/df1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import glob\n", "glob.glob(\"ray/output/cleaned/*\")" @@ -149,10 +284,167 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (12, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 3 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 5 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ │\n", + "│ 6 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", + "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", + "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", + "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", + "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", + "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", + "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", + "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", + "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", + "│ ┆ another capy capably chewing on cantaloupe │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", + "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 17 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", @@ -170,10 +462,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (4, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], "source": [ "import polars as pl\n", "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", @@ -188,13 +567,21 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c11d3a4b-8ef9-417d-a8a2-f688db067a52", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "fdedup_ray", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "fdedup_ray" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -206,7 +593,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.9" + "version": "3.11.10" } }, "nbformat": 4, diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index ffc6f79bc..8e8795cce 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -34,7 +34,7 @@ DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/fdedup/python/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml index 08b20ed75..ff3666695 100644 --- a/transforms/universal/fdedup/python/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt index 3e5dfc16d..4cd06d819 100644 --- a/transforms/universal/fdedup/python/requirements.txt +++ b/transforms/universal/fdedup/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 485d6de21..fa0627f00 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt index 81e48e5ee..ecb79fa77 100644 --- a/transforms/universal/fdedup/ray/requirements.txt +++ b/transforms/universal/fdedup/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.3.dev0 -dpk_fdedup_transform_python==0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 +dpk_fdedup_transform_python==0.2.2 mmh3>=4.1.0 xxhash==3.4.1 tqdm==4.66.3 diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml index 8a072b31b..798931552 100644 --- a/transforms/universal/fdedup/spark/pyproject.toml +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Fuzzy Dedup Spark Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt index bfb0f04a2..e70a880bd 100644 --- a/transforms/universal/fdedup/spark/requirements.txt +++ b/transforms/universal/fdedup/spark/requirements.txt @@ -1,5 +1,5 @@ -dpk_fdedup_transform_python==0.2.3.dev0 -data-prep-toolkit[spark]==0.2.3.dev0 +dpk_fdedup_transform_python==0.2.2 +data-prep-toolkit[spark]>=0.2.2 pyyaml>=6.0.2 boto3>=1.34.69 kubernetes>=30.1.0 diff --git a/transforms/universal/filter/kfp_ray/filter_wf.py b/transforms/universal/filter/kfp_ray/filter_wf.py index b856b1007..4b122d98f 100644 --- a/transforms/universal/filter/kfp_ray/filter_wf.py +++ b/transforms/universal/filter/kfp_ray/filter_wf.py @@ -24,7 +24,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/filter-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index fcf0f6419..8e9bb2366 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 100626f60..91f37927e 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index 64776e0c1..94df1cbac 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-filter-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index ef46c9a1b..f62a81085 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.2", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/kfp_ray.disable/hap_wf.py b/transforms/universal/hap/kfp_ray.disable/hap_wf.py index 786011d4d..8069ec181 100644 --- a/transforms/universal/hap/kfp_ray.disable/hap_wf.py +++ b/transforms/universal/hap/kfp_ray.disable/hap_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "hap_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index bf7c85577..7b30dd72e 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 1250d1f77..70e633ac9 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index 38e78938b..6518e5277 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 7c4c8eb94..3d18acaa4 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit[ray]==0.2.3.dev0 -dpk-hap-transform-python==0.2.3.dev0 +data-prep-toolkit[ray]>=0.2.2 +dpk-hap-transform-python==0.2.2 nltk==3.9.1 transformers==4.38.2 torch>=2.2.2,<=2.4.1 diff --git a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py index 3b102d205..737b60121 100644 --- a/transforms/universal/noop/kfp_ray/noop_multiple_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_multiple_wf.py @@ -23,7 +23,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/kfp_ray/noop_wf.py b/transforms/universal/noop/kfp_ray/noop_wf.py index e8125328b..9dbdaf3b0 100644 --- a/transforms/universal/noop/kfp_ray/noop_wf.py +++ b/transforms/universal/noop/kfp_ray/noop_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "noop_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index ff9a24244..b60eef1ef 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.3.dev0", + "data-prep-toolkit>=0.2.2", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index da9327917..e9e28eefd 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index d3cd47bf6..89d0a18dd 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.3.dev0", - "data-prep-toolkit[spark]==0.2.3.dev0", + "dpk-noop-transform-python==0.2.2", + "data-prep-toolkit[spark]>=0.2.2", ] [build-system] diff --git a/transforms/universal/profiler/kfp_ray/profiler_wf.py b/transforms/universal/profiler/kfp_ray/profiler_wf.py index 914637895..ee6323d74 100644 --- a/transforms/universal/profiler/kfp_ray/profiler_wf.py +++ b/transforms/universal/profiler/kfp_ray/profiler_wf.py @@ -24,7 +24,7 @@ EXEC_SCRIPT_NAME: str = "profiler_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 39d9788f8..117be53c0 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index 526140ada..420e3fe86 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index ac8d729ec..336d7e35d 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.3.dev0", - "dpk_profiler_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.2", + "dpk_profiler_transform_python==0.2.2", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 6ba790301..1e1638766 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.3.dev0", - "data-prep-toolkit[spark]==0.2.3.dev0", + "dpk-profiler-transform-python==0.2.2", + "data-prep-toolkit[spark]>=0.2.2", ] [build-system] diff --git a/transforms/universal/resize/kfp_ray/resize_wf.py b/transforms/universal/resize/kfp_ray/resize_wf.py index 0724ed731..0a9be8e95 100644 --- a/transforms/universal/resize/kfp_ray/resize_wf.py +++ b/transforms/universal/resize/kfp_ray/resize_wf.py @@ -22,7 +22,7 @@ # the name of the job script EXEC_SCRIPT_NAME: str = "resize_transform_ray.py" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files component_spec_path = "../../../../kfp/kfp_ray_components/" diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 6fdad69d0..836388694 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 2f67f6a80..e9abc2535 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.2 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index c266a39f4..fbb4d0f30 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 7de14c673..9f83a6816 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.3.dev0", - "data-prep-toolkit[spark]==0.2.3.dev0", + "dpk-resize-transform-python==0.2.2", + "data-prep-toolkit[spark]>=0.2.2", ] [build-system] diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c131d11ea..243cac6be 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -23,7 +23,7 @@ task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" # components -base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" +base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:0.2.2" # path to kfp component specifications files # path to kfp component specifications files diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index dbb8e84ba..021a1427f 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 8a1920162..9c2a695a6 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.3.dev0 +data-prep-toolkit>=0.2.2 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index c094b9e7e..4cea4b905 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.3.dev0" +version = "0.2.2" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.3.dev0", - "data-prep-toolkit[ray]==0.2.3.dev0", + "dpk-tokenization-transform-python==0.2.2", + "data-prep-toolkit[ray]>=0.2.2", ] [build-system] diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index 1af3f12a4..dfb74a6ca 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.3.dev0 +data-prep-toolkit>=0.2.2 data_prep_connector>=0.2.3 \ No newline at end of file