diff --git a/.github/workflows/test-code-code2parquet-kfp.yml b/.github/workflows/test-code-code2parquet-kfp.yml index 710654571..882e5b65e 100644 --- a/.github/workflows/test-code-code2parquet-kfp.yml +++ b/.github/workflows/test-code-code2parquet-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code2parquet/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-code_quality-kfp.yml b/.github/workflows/test-code-code_quality-kfp.yml index c07bc1d2d..f6ccb3a9e 100644 --- a/.github/workflows/test-code-code_quality-kfp.yml +++ b/.github/workflows/test-code-code_quality-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code_quality/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-header_cleanser-kfp.yml b/.github/workflows/test-code-header_cleanser-kfp.yml index 7c419fd14..755393e5d 100644 --- a/.github/workflows/test-code-header_cleanser-kfp.yml +++ b/.github/workflows/test-code-header_cleanser-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/header_cleanser/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/header_cleanser/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-license_select-kfp.yml b/.github/workflows/test-code-license_select-kfp.yml index d72a85dd4..7b6b69ef9 100644 --- a/.github/workflows/test-code-license_select-kfp.yml +++ b/.github/workflows/test-code-license_select-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/license_select/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/license_select/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-malware-kfp.yml b/.github/workflows/test-code-malware-kfp.yml index 89bf47239..e68eb175e 100644 --- a/.github/workflows/test-code-malware-kfp.yml +++ b/.github/workflows/test-code-malware-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/malware/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/malware/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-proglang_select-kfp.yml 
b/.github/workflows/test-code-proglang_select-kfp.yml index 31328f3d5..edbf09ea1 100644 --- a/.github/workflows/test-code-proglang_select-kfp.yml +++ b/.github/workflows/test-code-proglang_select-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/proglang_select/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/proglang_select/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-repo_level_ordering-kfp.yml b/.github/workflows/test-code-repo_level_ordering-kfp.yml index 4e328f53e..26374677a 100644 --- a/.github/workflows/test-code-repo_level_ordering-kfp.yml +++ b/.github/workflows/test-code-repo_level_ordering-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/repo_level_ordering/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/repo_level_ordering/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-kfp-transform.template b/.github/workflows/test-kfp-transform.template index f12511118..bed645bed 100644 --- a/.github/workflows/test-kfp-transform.template +++ b/.github/workflows/test-kfp-transform.template @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "@TARGET_TRANSFORM_DIR@/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "@TARGET_TRANSFORM_DIR@/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-doc_chunk-kfp.yml b/.github/workflows/test-language-doc_chunk-kfp.yml index 985f79b97..fe347dc61 100644 --- a/.github/workflows/test-language-doc_chunk-kfp.yml +++ b/.github/workflows/test-language-doc_chunk-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_chunk/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_chunk/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-doc_quality-kfp.yml b/.github/workflows/test-language-doc_quality-kfp.yml index 6842a1859..33910778a 100644 --- a/.github/workflows/test-language-doc_quality-kfp.yml +++ b/.github/workflows/test-language-doc_quality-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_quality/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-html2parquet-kfp.yml 
b/.github/workflows/test-language-html2parquet-kfp.yml index c7a1cecc5..5da046347 100644 --- a/.github/workflows/test-language-html2parquet-kfp.yml +++ b/.github/workflows/test-language-html2parquet-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/html2parquet/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/html2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-lang_id-kfp.yml b/.github/workflows/test-language-lang_id-kfp.yml index 936ba8e45..562c38362 100644 --- a/.github/workflows/test-language-lang_id-kfp.yml +++ b/.github/workflows/test-language-lang_id-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/lang_id/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/lang_id/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-pdf2parquet-kfp.yml b/.github/workflows/test-language-pdf2parquet-kfp.yml index f232d78e4..a304e22ec 100644 --- a/.github/workflows/test-language-pdf2parquet-kfp.yml +++ b/.github/workflows/test-language-pdf2parquet-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pdf2parquet/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pdf2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-pii_redactor-kfp.yml b/.github/workflows/test-language-pii_redactor-kfp.yml index 451ac7961..3982c2f67 100644 --- a/.github/workflows/test-language-pii_redactor-kfp.yml +++ b/.github/workflows/test-language-pii_redactor-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pii_redactor/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pii_redactor/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-text_encoder-kfp.yml b/.github/workflows/test-language-text_encoder-kfp.yml index 96b8308d1..fdc085058 100644 --- a/.github/workflows/test-language-text_encoder-kfp.yml +++ b/.github/workflows/test-language-text_encoder-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/text_encoder/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/text_encoder/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-doc_id-kfp.yml 
b/.github/workflows/test-universal-doc_id-kfp.yml index fadcc6403..194b5b65f 100644 --- a/.github/workflows/test-universal-doc_id-kfp.yml +++ b/.github/workflows/test-universal-doc_id-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/doc_id/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/doc_id/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-ededup-kfp.yml b/.github/workflows/test-universal-ededup-kfp.yml index 225d7539e..48d3a1469 100644 --- a/.github/workflows/test-universal-ededup-kfp.yml +++ b/.github/workflows/test-universal-ededup-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/ededup/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/ededup/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-fdedup-kfp.yml b/.github/workflows/test-universal-fdedup-kfp.yml index b36df964d..cb2055bfc 100644 --- a/.github/workflows/test-universal-fdedup-kfp.yml +++ b/.github/workflows/test-universal-fdedup-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/fdedup/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/fdedup/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-filter-kfp.yml b/.github/workflows/test-universal-filter-kfp.yml index cab769f11..2fe6d33f7 100644 --- a/.github/workflows/test-universal-filter-kfp.yml +++ b/.github/workflows/test-universal-filter-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/filter/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/filter/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-hap-kfp.yml b/.github/workflows/test-universal-hap-kfp.yml index d5d6aa63c..c69077368 100644 --- a/.github/workflows/test-universal-hap-kfp.yml +++ b/.github/workflows/test-universal-hap-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/hap/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/hap/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-noop-kfp.yml b/.github/workflows/test-universal-noop-kfp.yml index 9322ea9b6..2be2a6e87 100644 --- a/.github/workflows/test-universal-noop-kfp.yml +++ 
b/.github/workflows/test-universal-noop-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/noop/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/noop/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-profiler-kfp.yml b/.github/workflows/test-universal-profiler-kfp.yml index f31543ad3..63631e4d6 100644 --- a/.github/workflows/test-universal-profiler-kfp.yml +++ b/.github/workflows/test-universal-profiler-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/profiler/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/profiler/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-resize-kfp.yml b/.github/workflows/test-universal-resize-kfp.yml index 3e099bba5..c502cb4b9 100644 --- a/.github/workflows/test-universal-resize-kfp.yml +++ b/.github/workflows/test-universal-resize-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/resize/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/resize/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-tokenization-kfp.yml b/.github/workflows/test-universal-tokenization-kfp.yml index e4a5b5693..887dd4eb3 100644 --- a/.github/workflows/test-universal-tokenization-kfp.yml +++ b/.github/workflows/test-universal-tokenization-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/tokenization/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/tokenization/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.make.versions b/.make.versions index ed36fe8c8..53a814695 100644 --- a/.make.versions +++ b/.make.versions @@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release. DPK_MINOR_VERSION=2 # The minor version is incremented AUTOMATICALLY by the release.sh script when a new release is set. -DPK_MICRO_VERSION=2 +DPK_MICRO_VERSION=3 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. 
-DPK_VERSION_SUFFIX=.dev2 +DPK_VERSION_SUFFIX=.dev0 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION) KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION) KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION) -DPK_CONNECTOR_VERSION=0.2.3.dev0 +DPK_CONNECTOR_VERSION=0.2.4.dev0 ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. @@ -59,3 +59,11 @@ else WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support endif +################################################################################ +# This defines the transforms' package version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the versions numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). +TRANSFORMS_PKG_VERSION=0.2.3.dev1 diff --git a/README.md b/README.md index 2c1caa04e..716f3b0f2 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,14 @@ Explore more examples [here](examples/notebooks). ### Run your first data prep pipeline -Now that you have run a single transform, the next step is to explore how to put these transforms together to run a data prep pipeline for an end to end use case like fine tuning model or building a RAG application. This [notebook](examples/notebooks/fine%20tuning/code/sample-notebook.ipynb) gives an example of how to build an end to end data prep pipeline for fine tuning for code LLMs. You can also explore how to build a RAG pipeline [here](examples/notebooks/rag). +Now that you have run a single transform, the next step is to explore how to put these transforms +together to run a data prep pipeline for an end to end use case like fine tuning a model or building +a RAG application. +This [notebook](examples/notebooks/fine%20tuning/code/sample-notebook.ipynb) gives an example of +how to build an end to end data prep pipeline for fine tuning for code LLMs. Similarly, this +[notebook](examples/notebooks/fine%20tuning/language/demo_with_launcher.ipynb) is a fine tuning +example of an end-to-end sample data pipeline designed for processing language datasets. +You can also explore how to build a RAG pipeline [here](examples/notebooks/rag). ### Current list of transforms The matrix below shows the the combination of modules and supported runtimes. All the modules can be accessed [here](transforms) and can be combined to form data processing pipelines, as shown in the [examples](examples) folder. @@ -133,7 +140,8 @@ The matrix below shows the the combination of modules and supported runtimes. 
Al | **Data Ingestion** | | | | | | [Code (from zip) to Parquet](transforms/code/code2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [PDF to Parquet](transforms/language/pdf2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | -| [HTML to Parquet](transforms/language/html2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | | +| [HTML to Parquet](transforms/language/html2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | +| [Web to Parquet](transforms/universal/web2parquet/README.md) | :white_check_mark: | | | | | **Universal (Code & Language)** | | | | | | [Exact dedup filter](transforms/universal/ededup/ray/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [Fuzzy dedup filter](transforms/universal/fdedup/ray/README.md) | | :white_check_mark: | | :white_check_mark: | @@ -223,11 +231,11 @@ If you use Data Prep Kit in your research, please cite our paper: @misc{wood2024dataprepkitgettingdataready, title={Data-Prep-Kit: getting your data ready for LLM application development}, author={David Wood and Boris Lublinsky and Alexy Roytman and Shivdeep Singh - and Abdulhamid Adebayo and Revital Eres and Mohammad Nassar and Hima Patel - and Yousaf Shah and Constantin Adam and Petros Zerfos and Nirmit Desai - and Daiki Tsuzuku and Takuya Goto and Michele Dolfi and Saptha Surendran - and Paramesvaran Selvam and Sungeun An and Yuan Chi Chang and Dhiraj Joshi - and Hajar Emami-Gohari and Xuan-Hong Dang and Yan Koyfman and Shahrokh Daijavad}, + and Constantin Adam and Abdulhamid Adebayo and Sungeun An and Yuan Chi Chang + and Xuan-Hong Dang and Nirmit Desai and Michele Dolfi and Hajar Emami-Gohari + and Revital Eres and Takuya Goto and Dhiraj Joshi and Yan Koyfman + and Mohammad Nassar and Hima Patel and Paramesvaran Selvam and Yousaf Shah + and Saptha Surendran and Daiki Tsuzuku and Petros Zerfos and Shahrokh Daijavad}, year={2024}, eprint={2409.18164}, archivePrefix={arXiv}, diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml index 4fcc97ed9..69e914f0c 100644 --- a/data-connector-lib/pyproject.toml +++ b/data-connector-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_connector" -version = "0.2.3.dev1" +version = "0.2.4.dev0" requires-python = ">=3.10,<3.13" keywords = [ "data", diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml index 2e827ea82..a347a14a1 100644 --- a/data-processing-lib/pyproject.toml +++ b/data-processing-lib/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.2.dev2" +version = "0.2.3.dev0" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray and Python" @@ -16,7 +16,6 @@ dynamic = ["dependencies", "optional-dependencies"] Repository = "https://github.com/IBM/data-prep-kit" Issues = "https://github.com/IBM/data-prep-kit/issues" Documentation = "https://ibm.github.io/data-prep-kit/doc" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] diff --git a/data-processing-lib/python/requirements.txt b/data-processing-lib/python/requirements.txt index 7b363f2b5..318d715d5 100644 --- a/data-processing-lib/python/requirements.txt 
+++ b/data-processing-lib/python/requirements.txt @@ -4,3 +4,4 @@ argparse mmh3 psutil + polars>=1.9.0 diff --git a/data-processing-lib/python/src/data_processing/utils/transform_utils.py b/data-processing-lib/python/src/data_processing/utils/transform_utils.py index e2d37581c..ccb7f3fe8 100644 --- a/data-processing-lib/python/src/data_processing/utils/transform_utils.py +++ b/data-processing-lib/python/src/data_processing/utils/transform_utils.py @@ -11,6 +11,7 @@ ################################################################################ import hashlib +import io import os import string import sys @@ -144,8 +145,21 @@ def convert_binary_to_arrow(data: bytes, schema: pa.schema = None) -> pa.Table: table = pq.read_table(reader, schema=schema) return table except Exception as e: - logger.error(f"Failed to convert byte array to arrow table, exception {e}. Skipping it") - return None + logger.warning(f"Could not convert bytes to pyarrow: {e}") + + # We have seen this exception before when using pyarrow, but polars does not throw it. + # "Nested data conversions not implemented for chunked array outputs" + # See issue 816 https://github.com/IBM/data-prep-kit/issues/816. + logger.info(f"Attempting read of pyarrow Table using polars") + try: + import polars + + df = polars.read_parquet(io.BytesIO(data)) + table = df.to_arrow() + except Exception as e: + logger.error(f"Could not convert bytes to pyarrow using polars: {e}. Skipping.") + table = None + return table @staticmethod def convert_arrow_to_binary(table: pa.Table) -> bytes: diff --git a/data-processing-lib/spark/Makefile b/data-processing-lib/spark/Makefile index d4769187b..5fde2bb07 100644 --- a/data-processing-lib/spark/Makefile +++ b/data-processing-lib/spark/Makefile @@ -11,9 +11,14 @@ setup:: set-versions: .check-env $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml - sed -e 's/"pyspark...*",/"pyspark>=${SPARK_VERSION}",/' \ - pyproject.toml > tt.toml - mv tt.toml pyproject.toml + if [ -e pyproject.toml ]; then \ + cat pyproject.toml | sed -e 's/"spark[default]==.*",/"spark[default]==$(SPARK_VERSION)",/' > tt.toml; \ + mv tt.toml pyproject.toml; \ + fi + if [ -e requirements.txt ]; then \ + cat requirements.txt | sed -e 's/ray[default]==.*/ray[default]==$(SPARK_VERSION)/' > tt.txt; \ + mv tt.txt requirements.txt; \ + fi build:: build-dist @@ -26,7 +31,7 @@ publish-dist :: .check-env .defaults.publish-dist publish-image:: .defaults.publish-image -venv:: pyproject.toml +venv:: $(MAKE) .defaults.spark-lib-src-venv pip install pytest pytest-cov diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index d7058f2ae..f09b2f32a 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 04b6bc7a2..01c5b3e17 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2.dev2", + "data-prep-toolkit-kfp-shared==0.2.3.dev0", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py index ac5e32689..28f36acf7 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py @@ -103,7 +103,7 @@ def _add_node_selector() -> None: def set_s3_env_vars_to_component( task: dsl.PipelineTask, secret: str = "", - env2key: Dict[str, str] = {"s3-key": "S3_KEY", "s3-secret": "S3_SECRET", "s3-endpoint": "ENDPOINT"}, + env2key: Dict[str, str] = None, prefix: str = None, ) -> None: """ @@ -113,6 +113,8 @@ def set_s3_env_vars_to_component( :param env2key: dict with mapping each env variable to a key in the secret :param prefix: prefix to add to env name """ + if env2key is None: + env2key = {"s3-key": "S3_KEY", "s3-secret": "S3_SECRET", "s3-endpoint": "ENDPOINT"} if prefix is not None: for secret_key, _ in env2key.items(): diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index df27ad1cf..e0a6235c1 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/release-notes.md b/release-notes.md index 15f23c542..4b7b8d553 100644 --- a/release-notes.md +++ b/release-notes.md @@ -1,5 +1,42 @@ # Data Prep Kit Release notes +## Release 0.2.2 - 11/25/2024 + +### General +1. Update RAG example to use granite model +1. Updated transforms with Docling 2 +1. Added single package for dpk with extra for \[spark\] and \[ray\] +1. Added single package for transforms with extra for \[all\] or \[individual-transform-name\] + + +### data-prep-toolkit libraries (python, ray, spark) + +1. Fix metadata logging even when actors crash +1. Add multilock for ray workers downloads/cleanup +1. Multiple updates to spark runtime +1. Added support for python 3.12 +1. refactoring of data access code + + +### KFP Workloads + +1. Modify superpipeline params type Str/json +1. 
Set kuberay apiserver version +1. Add Super pipeline for code transforms + + +### Transforms + +1. Enhance pdf2parquet with docling2 support for extracting HTML, DOCS, etc. +1. Added web2parquet transform +1. Added HAP transform + +### HTTP Connector 0.2.3 + +1. Enhanced parameter/configuration allows the user to customize crawler settings +1. implement subdomain focus feature in data-prep-connector + + ## Release 0.2.2- HTTP Connector Module - 10/23/2024 ### General diff --git a/resources.md b/resources.md index 4f5657a02..3164f5ce3 100644 --- a/resources.md +++ b/resources.md @@ -1,3 +1,8 @@ +# New Features & Enhancements + +- Support for Docling 2.0 added to DPK in [pdf2parquet](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet/python) transform. The new updates allow DPK users to ingest other type of documents, e.g. MS Word, MS Powerpoint, Images, Markdown, Asciidocs, etc. +- Released [Web2parquet](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/web2parquet) transform for crawling the web. + # Data Prep Kit Resources ## 📄 Papers @@ -7,24 +12,43 @@ 3. [Scaling Granite Code Models to 128K Context](https://arxiv.org/abs/2407.13739) -## 🎤 Talks +## 🎤 External Events and Showcase 1. **"Building Successful LLM Apps: The Power of high quality data"** - [Video](https://www.youtube.com/watch?v=u_2uiZBBVIE) | [Slides](https://www.slideshare.net/slideshow/data_prep_techniques_challenges_methods-pdf-a190/271527890) 2. **"Hands on session for fine tuning LLMs"** - [Video](https://www.youtube.com/watch?v=VEHIA3E64DM) 3. **"Build your own data preparation module using data-prep-kit"** - [Video](https://www.youtube.com/watch?v=0WUMG6HIgMg) 4. **"Data Prep Kit: A Comprehensive Cloud-Native Toolkit for Scalable Data Preparation in GenAI App"** - [Video](https://www.youtube.com/watch?v=WJ147TGULwo) | [Slides](https://ossaidevjapan24.sched.com/event/1jKBm) +5. **"RAG with Data Prep Kit" Workshop** @ Mountain View, CA, USA ** - [info](https://github.com/sujee/data-prep-kit-examples/blob/main/events/2024-09-21__RAG-workshop-data-riders.md) +6. **Tech Educator summit** [IBM CSR Event](https://www.linkedin.com/posts/aanchalaggarwal_github-ibmdata-prep-kit-open-source-project-activity-7254062098295472128-OA_x?utm_source=share&utm_medium=member_desktop) +7. **Talk and Hands on session** at [MIT Bangalore](https://www.linkedin.com/posts/saptha-surendran-71a4a0ab_ibmresearch-dataprepkit-llms-activity-7261987741087801346-h0no?utm_source=share&utm_medium=member_desktop) +8. **PyData NYC 2024** - [90 mins Tutorial](https://nyc2024.pydata.org/cfp/talk/AWLTZP/) +9. **Open Source AI** [Demo Night](https://lu.ma/oss-ai?tk=A8BgIt) +10. [**Data Exchange Podcast with Ben Lorica**](https://thedataexchange.media/ibm-data-prep-kit/) +11. Unstructured Data Meetup - SF, NYC, Silicon Valley +12. IBM TechXchange Las Vegas +13. Open Source [**RAG Pipeline workshop**](https://www.linkedin.com/posts/sujeemaniyam_dataprepkit-workshop-llm-activity-7256176802383986688-2UKc?utm_source=share&utm_medium=member_desktop) with Data Prep Kit at TechEquity's AI Summit in Silicon Valley +14. **Data Science Dojo Meetup** - [video](https://datasciencedojo.com/tutorial/data-preparation-toolkit/) +15. 
[**DPK tutorial and hands on session at IIIT Delhi**](https://www.linkedin.com/posts/cai-iiitd-97a6a4232_datascience-datapipelines-machinelearning-activity-7263121565125349376-FG8E?utm_source=share&utm_medium=member_desktop) + ## Example Code +Find example code in readme section of each tranform and some sample jupyter notebooks for getting started [**here**](examples/notebooks) ## Blogs / Tutorials - [**IBM Developer Blog**](https://developer.ibm.com/blogs/awb-unleash-potential-llms-data-prep-kit/) +- [**Introductory Blog on DPK**](https://www.linkedin.com/pulse/unleashing-potential-large-language-models-through-data-aanchal-goyal-fgtff) +- [**DPK Header Cleanser Module Blog by external contributor**](https://www.linkedin.com/pulse/enhancing-data-quality-developing-header-cleansing-tool-kalathiya-i1ohc/?trackingId=6iAeBkBBRrOLijg3LTzIGA%3D%3D) + -## Workshops +# Relevant online communities -- **2024-09-21: "RAG with Data Prep Kit" Workshop** @ Mountain View, CA, USA - [info](https://github.com/sujee/data-prep-kit-examples/blob/main/events/2024-09-21__RAG-workshop-data-riders.md) +- [**Data Prep Kit Discord Channel**](https://discord.com/channels/1276554812359442504/1303454647427661866) +- [**DPK is now listed in Github Awesome-LLM under LLM Data section**](https://github.com/Hannibal046/Awesome-LLM) +- [**DPK is now up for access via IBM Skills Build Download**](https://academic.ibm.com/a2mt/downloads/artificial_intelligence#/) +- [**DPK added to the Application Hub of “AI Sustainability Catalog”**](https://enterprise-neurosystem.github.io/Sustainability-Catalog/) -## Discord +## We Want Your Feedback! + Feel free to contribute to discussions or create a new one to share your [feedback](https://github.com/IBM/data-prep-kit/discussions) -- [**Data Prep Kit Discord Channel**](https://discord.com/channels/1276554812359442504/1286046139921207476) diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh index cb7b1ee10..7054a7b9a 100755 --- a/scripts/check-workflows.sh +++ b/scripts/check-workflows.sh @@ -17,7 +17,7 @@ if [ ! 
-d transforms ]; then echo Please run this script from the top of the repository exit 1 fi -KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser" +KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser fdedup" while [ $# -ne 0 ]; do case $1 in -show-kfp-black-list) echo $KFP_BLACK_LIST; exit 0; diff --git a/scripts/k8s-setup/requirements.env b/scripts/k8s-setup/requirements.env index 756cd69e4..9fbc5f9de 100644 --- a/scripts/k8s-setup/requirements.env +++ b/scripts/k8s-setup/requirements.env @@ -1,5 +1,5 @@ KUBERAY_OPERATOR=1.0.0 -KUBERAY_APISERVER=1.1.0 +KUBERAY_APISERVER=1.2.2 KIND_VERSION=0.22.0 HELM_VERSION=3.10.0 diff --git a/scripts/k8s-setup/tools/install_kuberay.sh b/scripts/k8s-setup/tools/install_kuberay.sh index 341ad2c85..30563b931 100755 --- a/scripts/k8s-setup/tools/install_kuberay.sh +++ b/scripts/k8s-setup/tools/install_kuberay.sh @@ -9,10 +9,10 @@ MAX_RETRIES="${MAX_RETRIES:-5}" EXIT_CODE=0 deploy() { - sed -i.back "s/tag: v[0-9].*/tag: v${KUBERAY_APISERVER}/" ${K8S_SETUP_SCRIPTS}/ray_api_server_values.yaml helm repo add kuberay https://ray-project.github.io/kuberay-helm/ helm repo update kuberay helm install kuberay-operator kuberay/kuberay-operator -n kuberay --version ${KUBERAY_OPERATOR} --set image.pullPolicy=IfNotPresent --create-namespace + sed -i.back "s/tag: v[0-9].*/tag: v${KUBERAY_APISERVER}/" ${K8S_SETUP_SCRIPTS}/ray_api_server_values.yaml helm install -f ${K8S_SETUP_SCRIPTS}/ray_api_server_values.yaml kuberay-apiserver kuberay/kuberay-apiserver -n kuberay --version ${KUBERAY_APISERVER} --set image.pullPolicy=IfNotPresent echo "Finished KubeRay deployment." } diff --git a/transforms/Makefile b/transforms/Makefile index 3e8b9cfde..ed492db4d 100644 --- a/transforms/Makefile +++ b/transforms/Makefile @@ -107,7 +107,7 @@ build-pkg-dist: -rm -fr src mkdir src # Copy all the src folders recursively (not clear if they have subfolders) - for x in $(shell find . | grep '[ray| python]/src$$') ; do \ + for x in $(shell find . | grep '[ray| python | spark]/src$$') ; do \ echo $$x ; \ if [ -d "$$x" ]; then \ cp -r $$x/* src ; \ diff --git a/transforms/README-list.md b/transforms/README-list.md index 3e70b6b62..8040dc7a9 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -36,8 +36,13 @@ Note: This list includes the transforms that were part of the release starting w * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/python/README.md) * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/doc_id/python/README.md) * [web2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/web2parquet/README.md) + +## Release notes: - +### 0.2.2.dev3 +* web2parquet +### 0.2.2.dev2 +* pdf2parquet now supports HTML,DOCX,PPTX, ... 
in addition to PDF diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 5e6f41bb2..be84b2f20 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt index bbb84b749..08deeb7d9 100644 --- a/transforms/code/code2parquet/python/requirements.txt +++ b/transforms/code/code2parquet/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 parameterized pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 15a4be4c1..923e2d4f3 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk-code2parquet-transform-python==0.2.2.dev2", + "data-prep-toolkit[ray]>=0.2.3.dev0", + "dpk-code2parquet-transform-python==0.2.3.dev0", "parameterized", "pandas", ] diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml index 492603d54..334c86fed 100644 --- a/transforms/code/code_profiler/python/pyproject.toml +++ b/transforms/code/code_profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt index 8608c6d6e..b36eabb8d 100644 --- a/transforms/code/code_profiler/python/requirements.txt +++ b/transforms/code/code_profiler/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 parameterized pandas aiolimiter==1.1.0 diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml index 933152e3f..dbd552e93 100644 --- a/transforms/code/code_profiler/ray/pyproject.toml +++ b/transforms/code/code_profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Pankaj Thorat", email = "pankaj.thorat@ibm.com" }, ] dependencies = [ - "dpk-code-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 5f201c8ae..17cbce67d 100644 --- 
a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt index 0bd936ef2..21ada1a79 100644 --- a/transforms/code/code_quality/python/requirements.txt +++ b/transforms/code/code_quality/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 bs4==0.0.2 transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index 290429f95..70dcd445b 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-code-quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index ecaf4d7bb..3703ec55f 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt index c2d0d8793..7a0fe8d28 100644 --- a/transforms/code/header_cleanser/python/requirements.txt +++ b/transforms/code/header_cleanser/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index adff71cfc..896f451ad 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-header-cleanser-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 30f2f001e..3345d3a5a 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ 
b/transforms/code/license_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt index 368287e5d..08447f212 100644 --- a/transforms/code/license_select/python/requirements.txt +++ b/transforms/code/license_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 815121787..b5facfffe 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-license-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 22d92fd8c..29db772a6 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit>=0.2.3.dev0", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index 791b8d253..9f9e78377 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-malware-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index 186198d83..e5736a9c7 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt index 
368287e5d..08447f212 100644 --- a/transforms/code/proglang_select/python/requirements.txt +++ b/transforms/code/proglang_select/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index bf3e5f9f4..321eb8f19 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-proglang-select-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index 80440a362..2481a1bf8 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]>=0.2.3.dev0", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/doc_chunk.ipynb b/transforms/language/doc_chunk/doc_chunk.ipynb new file mode 100644 index 000000000..3a8466037 --- /dev/null +++ b/transforms/language/doc_chunk/doc_chunk.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. We will only show the use of data_files_to_use and doc_chunk_chunking_type. 
For a complete list of parameters, please refer to the README.md for this transform\n", + "##### \n", + "| parameter:type | value | Description |\n", + "| --- | --- | --- |\n", + "|data_files_to_use: list | .parquet | Process all parquet files in the input folder |\n", + "| doc_chunk_chunking_type: str | dl_json | |\n" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from doc_chunk_transform_python import DocChunkPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.parquet']\"),\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " \"doc_chunk_chunking_type\": \"dl_json\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "15:19:48 INFO - pipeline id pipeline_id\n", + "15:19:48 INFO - code location None\n", + "15:19:48 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "15:19:48 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:19:48 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:19:48 INFO - orchestrator doc_chunk started at 2024-11-20 15:19:48\n", + "15:19:48 INFO - Number of files is 1, source profile {'max_file_size': 0.011513710021972656, 'min_file_size': 0.011513710021972656, 'total_file_size': 0.011513710021972656}\n", + "15:19:48 INFO - Completed 1 files (100.0%) in 0.001 min\n", + "15:19:48 INFO - Done processing 1 files, waiting for flush() completion.\n", + "15:19:48 INFO - done flushing in 0.0 sec\n", + "15:19:48 INFO - Completed execution in 0.001 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=DocChunkPythonTransformConfiguration())\n", + "launcher.launch()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + 
"source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md index 9abca2b79..1ec3a8080 100644 --- a/transforms/language/doc_chunk/python/README.md +++ b/transforms/language/doc_chunk/python/README.md @@ -1,5 +1,16 @@ # Chunk documents Transform +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Contributors + +- Michele Dolfi (dol@zurich.ibm.com) + +## Description + This transform is chunking documents. It supports multiple _chunker modules_ (see the `chunking_type` parameter). When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker` @@ -9,20 +20,26 @@ which provides the required JSON structure. When using documents converted to Markdown, the transform leverages the [Llama Index](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) `MarkdownNodeParser`, which is relying on its internal Markdown splitting logic. -## Output format + +### Input + +| input column name | data type | description | +|-|-|-| +| the one specified in _content_column_name_ configuration | string | the content used in this transform | + + +### Output format The output parquet file will contain all the original columns, but the content will be replaced with the individual chunks. -### Tracing the origin of the chunks +#### Tracing the origin of the chunks The transform allows to trace the origin of the chunk with the `source_doc_id` which is set to the value of the `document_id` column (if present) in the input table. The actual name of columns can be customized with the parameters described below. -## Running - -### Parameters +## Configuration The transform can be tuned with the following parameters. @@ -40,6 +57,12 @@ The transform can be tuned with the following parameters. | `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. | | `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. | + + +## Usage + +### Launched Command Line Options + When invoking the CLI, the parameters must be set as `--doc_chunk_`, e.g. `--doc_chunk_column_name_key=myoutput`. @@ -63,8 +86,32 @@ ls output ``` To see results of the transform. 
+### Code example + +TBD (link to the notebook will be provided) + +See the sample script [src/doc_chunk_local_python.py](src/doc_chunk_local_python.py). + + ### Transforming data using the transform image To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. + +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](test/test_doc_chunk_python.py) + + +## Further Resource + +- For the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker` + - +- For the Markdown chunker in LlamaIndex + - [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) +- For the Token Text Splitter in LlamaIndex + - [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/) diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index dd076d0e0..6488e9c68 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,3 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 docling-core==2.3.0 +pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 29b594fac..774e9fc13 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -12,7 +12,7 @@ authors = [ ] dependencies = [ "dpk-doc-chunk-transform-python==0.3.0", - "data-prep-toolkit[ray]==0.2.2.dev2", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/language/doc_quality/doc_quality.ipynb b/transforms/language/doc_quality/doc_quality.ipynb new file mode 100644 index 000000000..bf91047b6 --- /dev/null +++ b/transforms/language/doc_quality/doc_quality.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install data-prep-toolkit\n", + "%pip install data-prep-toolkit-transforms==0.2.2.dev3" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding DocQualityTransform configuration for values are as follows: \n", + "* text_lang - specifies language used in the text content. 
By default, \"en\" is used.\n", + "* doc_content_column - specifies column name that contains document text. By default, \"contents\" is used.\n", + "* bad_word_filepath - specifies a path to bad word file: local folder (file or directory) that points to bad word file. You don't have to set this parameter if you don't need to set bad words.\n", + "#####" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from doc_quality_transform import (bad_word_filepath_cli_param, doc_content_column_cli_param, text_lang_cli_param,)\n", + "from doc_quality_transform_python import DocQualityPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# create parameters\n", + "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " # execution info\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", + " # doc_quality params\n", + " text_lang_cli_param: \"en\",\n", + " doc_content_column_cli_param: \"contents\",\n", + " bad_word_filepath_cli_param: os.path.join(\"python\", \"ldnoobw\", \"en\"),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "12:39:07 INFO - doc_quality parameters are : {'text_lang': 'en', 'doc_content_column': 'contents', 'bad_word_filepath': 'python/ldnoobw/en', 's3_cred': None, 'docq_data_factory': }\n", + "12:39:07 INFO - pipeline id pipeline_id\n", + "12:39:07 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "12:39:07 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "12:39:07 INFO - data factory data_ max_files -1, n_sample -1\n", + "12:39:07 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "12:39:07 INFO - orchestrator docq started at 2024-11-25 12:39:07\n", + "12:39:07 INFO - Number of files is 1, source profile {'max_file_size': 0.0009870529174804688, 'min_file_size': 0.0009870529174804688, 'total_file_size': 0.0009870529174804688}\n", + "12:39:07 INFO - Load badwords found locally from python/ldnoobw/en\n", + "12:39:09 INFO - Completed 1 files (100.0%) in 0.033 min\n", + "12:39:09 INFO - Done processing 1 files, waiting for flush() completion.\n", + "12:39:09 INFO - done flushing in 0.0 sec\n", + "12:39:09 INFO - Completed execution in 0.033 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=DocQualityPythonTransformConfiguration())\n", + "launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/doc_quality/python/README.md b/transforms/language/doc_quality/python/README.md index 38421f34f..c10bc4b88 100644 --- a/transforms/language/doc_quality/python/README.md +++ b/transforms/language/doc_quality/python/README.md @@ -1,13 +1,25 @@ # Document Quality Transform + Please see the set of [transform project conventions](../../../README.md#transform-project-conventions) for details on general project conventions, transform configuration, testing and IDE set up. -## Summary -This transform will calculate and annotate several metrics related to document, which are usuful to see the quality of document. +## Contributors + +- Daiki Tsuzuku (dtsuzuku@jp.ibm.com) + +## Description +This transform will calculate and annotate several metrics which are useful to assess the quality of the document. +The document quality transform operates on text documents only + +### Input -In this transform, following metrics will be included: +| input column name | data type | description | +|-|-|-| +| the one specified in _doc_content_column_ configuration | string | text whose quality will be calculated by this transform | + +### Output columns annotated by this transform | output column name | data type | description | supported language | |-|-|-|-| @@ -27,7 +39,7 @@ In this transform, following metrics will be included: You can see more detailed backgrounds of some columns in [Deepmind's Gopher paper](https://arxiv.org/pdf/2112.11446.pdf) -## Configuration and command line Options +## Configuration The set of dictionary keys holding [DocQualityTransform](src/doc_quality_transform.py) configuration for values are as follows: @@ -36,13 +48,19 @@ configuration for values are as follows: * _doc_content_column_ - specifies column name that contains document text. By default, "contents" is used. * _bad_word_filepath_ - specifies a path to bad word file: local folder (file or directory) that points to bad word file. You don't have to set this parameter if you don't need to set bad words. -## Running +Example +``` +{ + text_lang_key: "en", + doc_content_column_key: "contents", + bad_word_filepath_key: os.path.join(basedir, "ldnoobw", "en"), +} +``` + +## Usage ### Launched Command Line Options -When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -the options provided by -the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). 
+The following command line arguments are available ``` --docq_text_lang DOCQ_TEXT_LANG language used in the text content. By default, "en" is used. --docq_doc_content_column DOCQ_DOC_CONTENT_COLUMN column name that contain document text. By default, "contents" is used. @@ -70,6 +88,9 @@ ls output ``` To see results of the transform. +### Code example + +[notebook](../doc_quality.ipynb) ### Transforming data using the transform image @@ -77,7 +98,27 @@ To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](test/test_doc_quality_python.py) +- [Integration test](test/test_doc_quality.py) + + +## Further Resource + +- For those who want to learn C4 heuristic rules + - https://arxiv.org/pdf/1910.10683.pdf +- For those who want to learn Gopher statistics + - https://arxiv.org/pdf/2112.11446.pdf +- For those who want to see the source of badwords used by default + - https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words + + +## Consideration -## Troubleshooting guide +### Troubleshooting guide For M1 Mac user, if you see following error during make command, `error: command '/usr/bin/clang' failed with exit code 1`, you may better follow [this step](https://freeman.vc/notes/installing-fasttext-on-an-m1-mac) \ No newline at end of file diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 72406e945..23538b8c7 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt index 2993d6b12..fddab961a 100644 --- a/transforms/language/doc_quality/python/requirements.txt +++ b/transforms/language/doc_quality/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index dc13d5f94..6395c45b4 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-doc_quality-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/language/html2parquet/requirements.txt b/transforms/language/html2parquet/requirements.txt index af6ffe1e5..fdd84b1e8 100644 --- a/transforms/language/html2parquet/requirements.txt +++ b/transforms/language/html2parquet/requirements.txt @@ -1,2 +1,2 @@ 
-data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 trafilatura==1.12.0 diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index c5de6826a..a69724a2d 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt index a405f7afc..828ec54c3 100644 --- a/transforms/language/lang_id/python/requirements.txt +++ b/transforms/language/lang_id/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 fasttext==0.9.2 langcodes==3.3.0 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index ac45a167e..777e0d718 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-lang_id-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/language/pdf2parquet/README.md b/transforms/language/pdf2parquet/README.md index 14373a68c..89a53147d 100644 --- a/transforms/language/pdf2parquet/README.md +++ b/transforms/language/pdf2parquet/README.md @@ -1,10 +1,10 @@ -# PDF2PARQUET Transform +# Pdf2Parquet Transform -The PDF2PARQUET transforms iterate through PDF files or zip of PDF files and generates parquet files -containing the converted document in Markdown format. +The Pdf2Parquet transforms iterate through PDF, Docx, Pptx, Images files or zip of files and generates parquet files +containing the converted document in Markdown or JSON format. -The PDF conversion is using the [Docling package](https://github.com/DS4SD/docling). +The conversion is using the [Docling package](https://github.com/DS4SD/docling). The following runtimes are available: diff --git a/transforms/language/pdf2parquet/pdf2parquet.ipynb b/transforms/language/pdf2parquet/pdf2parquet.ipynb new file mode 100644 index 000000000..2d26741b3 --- /dev/null +++ b/transforms/language/pdf2parquet/pdf2parquet.ipynb @@ -0,0 +1,212 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. We will only show the use of double_precision. For a complete list, please refer to the README.md for this transform.\n", + "##### \n", + "| parameter:type | Description |\n", + "| --- | --- |\n", + "| data_files_to_use: list | list of file extensions in the input folder to use for running the transform |\n", + "|pdf2parquet_double_precision: int | If set, all floating points (e.g. bounding boxes) are rounded to this precision. For tests it is advised to use 0 |\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# create parameters\n", + "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " # Data access. 
Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.pdf','.docx','.pptx','.zip']\"),\n", + " # execution info\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " # pdf2parquet params\n", + " \"pdf2parquet_double_precision\": 0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "15:13:18 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 0}\n", + "15:13:18 INFO - pipeline id pipeline_id\n", + "15:13:18 INFO - code location None\n", + "15:13:18 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "15:13:18 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:13:18 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf', '.docx', '.pptx', '.zip'], files to checkpoint ['.parquet']\n", + "15:13:18 INFO - orchestrator pdf2parquet started at 2024-11-20 15:13:18\n", + "15:13:18 INFO - Number of files is 2, source profile {'max_file_size': 0.3013172149658203, 'min_file_size': 0.2757863998413086, 'total_file_size': 0.5771036148071289}\n", + "15:13:18 INFO - Initializing models\n", + "15:14:08 INFO - Processing archive_doc_filename='2305.03393v1-pg9.pdf' \n", + "15:14:09 INFO - Processing archive_doc_filename='2408.09869v1-pg1.pdf' \n", + "15:14:10 INFO - Completed 1 files (50.0%) in 0.04 min\n", + "15:14:18 INFO - Completed 2 files (100.0%) in 0.179 min\n", + "15:14:18 INFO - Done processing 2 files, waiting for flush() completion.\n", + "15:14:18 INFO - done flushing in 0.0 sec\n", + "15:14:18 INFO - Completed execution in 1.007 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=Pdf2ParquetPythonTransformConfiguration())\n", + "launcher.launch()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/redp5110-ch1.parquet',\n", + " 'python/output/metadata.json',\n", + " 'python/output/archive1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef6667e-71ed-4054-9382-55c6bb3fda70", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/pdf2parquet/python/README.md b/transforms/language/pdf2parquet/python/README.md index a4bd31e06..d9dc2520a 100644 --- a/transforms/language/pdf2parquet/python/README.md +++ b/transforms/language/pdf2parquet/python/README.md @@ -1,4 +1,15 @@ -# Ingest PDF to Parquet +# Ingest PDF to Parquet Transform + +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Contributors + +- Michele Dolfi (dol@zurich.ibm.com) + +## Description This tranforms iterate through document files or zip of files and generates parquet files containing the converted document in Markdown or JSON format. @@ -7,6 +18,9 @@ The PDF conversion is using the [Docling package](https://github.com/DS4SD/docli The Docling configuration in DPK is tuned for best results when running large batch ingestions. For more details on the multiple configuration options, please refer to the official [Docling documentation](https://ds4sd.github.io/docling/). + +### Input files + This transform supports the following input formats: - PDF documents @@ -17,37 +31,39 @@ This transform supports the following input formats: - Markdown documents - ASCII Docs documents +The input documents can be provided in a folder structure, or as a zip archive. +Please see the configuration section for specifying the input files. 
-## Output format -The output format will contain all the columns of the metadata CSV file, -with the addition of the following columns +### Output format -```jsonc -{ - "source_filename": "string", // the basename of the source archive or file - "filename": "string", // the basename of the PDF file - "contents": "string", // the content of the PDF - "document_id": "string", // the document id, a random uuid4 - "document_hash": "string", // the document hash of the input content - "ext": "string", // the detected file extension - "hash": "string", // the hash of the `contents` column - "size": "string", // the size of `contents` - "date_acquired": "date", // the date when the transform was executing - "num_pages": "number", // number of pages in the PDF - "num_tables": "number", // number of tables in the PDF - "num_doc_elements": "number", // number of document elements in the PDF - "pdf_convert_time": "float", // time taken to convert the document in seconds -} -``` +The output table will contain following columns +| output column name | data type | description | +|-|-|-| +| source_filename | string | the basename of the source archive or file | +| filename | string | the basename of the PDF file | +| contents | string | the content of the PDF | +| document_id | string | the document id, a random uuid4 | +| document_hash | string | the document hash of the input content | +| ext | string | the detected file extension | +| hash | string | the hash of the `contents` column | +| size | string | the size of `contents` | +| date_acquired | date | the date when the transform was executing | +| num_pages | number | number of pages in the PDF | +| num_tables | number | number of tables in the PDF | +| num_doc_elements | number | number of document elements in the PDF | +| pdf_convert_time | float | time taken to convert the document in seconds | -## Parameters + + +## Configuration The transform can be initialized with the following parameters. | Parameter | Default | Description | |------------|----------|--------------| +| `data_files_to_use` | - | The files extensions to be considered when running the transform. Example value `['.pdf','.docx','.pptx','.zip']`. For all the supported input formats, see the section above. | | `batch_size` | -1 | Number of documents to be saved in the same result table. A value of -1 will generate one result file for each input file. | | `artifacts_path` | | Path where to Docling models artifacts are located, if unset they will be downloaded and fetched from the [HF_HUB_CACHE](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache) folder. | | `contents_type` | `text/markdown` | The output type for the `contents` column. Valid types are `text/markdown`, `text/plain` and `application/json`. | @@ -58,9 +74,68 @@ The transform can be initialized with the following parameters. | `pdf_backend` | `dlparse_v2` | The PDF backend to use. Valid values are `dlparse_v2`, `dlparse_v1`, `pypdfium2`. | | `double_precision` | `8` | If set, all floating points (e.g. bounding boxes) are rounded to this precision. For tests it is advised to use 0. | + +Example + +```py +{ + "data_files_to_use": ast.literal_eval("['.pdf','.docx','.pptx','.zip']"), + "contents_type": "application/json", + "do_ocr": True, +} +``` + +## Usage + +### Launched Command Line Options + When invoking the CLI, the parameters must be set as `--pdf2parquet_`, e.g. `--pdf2parquet_do_ocr=true`. 
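The same prefix convention applies when the transform is driven from Python: each configuration key carries the `pdf2parquet_` prefix and is converted into the corresponding `--pdf2parquet_...` argument. A minimal sketch, following the `pdf2parquet.ipynb` notebook added in this PR (the folder paths are placeholders):

```python
import ast
import os
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from pdf2parquet_transform_python import Pdf2ParquetPythonTransformConfiguration

# Placeholder folders: adapt to your own data layout.
local_conf = {
    "input_folder": os.path.join("python", "test-data", "input"),
    "output_folder": os.path.join("python", "output"),
}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    # Select which input formats the transform should pick up.
    "data_files_to_use": ast.literal_eval("['.pdf','.docx','.pptx','.zip']"),
    # pdf2parquet options use the pdf2parquet_ prefix, matching the
    # --pdf2parquet_* command line flags documented above.
    "pdf2parquet_double_precision": 0,
}

# ParamsUtils.dict_to_req builds the equivalent command line, which the
# launcher then parses from sys.argv.
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = PythonTransformLauncher(runtime_config=Pdf2ParquetPythonTransformConfiguration())
launcher.launch()
```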
+### Running the samples +To run the samples, use the following `make` targets + +* `run-cli-sample` - runs src/pdf2parquet_transform_python.py using command line args +* `run-local-sample` - runs src/pdf2parquet_local.py +* `run-local-python-sample` - runs src/pdf2parquet_local_python.py + +These targets will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-local-python-sample +... +``` +Then +```shell +ls output +``` +To see results of the transform. + + +### Code example + +TBD (link to the notebook will be provided) + +See the sample script [src/pdf2parquet_local_python.py](src/pdf2parquet_local_python.py). + + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py) +- [Integration test](transforms/language/pdf2parquet/python/test/test_pdf2parquet.py) + + ## Credits The PDF document conversion is developed by the AI for Knowledge group in IBM Research Zurich. diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index 2912af252..19f394d6c 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 docling-core==2.3.0 docling-ibm-models==2.0.3 deepsearch-glm==0.26.1 diff --git a/transforms/language/pdf2parquet/ray/README.md b/transforms/language/pdf2parquet/ray/README.md index 5ef98f645..4db4b47c7 100644 --- a/transforms/language/pdf2parquet/ray/README.md +++ b/transforms/language/pdf2parquet/ray/README.md @@ -1,7 +1,55 @@ -# PDF2PARQUET Ray Transform +# Ingest PDF to Parquet Ray Transform +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. This module implements the ray version of the [pdf2parquet transform](../python/). +## Summary +This project wraps the [Ingest PDF to Parquet transform](../python) with a Ray runtime. + +## Configuration and command line Options + +Ingest PDF to Parquet configuration and command line options are the same as for the base python transform. + +## Running + +### Launched Command Line Options +When running the transform with the Ray launcher (i.e. TransformLauncher), +In addition to those available to the transform as defined in [here](../python/README.md), +the set of +[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. + +### Running the samples +To run the samples, use the following `make` targets + +* `run-cli-sample` - runs src/pdf2parquet_transform_ray.py using command line args +* `run-local-sample` - runs src/pdf2parquet_local_ray.py +* `run-s3-sample` - runs src/pdf2parquet_s3_ray.py + * Requires prior invocation of `make minio-start` to load data into local minio for S3 access. 
+ +These targets will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-cli-sample +... +``` +Then +```shell +ls output +``` +To see results of the transform. + + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + ## Prometheus metrics diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 2b414c59e..93b9c3f96 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ dpk-pdf2parquet-transform-python==0.3.0 -data-prep-toolkit[ray]==0.2.2.dev2 +data-prep-toolkit[ray]>=0.2.3.dev0 # docling-core==1.7.2 # docling-ibm-models==2.0.0 # deepsearch-glm==0.22.0 diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index 72c1bf783..9e490e730 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "PII redactor Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/language/pii_redactor/python/requirements.txt b/transforms/language/pii_redactor/python/requirements.txt index 958210865..5c3d41aa8 100644 --- a/transforms/language/pii_redactor/python/requirements.txt +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 presidio-analyzer>=2.2.355 presidio-anonymizer>=2.2.355 flair>=0.14.0 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index b96f16615..a3648e80d 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk_pii_redactor_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/python/README.md b/transforms/language/text_encoder/python/README.md index 4c927d1ed..fa9c54ada 100644 --- a/transforms/language/text_encoder/python/README.md +++ b/transforms/language/text_encoder/python/README.md @@ -1,14 +1,36 @@ # Text Encoder Transform -## Summary +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. 
+ +## Contributors + +- Michele Dolfi (dol@zurich.ibm.com) + +## Description + This transform is using [sentence encoder models](https://en.wikipedia.org/wiki/Sentence_embedding) to create embedding vectors of the text in each row of the input .parquet table. The embeddings vectors generated by the transform are useful for tasks like sentence similarity, features extraction, etc which are also at the core of retrieval-augmented generation (RAG) applications. +### Input + +| input column name | data type | description | +|-|-|-| +| the one specified in _content_column_name_ configuration | string | the content used in this transform | + + +### Output columns + + +| output column name | data type | description | +|-|-|-| +| the one specified in _output_embeddings_column_name_ configuration | `array[float]` | the embeddings vectors of the content | -## Running -### Parameters +## Configuration The transform can be tuned with the following parameters. @@ -18,7 +40,11 @@ The transform can be tuned with the following parameters. | `model_name` | `BAAI/bge-small-en-v1.5` | The HF model to use for encoding the text. | | `content_column_name` | `contents` | Name of the column containing the text to be encoded. | | `output_embeddings_column_name` | `embeddings` | Column name to store the embeddings in the output table. | -| `output_path_column_name` | `doc_path` | Column name to store the document path of the chunk in the output table. | + + +## Usage + +### Launched Command Line Options When invoking the CLI, the parameters must be set as `--text_encoder_`, e.g. `--text_encoder_column_name_key=myoutput`. @@ -43,8 +69,20 @@ ls output ``` To see results of the transform. +### Code example + +TBD (link to the notebook will be provided) + + ### Transforming data using the transform image To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. 
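Until the notebook link above is filled in, the sketch below shows how the transform can be invoked through the pure-python launcher; it mirrors the `text_encoder.ipynb` notebook added in this PR and relies on the default encoder settings from the configuration table (the folder paths are placeholders). Encoder-specific options such as the model name would be passed with the `text_encoder_` prefix described above.

```python
import os
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from text_encoder_transform_python import TextEncoderPythonTransformConfiguration

# Placeholder folders: adapt to your own data layout.
local_conf = {
    "input_folder": os.path.join("python", "test-data", "input"),
    "output_folder": os.path.join("python", "output"),
}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    # Defaults are used for model_name, content_column_name and
    # output_embeddings_column_name; override them via text_encoder_-prefixed keys.
}

sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = PythonTransformLauncher(runtime_config=TextEncoderPythonTransformConfiguration())
launcher.launch()
```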
+ +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](test/test_text_encoder_python.py) \ No newline at end of file diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 87dad3c1c..dc15beb6e 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt index 2eb79e69b..5a1cae43d 100644 --- a/transforms/language/text_encoder/python/requirements.txt +++ b/transforms/language/text_encoder/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index ef08f697a..530f890d2 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-text_encoder-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/language/text_encoder/text_encoder.ipynb b/transforms/language/text_encoder/text_encoder.ipynb new file mode 100644 index 000000000..aca309594 --- /dev/null +++ b/transforms/language/text_encoder/text_encoder.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. For this notebook, we use all the default parameters. 
For a complete list of parameters, please refer to the README.md for this transform.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from text_encoder_transform_python import TextEncoderPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "input_folder = os.path.join (\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "15:44:57 INFO - pipeline id pipeline_id\n", + "15:44:57 INFO - code location None\n", + "15:44:57 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "15:44:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:44:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:44:57 INFO - orchestrator text_encoder started at 2024-11-20 15:44:57\n", + "15:44:57 INFO - Number of files is 1, source profile {'max_file_size': 0.0010089874267578125, 'min_file_size': 0.0010089874267578125, 'total_file_size': 0.0010089874267578125}\n", + "15:44:58 INFO - Completed 1 files (100.0%) in 0.003 min\n", + "15:44:58 INFO - Done processing 1 files, waiting for flush() completion.\n", + "15:44:58 INFO - done flushing in 0.0 sec\n", + "15:44:58 INFO - Completed execution in 0.017 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=TextEncoderPythonTransformConfiguration())\n", + "launcher.launch()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml index 2357553e4..d6e0d2fdd 100644 --- a/transforms/pyproject.toml +++ b/transforms/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.2.dev3" +version = "0.2.3.dev1" requires-python = ">=3.10,<3.13" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -44,6 +44,7 @@ all = { file = [ "universal/hap/python/requirements.txt", "universal/tokenization/python/requirements.txt", "universal/ededup/python/requirements.txt", +"universal/fdedup/python/requirements.txt", "universal/profiler/python/requirements.txt", "universal/doc_id/python/requirements.txt", "universal/filter/python/requirements.txt", @@ -71,6 +72,7 @@ pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} hap = { file = ["universal/hap/python/requirements.txt"]} tokenization = { file = ["universal/tokenization/python/requirements.txt"]} ededup = { file = ["universal/ededup/python/requirements.txt"]} +fdedup = { file = ["universal/fdedup/python/requirements.txt"]} profiler = { file = ["universal/profiler/python/requirements.txt"]} doc_id = { file = ["universal/doc_id/python/requirements.txt"]} filter = { file = ["universal/filter/python/requirements.txt"]} @@ -80,11 +82,11 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] +#[tool.setuptools.package-data] +#"*" = ["*.txt"] -[tool.setuptools.packages.find] -where = ["src"] +#[tool.setuptools.packages.find] +#where = ["src"] #[tool.setuptools.package-dir] #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt index 9012f685b..b0527bdd6 100644 --- a/transforms/requirements-ray.txt +++ b/transforms/requirements-ray.txt @@ -1,4 +1,4 @@ -data-prep-toolkit[ray]>=0.2.2.dev2 +data-prep-toolkit[ray]>=0.2.3.dev0 networkx==3.3 colorlog==6.8.2 func-timeout==4.3.5 diff --git a/transforms/requirements.txt b/transforms/requirements.txt index 8b48a970f..934c95182 100644 --- a/transforms/requirements.txt +++ b/transforms/requirements.txt @@ -1 +1 @@ -data-prep-toolkit>=0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 diff --git a/transforms/transform.config b/transforms/transform.config index c226171c6..7bafba684 100644 --- a/transforms/transform.config +++ b/transforms/transform.config @@ -7,11 +7,3 @@ # expected files and is used to define the transform's 
image name. TRANSFORM_NAME=data-prep-kit-transforms -################################################################################ -# This defines the transforms' package version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -TRANSFORMS_PKG_VERSION=0.2.2.dev2 diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 0e2658087..1a962662d 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt index 368287e5d..08447f212 100644 --- a/transforms/universal/doc_id/python/requirements.txt +++ b/transforms/universal/doc_id/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index 5a5941155..372f39762 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk_doc_id_transform_python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 36f345c09..369a1bb72 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]==0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/ededup/README.md b/transforms/universal/ededup/README.md index 9a112e816..0390cc19c 100644 --- a/transforms/universal/ededup/README.md +++ b/transforms/universal/ededup/README.md @@ -1,4 +1,4 @@ -# Exect Deduplification Transform +# Exact Deduplication Transform ## Summary diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 735104f20..da28e715f 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = 
"ededup Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt index 75baaef62..b5082bf0b 100644 --- a/transforms/universal/ededup/python/requirements.txt +++ b/transforms/universal/ededup/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 mmh3>=4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 9e3885e50..43045fed7 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_ededup_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]>=0.2.3.dev0", + "dpk_ededup_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/README.md b/transforms/universal/fdedup/README.md index e128566d2..fed3c1370 100644 --- a/transforms/universal/fdedup/README.md +++ b/transforms/universal/fdedup/README.md @@ -1,10 +1,11 @@ -# Fuzzy Deduplification Transform -The fdedup transforms removes documents that are very similar to each other within a set of parquet files, -per the set of -[transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: +# Fuzzy Deduplication Transform +The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a +set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. +Per the set of [transform project conventions](../../README.md#transform-project-conventions) the following runtimes are available: -* [ray](ray/README.md) - enables the running of the base python transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -in a kubernetes cluster using a generated `yaml` file. +* [python](python/README.md) - enables running the base transform in a pure python environment +* [ray](ray/README.md) - enables running the base python transform in a Ray runtime +* [spark](spark/README.md) - enables running the base python transform in a spark runtime +* [kfp](kfp_ray/README.md) - enables running the ray docker image in a kubernetes cluster using a generated `yaml` file. + +Please check [here](python/README.md) for a more detailed description of this transform. diff --git a/transforms/universal/fdedup/fdedup_python.ipynb b/transforms/universal/fdedup/fdedup_python.ipynb new file mode 100644 index 000000000..684583ffd --- /dev/null +++ b/transforms/universal/fdedup/fdedup_python.ipynb @@ -0,0 +1,564 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args, ServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/ray/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/ray/output/ | folder that contains the all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"python\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:29 INFO - Starting SignatureCalculation step\n", + "13:30:29 INFO - Got parameters for SignatureCalculation\n", + "13:30:29 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "13:30:29 INFO - data factory scdata_ is using local configuration without input/output path\n", + "13:30:29 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "13:30:29 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:29 INFO - pipeline id pipeline_id\n", + "13:30:29 INFO - code location None\n", + "13:30:29 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", + "13:30:29 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:29 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:29 INFO - orchestrator minhash started at 2024-11-26 13:30:29\n", + "13:30:29 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", + "13:30:33 INFO - Completed 1 files (50.0%) in 0.074 min\n", + "13:30:33 INFO - Completed 2 files (100.0%) in 0.074 min\n", + "13:30:33 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:30:33 INFO - Starting flush()\n", + "13:30:34 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", + "13:30:34 INFO - done flushing in 0.063 sec\n", + "13:30:34 INFO - Completed execution in 0.075 min, execution result 0\n", + "13:30:34 INFO - SignatureCalculation completed successfully\n", + "13:30:34 INFO - Starting ClusterAnalysis step\n", + "13:30:34 INFO - Got parameters for ClusterAnalysis\n", + "13:30:34 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/docs_to_remove\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator cluster started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of folders is 14\n", + "13:30:34 INFO - Completed 1 files (7.14%) in 0.0 min\n", + "13:30:34 INFO - Completed 2 files (14.29%) in 0.0 min\n", + "13:30:34 INFO - Completed 3 files (21.43%) in 0.001 min\n", + "13:30:34 INFO - Completed 4 files (28.57%) in 0.001 min\n", + "13:30:34 INFO - Completed 5 files (35.71%) in 0.001 min\n", + "13:30:34 INFO - Completed 6 files (42.86%) in 0.001 min\n", + 
"13:30:34 INFO - Completed 7 files (50.0%) in 0.001 min\n", + "13:30:34 INFO - Completed 8 files (57.14%) in 0.002 min\n", + "13:30:34 INFO - Completed 9 files (64.29%) in 0.002 min\n", + "13:30:34 INFO - Completed 10 files (71.43%) in 0.002 min\n", + "13:30:34 INFO - Completed 11 files (78.57%) in 0.002 min\n", + "13:30:34 INFO - Completed 12 files (85.71%) in 0.002 min\n", + "13:30:34 INFO - Completed 13 files (92.86%) in 0.002 min\n", + "13:30:34 INFO - Completed 14 files (100.0%) in 0.003 min\n", + "13:30:34 INFO - Done processing 14 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.003 min, execution result 0\n", + "13:30:34 INFO - ClusterAnalysis completed successfully\n", + "13:30:34 INFO - Starting GetDuplicateList step\n", + "13:30:34 INFO - Got parameters for GetDuplicateList\n", + "13:30:34 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator fdlist started at 2024-11-26 13:30:34\n", + "13:30:34 INFO - Number of folders is 1\n", + "13:30:34 INFO - Get Duplicate List for folder docs_to_remove\n", + "13:30:34 INFO - 8 documents marked as duplicates\n", + "13:30:34 INFO - Completed 1 files (100.0%) in 0.0 min\n", + "13:30:34 INFO - Done processing 1 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.001 min, execution result 0\n", + "13:30:34 INFO - GetDuplicateList completed successfully\n", + "13:30:34 INFO - Starting DataCleaning step\n", + "13:30:34 INFO - Got parameters for DataCleaning\n", + "13:30:34 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "13:30:34 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "13:30:34 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - pipeline id pipeline_id\n", + "13:30:34 INFO - code location None\n", + "13:30:34 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/python/output/cleaned\n", + "13:30:34 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:34 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:34 INFO - orchestrator fdclean started at 2024-11-26 
13:30:34\n", + "13:30:34 INFO - Number of files is 2, source profile {'max_file_size': 0.0029497146606445312, 'min_file_size': 0.0013322830200195312, 'total_file_size': 0.0042819976806640625}\n", + "13:30:34 INFO - Completed 1 files (50.0%) in 0.0 min\n", + "13:30:34 INFO - Completed 2 files (100.0%) in 0.0 min\n", + "13:30:34 INFO - Done processing 2 files, waiting for flush() completion.\n", + "13:30:34 INFO - done flushing in 0.0 sec\n", + "13:30:34 INFO - Completed execution in 0.0 min, execution result 0\n", + "13:30:34 INFO - DataCleaning completed successfully\n" + ] + } + ], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = ServiceOrchestrator(global_params=args)\n", + "# Launch python fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/cleaned/metadata.json',\n", + " 'python/output/cleaned/data_1',\n", + " 'python/output/cleaned/data_2']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (12, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. 
Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 3 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. 
│\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 5 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ │\n", + "│ 6 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", + "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", + "│ ┆ another pigeon capably pecking at cantaloupe. The lazy llama lightly limped │\n", + "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. 
A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", + "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", + "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", + "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", + "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", + "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", + "│ ┆ another capy capably chewing on cantaloupe │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", + "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 17 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. 
│\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "input_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_1\", \"df1.parquet\"))\n", + "input_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"test-data\", \"input\", \"data_2\", \"df2.parquet\"))\n", + "input_df = input_df_1.vstack(input_df_2)\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (4, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. 
We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "output_df_1 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_1\", \"df1.parquet\"))\n", + "output_df_2 = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"python\", \"output\", \"cleaned\", \"data_2\", \"df2.parquet\"))\n", + "output_df = output_df_1.vstack(output_df_2)\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_ray.ipynb b/transforms/universal/fdedup/fdedup_ray.ipynb new file mode 100644 index 000000000..bb69579a9 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_ray.ipynb @@ -0,0 +1,601 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-26 13:30:56,482\tINFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.\n" + ] + } + ], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_ray import RayServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. 
For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/ray/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/ray/output/ | folder that contains the all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |\n", + "| run_locally:bool | true | if true, launch a ray cluster locally, otherwise connect to an already existing cluster | \n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"ray\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + " # ray configuration parameters\n", + " \"run_locally\": True,\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use ray runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "13:30:57 INFO - Starting SignatureCalculation step\n", + "13:30:57 INFO - Got parameters for SignatureCalculation\n", + "13:30:57 INFO - minhash parameters are : {'document_id_column': 'int_id_column', 'contents_column': 'contents', 'seed': 42, 'num_permutations': 112, 'jaccard_similarity_threshold': 0.75, 'word_shingle_size': 5, 'num_bands': 14, 'num_minhashes_per_band': 8, 'num_segments': 1, 'shingle_option': 'word'}\n", + "13:30:57 INFO - data factory scdata_ is using local configuration without input/output path\n", + "13:30:57 INFO - data factory scdata_ max_files -1, n_sample -1\n", + "13:30:57 INFO - data factory scdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:57 INFO - pipeline id pipeline_id\n", + "13:30:57 INFO - code location None\n", + "13:30:57 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:30:57 INFO - actor creation delay 0\n", + "13:30:57 INFO - job details {'job category': 'preprocessing', 'job name': 'minhash', 
'job type': 'ray', 'job id': 'job_id'}\n", + "13:30:57 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", + "13:30:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:30:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:30:57 INFO - Running locally\n", + "2024-11-26 13:31:08,860\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - orchestrator started at 2024-11-26 13:31:12\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.162438202649355, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:12 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - Completed processing 1 files in 0.002 min\n", + "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Starting flush()\n", + "\u001b[36m(orchestrate pid=86958)\u001b[0m 13:31:14 INFO - done flushing in 0.045 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=86984)\u001b[0m 13:31:14 INFO - Wrote 14 tables with a total size of 80,640 bytes\n", + "13:31:24 INFO - Completed execution in 0.446 min, execution result 0\n", + "13:31:26 INFO - SignatureCalculation completed successfully\n", + "13:31:26 INFO - Starting ClusterAnalysis step\n", + "13:31:26 INFO - Got parameters for ClusterAnalysis\n", + "13:31:26 INFO - cluster parameters are : {'jaccard_similarity_threshold': 0.75, 'num_bands': 14, 'num_segments': 1, 'sort_output': False}\n", + "13:31:26 INFO - pipeline id pipeline_id\n", + "13:31:26 INFO - code location None\n", + "13:31:26 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:26 INFO - actor creation delay 0\n", + "13:31:26 INFO - job details {'job category': 'preprocessing', 'job name': 'cluster', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:26 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/bands output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/docs_to_remove\n", + "13:31:26 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:26 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:26 INFO - Running locally\n", + "2024-11-26 13:31:28,318\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - orchestrator started at 2024-11-26 13:31:31\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of folders is 14\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.77626838721335, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:31 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 2 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 3 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 4 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 5 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 6 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 7 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 8 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 9 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 10 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed 11 files (78.571%) in 0.001 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - Completed processing 14 files in 0.001 min\n", + "\u001b[36m(orchestrate pid=87057)\u001b[0m 13:31:33 INFO - done flushing in 0.001 sec\n", + "13:31:43 INFO - Completed execution in 0.292 min, execution result 0\n", + "13:31:45 INFO - ClusterAnalysis completed successfully\n", + "13:31:45 INFO - Starting GetDuplicateList step\n", + "13:31:45 INFO - Got parameters for GetDuplicateList\n", + "13:31:45 INFO - fdlist parameters are : {'docs_to_remove': 'docs_to_remove', 'consolidated_filename': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'sort_output': False}\n", + "13:31:45 INFO - pipeline id pipeline_id\n", + "13:31:45 INFO - code location None\n", + "13:31:45 INFO - number of workers 1 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:31:45 INFO - actor creation delay 0\n", + "13:31:45 INFO - job details {'job category': 'preprocessing', 'job name': 'fdlist', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:31:45 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output\n", + "13:31:45 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:31:45 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:31:45 INFO - Running locally\n", + "2024-11-26 13:31:47,311\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - orchestrator started at 2024-11-26 13:31:50\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of folders is 1\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.749520111829042, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:50 INFO - Number of workers - 1 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - Completed processing 1 files in 0.0 min\n", + "\u001b[36m(orchestrate pid=87134)\u001b[0m 13:31:52 INFO - done flushing in 0.001 sec\n", + "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - Get Duplicate List for folder docs_to_remove\n", + "\u001b[36m(RayTransformFileProcessor pid=87153)\u001b[0m 13:31:52 INFO - 8 documents marked as duplicates\n", + "13:32:02 INFO - Completed execution in 0.295 min, execution result 0\n", + "13:32:04 INFO - GetDuplicateList completed successfully\n", + "13:32:04 INFO - Starting DataCleaning step\n", + "13:32:04 INFO - Got parameters for DataCleaning\n", + "13:32:04 INFO - fdclean parameters are : {'document_id_column': 'int_id_column', 'duplicate_list_location': 'docs_to_remove_consolidated/docs_to_remove_consolidated.parquet', 'operation_mode': 'filter_duplicates'}\n", + "13:32:04 INFO - data factory dcdata_ is using local configuration without input/output path\n", + "13:32:04 INFO - data factory dcdata_ max_files -1, n_sample -1\n", + "13:32:04 INFO - data factory dcdata_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:04 INFO - pipeline id pipeline_id\n", + "13:32:04 INFO - code location None\n", + "13:32:04 INFO - number of workers 3 worker options {'num_cpus': 0.8, 'max_restarts': -1}\n", + "13:32:04 INFO - actor creation delay 0\n", + "13:32:04 INFO - job details {'job category': 'preprocessing', 'job name': 'fdclean', 'job type': 'ray', 'job id': 'job_id'}\n", + "13:32:04 INFO - data factory data_ is using local data access: input_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/test-data/input output_folder - /Users/touma/data-prep-kit/transforms/universal/fdedup/ray/output/cleaned\n", + "13:32:04 INFO - data factory data_ max_files -1, n_sample -1\n", + "13:32:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "13:32:04 INFO - Running locally\n", + "2024-11-26 13:32:07,526\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - orchestrator started at 2024-11-26 13:32:10\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of files is 1, source profile {'max_file_size': 0.003920555114746094, 'min_file_size': 0.003920555114746094, 'total_file_size': 0.003920555114746094}\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Cluster resources: {'cpus': 12, 'gpus': 0, 'memory': 11.738976669497788, 'object_store': 2.0}\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:10 INFO - Number of workers - 3 with {'num_cpus': 0.8, 'max_restarts': -1} each\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed 0 files (0.0%) in 0.0 min. Waiting for completion\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - Completed processing 1 files in 0.002 min\n", + "\u001b[36m(orchestrate pid=87217)\u001b[0m 13:32:13 INFO - done flushing in 0.003 sec\n", + "13:32:23 INFO - Completed execution in 0.313 min, execution result 0\n", + "13:32:24 INFO - DataCleaning completed successfully\n" + ] + } + ], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = RayServiceOrchestrator(global_params=args)\n", + "# Launch ray fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ray/output/cleaned/metadata.json', 'ray/output/cleaned/df1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"ray/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (12, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. 
│\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. \"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 3 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. 
│\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 5 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ │\n", + "│ 6 ┆ │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 11 ┆ A couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. Yesterday, a pair of │\n", + "│ ┆ capricious pigeons prattled placidly by the cactus, curiously considering │\n", + "│ ┆ another pigeon capably pecking at cantaloupe. 
The lazy llama lightly limped │\n", + "│ ┆ through the lilacs, laboriously longing for a lozenge │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 13 ┆ The lazy llama lightly limped through the lilacs, laboriously longing for a │\n", + "│ ┆ lozenge. A couple of capricious capybaras chatted coolly by the cactus, │\n", + "│ ┆ curiously considering another capy capably chewing on cantaloupe. Yesterday, a │\n", + "│ ┆ pair of capricious pigeons prattled placidly by the cactus, curiously │\n", + "│ ┆ considering another pigeon capably pecking at cantaloupe. │\n", + "│ 14 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously pondering │\n", + "│ ┆ another capy capably chewing on cantaloupe │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 16 ┆ New sheepskin leather coat with natural fur is 50 times warmer. The color is │\n", + "│ ┆ very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. │\n", + "│ 17 ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful minds of our time about │\n", + "│ ┆ what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. 
│\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "shape: (4, 2)\n", + "┌───────────────┬──────────────────────────────────────────────────────────────────────────────────┐\n", + "│ int_id_column ┆ contents │\n", + "│ --- ┆ --- │\n", + "│ i64 ┆ str │\n", + "╞═══════════════╪══════════════════════════════════════════════════════════════════════════════════╡\n", + "│ 1 ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ PR Newswire October 12, 2019 │\n", + "│ ┆ 145-year-old Retailer Anchors Woodland Mall Just Outside Grand Rapids; │\n", + "│ ┆ New Location Continues Strategic National Expansion Plans │\n", + "│ ┆ DAVENPORT, Iowa, Oct. 12, 2019 /PRNewswire/ -- Von Maur Department Stores opened │\n", + "│ ┆ a new store today at Woodland Mall in Kentwood, Mich. The 90,000-square-foot │\n", + "│ ┆ store is the Company's third location in Michigan. │\n", + "│ ┆ Known for its outstanding selection of brand name and specialty apparel, shoes, │\n", + "│ ┆ accessories and gifts, the store features products from leading brands such as │\n", + "│ ┆ Eileen Fisher, Vineyard Vines, Free People, and Kendra Scott, among many others. │\n", + "│ ┆ Von Maur is also widely-regarded for its superior customer service, including an │\n", + "│ ┆ interest-free charge card, accommodating return policy, free gift wrapping and │\n", + "│ ┆ free shipping services. │\n", + "│ ┆ Today's opening continues to build upon the momentum of the family-owned │\n", + "│ ┆ Company's targeted national growth strategy. Von Maur opened its first Wisconsin │\n", + "│ ┆ location in 2017 and a second Minnesota location in 2018, and it has grown in │\n", + "│ ┆ new states beyond its Midwestern footprint, including New York, Alabama and │\n", + "│ ┆ Oklahoma. Additionally, the Company has plans to open its second Wisconsin │\n", + "│ ┆ location in Madison in Fall 2021. │\n", + "│ ┆ \"With its easy accessibility to the larger Grand Rapids area and exceptional │\n", + "│ ┆ collection of shopping, dining and entertainment options, Woodland Mall is a │\n", + "│ ┆ fantastic location for us to continue growing our brand in Michigan,\" said Jim │\n", + "│ ┆ von Maur, president of Von Maur. \"From the moment shoppers walk through our │\n", + "│ ┆ doors, creating an unrivaled shopping experience is the motivation behind │\n", + "│ ┆ everything we do. We look forward to extending our offerings of brand name │\n", + "│ ┆ merchandise and signature customer service to the Grand Rapids area for many │\n", + "│ ┆ years to come.\" │\n", + "│ ┆ \"We are thrilled to welcome Von Maur, known for their high-quality merchandise │\n", + "│ ┆ and exceptional service, as the anchor of the newly developed wing at Woodland │\n", + "│ ┆ Mall,\" said Joe Coradino, CEO of PREIT. 
\"The addition most certainly solidifies │\n", + "│ ┆ Woodland Mall's place as the premier retail and entertainment destination in │\n", + "│ ┆ Grand Rapids, driving its place as a top-performing PREIT property.\" │\n", + "│ ┆ Centrally-located for shoppers from Grand Rapids and the surrounding areas, the │\n", + "│ ┆ new single story Von Maur store features the Company's signature exterior brick │\n", + "│ ┆ façade, open expansive floor plan, and residential ambiance, including music │\n", + "│ ┆ from the store's grand piano. │\n", + "│ ┆ The Woodland Mall store will eventually employ up to 150 associates; the │\n", + "│ ┆ majority of them will be full-time. Von Maur offers above-market wages, │\n", + "│ ┆ excellent benefits and a positive, professional work environment. Hours of │\n", + "│ ┆ operation are Monday to Saturday, 10 a.m. – 9 p.m. ET, and Sunday, 12 p.m. – 6 │\n", + "│ ┆ p.m. ET. │\n", + "│ ┆ About Von Maur │\n", + "│ ┆ Von Maur was founded 145 years ago in downtown Davenport, Iowa. The Company │\n", + "│ ┆ currently operates 35 stores in 15 states, along with a 120,000 square foot │\n", + "│ ┆ E-Commerce facility that drives its successful online business at vonmaur.com. │\n", + "│ ┆ Courtney Smith │\n", + "│ ┆ courtney@reputationpartners.com │\n", + "│ ┆ View original content:http://www.prnewswire.com/news-releases/von-maur-departmen │\n", + "│ ┆ t-store-opens-third-location-in-michigan-300937186.html │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ 4 ┆ │\n", + "│ ┆ The Genius Life │\n", + "│ ┆ Max Lugavere │\n", + "│ ┆ You don't have to be born a Genius to become one. Follow health and science │\n", + "│ ┆ journalist, New York Times bestselling author, TV personality and nutrition │\n", + "│ ┆ expert Max Lugavere as he speaks to the most insightful │\n", + "│ ┆ minds of our time about what it means to live like a Genius. │\n", + "│ ┆ 35: How Wheat, Carbs, and Sugar Can Harm Your Brain | David Perlmutter, MD │\n", + "│ ┆ David Perlmutter, MD is a board-certified neurologist, Fellow of the American │\n", + "│ ┆ College of Nutrition, and the New York Times best-selling author of Brain Maker │\n", + "│ ┆ and Grain Brain, now updated with the latest nutritional and neurological │\n", + "│ ┆ science. │\n", + "│ ┆ Von Maur Department Store Opens Third Location in Michigan │\n", + "│ ┆ Zuckerberg on Libra drop outs: 'It's a risky project' │\n", + "│ ┆ │\n", + "│ 12 ┆ Yesterday, a pair of capricious pigeons prattled placidly by the cactus, │\n", + "│ ┆ curiously considering another pigeon capably pecking at cantaloupe. The lazy │\n", + "│ ┆ llama lightly limped through the lilacs, laboriously longing for a lozenge. A │\n", + "│ ┆ couple of capricious capybaras chatted coolly by the cactus, curiously │\n", + "│ ┆ considering another capy capably chewing on cantaloupe. │\n", + "│ 15 ┆ The new sheepskin leather coat with natural fur is 46-48 times warmer. The color │\n", + "│ ┆ is very beautiful bright green looks very beautiful. Purchased by the shopping │\n", + "│ ┆ center Dubrovka 19 000 now in the store the price is 22000-24000 call any time. 
│\n", + "└───────────────┴──────────────────────────────────────────────────────────────────────────────────┘\n" + ] + } + ], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"ray\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c11d3a4b-8ef9-417d-a8a2-f688db067a52", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/fdedup_spark.ipynb b/transforms/universal/fdedup/fdedup_spark.ipynb new file mode 100644 index 000000000..9f4bf1772 --- /dev/null +++ b/transforms/universal/fdedup/fdedup_spark.ipynb @@ -0,0 +1,212 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.utils import ParamsUtils\n", + "from fdedup_transform_python import parse_args\n", + "from fdedup_transform_spark import SparkServiceOrchestrator" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform\n", + "We will only provide a description for the parameters used in this example. 
For a complete list of parameters, please refer to the README.md for this transform:\n", + "|parameter:type | value | description |\n", + "|-|-|-|\n", + "| input_folder:str | \\${PWD}/ray/test-data/input/ | folder that contains the input parquet files for the fuzzy dedup algorithm |\n", + "| output_folder:str | \\${PWD}/ray/output/ | folder that contains the all the intermediate results and the output parquet files for the fuzzy dedup algorithm |\n", + "| contents_column:str | contents | name of the column that stores document text |\n", + "| document_id_column:str | int_id_column | name of the column that stores document ID |\n", + "| num_permutations:int | 112 | number of permutations to use for minhash calculation |\n", + "| num_bands:int | 14 | number of bands to use for band hash calculation |\n", + "| num_minhashes_per_band | 8 | number of minhashes to use in each band |\n", + "| operation_mode:{filter_duplicates,filter_non_duplicates,annotate} | filter_duplicates | operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents |" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\")\n", + "output_folder = os.path.join(os.path.abspath(\"\"), \"spark\", \"output\")\n", + "params = {\n", + " # transform configuration parameters\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + " \"contents_column\": \"contents\",\n", + " \"document_id_column\": \"int_id_column\",\n", + " \"num_permutations\": 112,\n", + " \"num_bands\": 14,\n", + " \"num_minhashes_per_band\": 8,\n", + " \"operation_mode\": \"filter_duplicates\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use spark runtime to invoke each transform in the fuzzy dedup pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "args = parse_args()\n", + "# Initialize the orchestrator\n", + "orchestrator = SparkServiceOrchestrator(global_params=args)\n", + "# Launch spark fuzzy dedup execution\n", + "orchestrator.orchestrate()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "glob.glob(\"spark/output/cleaned/*\")" + ] + }, + { + "cell_type": "markdown", + "id": "d30489d9-fc98-423e-90a8-e8f372787e88", + "metadata": {}, + "source": [ + "***** print the input data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b22234f-f7a1-4b92-b2ac-376b2545abce", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "input_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"test-data\", \"input\", \"df1.parquet\"))\n", + "\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(input_df)" + ] + }, + { + "cell_type": "markdown", + "id": "5305d127-10fd-4fa6-97a6-ac47db2bdc7e", + "metadata": {}, + "source": [ + "***** print the output result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b2eddb9-4fb6-41eb-916c-3741b9129f2c", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "output_df = pl.read_parquet(os.path.join(os.path.abspath(\"\"), \"spark\", \"output\", \"cleaned\", \"df1.parquet\"))\n", + "with pl.Config(fmt_str_lengths=10000000, tbl_rows=-1):\n", + " print(output_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d60e391d-cf58-47ae-9991-04c05d114edc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "fdedup_spark", + "language": "python", + "name": "fdedup_spark" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/kfp_ray/README.md b/transforms/universal/fdedup/kfp_ray/README.md index 97fd45a69..75eb77a08 100644 --- a/transforms/universal/fdedup/kfp_ray/README.md +++ b/transforms/universal/fdedup/kfp_ray/README.md @@ -1,8 +1,8 @@ -# Fuzzy Deduplication Ray-base KubeFlow Pipeline Transformation +# Fuzzy Deduplication Ray-based KubeFlow Pipeline Transformation ## Summary -This project allows execution of the [noop Ray transform](../ray) as a +This project allows execution of the [fuzzy dedup Ray transform](../ray) as a [KubeFlow Pipeline](https://www.kubeflow.org/docs/components/pipelines/overview/) The detail pipeline is presented in the [Simplest Transform pipeline tutorial](../../../../kfp/doc/simple_transform_pipeline.md) @@ -16,13 +16,9 @@ make workflow-build from the directory. It creates a virtual environment (make workflow-venv) and after that compiles the pipeline definitions in the folder. The virtual environment is created once for all transformers. -Note: the pipelines definitions can be compiled and executed on KFPv1 and KFPv2. Meantime, KFPv1 is our default. If you -prefer KFPv2, please do the following: -```shell -make clean -export KFPv2=1 -make workflow-build -``` +## Considerations +Currently, fuzzy dedup KFP pipeline definitions can be compiled and executed on KFPv1. 
KFPv2 is not +supported currently, because of this issue: https://github.com/kubeflow/pipelines/issues/10914 The next steps are described in [Deploying a pipeline](../../../../kfp/doc/simple_transform_pipeline.md#deploying-a-pipeline-) and [Executing pipeline and watching execution results](../../../../kfp/doc/simple_transform_pipeline.md#executing-pipeline-and-watching-execution-results-) \ No newline at end of file diff --git a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py index 3156ab6f1..ffc6f79bc 100644 --- a/transforms/universal/fdedup/kfp_ray/fdedup_wf.py +++ b/transforms/universal/fdedup/kfp_ray/fdedup_wf.py @@ -14,14 +14,24 @@ import kfp.compiler as compiler import kfp.components as comp import kfp.dsl as dsl -from src.fdedup_compute_execution_params import fdedup_compute_execution_params +from src.fdedup_compute_execution_params import ( + cluster_analysis_compute_execution_params, + compute_common_params, + data_cleaning_compute_execution_params, + get_duplicate_list_compute_execution_params, + signature_calc_compute_execution_params, +) from workflow_support.compile_utils import ONE_HOUR_SEC, ONE_WEEK_SEC, ComponentUtils -task_image = "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest" +task_image = os.getenv("FDEDUP_IMAGE_LOCATION", "quay.io/dataprep1/data-prep-kit/fdedup-ray:latest") +image_pull_secret = os.getenv("FDEDUP_IMAGE_PULL_SECRET", "my_secret") # the name of the job script -EXEC_SCRIPT_NAME: str = "fdedup_transform_ray.py" +SIGNATURE_CALC_EXEC_SCRIPT_NAME: str = "signature_calc_transform_ray.py" +CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME: str = "cluster_analysis_transform_ray.py" +GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME: str = "get_duplicate_list_transform_ray.py" +DATA_CLEANING_EXEC_SCRIPT_NAME: str = "data_cleaning_transform_ray.py" # components base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest" @@ -40,8 +50,18 @@ # compilation time. 
import uuid - compute_exec_params_op = dsl.component_decorator.component( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = dsl.component_decorator.component(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = dsl.component_decorator.component( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = dsl.component_decorator.component( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = dsl.component_decorator.component( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = dsl.component_decorator.component( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) print( "WARNING: the ray cluster name can be non-unique at runtime, please do not execute simultaneous Runs of the " @@ -49,61 +69,92 @@ ) run_id = uuid.uuid4().hex else: - compute_exec_params_op = comp.create_component_from_func( - func=fdedup_compute_execution_params, base_image=base_kfp_image + compute_common_params_op = comp.create_component_from_func(func=compute_common_params, base_image=base_kfp_image) + compute_signature_calc_exec_params_op = comp.create_component_from_func( + func=signature_calc_compute_execution_params, base_image=base_kfp_image + ) + compute_cluster_analysis_exec_params_op = comp.create_component_from_func( + func=cluster_analysis_compute_execution_params, base_image=base_kfp_image + ) + compute_get_duplicate_list_exec_params_op = comp.create_component_from_func( + func=get_duplicate_list_compute_execution_params, base_image=base_kfp_image + ) + compute_data_cleaning_exec_params_op = comp.create_component_from_func( + func=data_cleaning_compute_execution_params, base_image=base_kfp_image ) run_id = dsl.RUN_ID_PLACEHOLDER # create Ray cluster create_ray_op = comp.load_component_from_file(component_spec_path + "createRayClusterComponent.yaml") -# execute job -execute_ray_jobs_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute signature calculation job +execute_signature_calc_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) +# execute cluster analysis job +execute_cluster_analysis_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute get duplicate list job +execute_get_duplicate_list_job_op = comp.load_component_from_file(component_spec_path + "executeRayJobComponent.yaml") +# execute data cleaning job +execute_data_cleaning_job_op = comp.load_component_from_file( + component_spec_path + "executeRayJobComponent_multi_s3.yaml" +) # clean up Ray cleanup_ray_op = comp.load_component_from_file(component_spec_path + "deleteRayClusterComponent.yaml") # Task name is part of the pipeline name, the ray cluster name and the job name in DMF. 
-TASK_NAME: str = "fdedup" +TASK_NAME: str = "fuzzydedup" @dsl.pipeline( name=TASK_NAME + "-ray-pipeline", - description="Pipeline for fdedup", + description="Pipeline for fuzzy dedup", ) -def fdedup( +def fuzzydedup( + # folders used # Ray cluster - ray_name: str = "fdedup-kfp-ray", # name of Ray cluster + ray_name: str = "fuzzydedup-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed - ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_head_options: dict = { + "cpu": 8, + "memory": 64, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, + ray_worker_options: dict = { + "replicas": 10, + "max_replicas": 10, + "min_replicas": 10, + "cpu": 16, + "memory": 128, + "image": task_image, + "image_pull_secret": image_pull_secret, + "imagePullPolicy": "Always", + }, + runtime_actor_options: dict = {"num_cpus": 0.8, "memory": 16}, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access. checkpointing is not supported by dedup - data_s3_config: str = "{'input_folder': 'test/fdedup/input/', 'output_folder': 'test/fdedup/output/'}", - data_s3_access_secret: str = "s3-secret", + data_s3_config: str = "{'input_folder': 's3://cos-llm-pile-south/spark_test/fd_xs_dataset_test/', 'output_folder': 's3://cos-llm-pile-south/spark_test/fuzzy_dedup_test_output_data/kfp_test_1/'}", + data_s3_access_secret: str = "s3-south-secret", + scdata_s3_access_secret: str = "s3-south-secret", + dcdata_s3_access_secret: str = "s3-south-secret", data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {"num_cpus": 0.7}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # columns used - fdedup_doc_column: str = "contents", - fdedup_id_column: str = "int_id_column", - fdedup_cluster_column: str = "cluster", - # infrastructure - fdedup_bucket_cpu: float = 0.5, - fdedup_doc_cpu: float = 0.5, - fdedup_mhash_cpu: float = 0.5, + fdedup_contents_column: str = "contents", + fdedup_document_id_column: str = "int_id_column", # fuzzy parameters - fdedup_num_permutations: int = 64, - fdedup_threshold: float = 0.8, - fdedup_shingles_size: int = 5, - fdedup_delimiters: str = " ", - # Random delay between reads - fdedup_random_delay_limit: int = 5, - # snapshotting - fdedup_snapshot_delay: int = 1, - fdedup_use_doc_snapshot: bool = False, - fdedup_use_bucket_snapshot: bool = False, + fdedup_num_permutations: int = 112, + fdedup_num_bands: int = 14, + fdedup_num_minhashes_per_band: int = 8, + fdedup_word_shingle_size: int = 5, + fdedup_shingle_option: str = "word", + fdedup_jaccard_similarity_threshold: float = 0.75, + fdedup_seed: int = 42, + fdedup_operation_mode: str = "annotate", # data sampling fdedup_n_samples: int = 10, # additional parameters @@ -136,63 +187,47 @@ def fdedup( wait_print_tmout - time between prints, sec http_retries - http retries for API server calls :param data_s3_access_secret - s3 access secret + :param scdata_s3_access_secret - signature calculation s3 access secret + :param dcdata_s3_access_secret - data cleaning s3 access secret :param data_s3_config - s3 configuration :param data_max_files - max 
files to process :param data_num_samples - num samples to process - :param runtime_actor_options - actor options :param runtime_pipeline_id - pipeline id :param runtime_code_location - code location - :param fdedup_doc_column - document column name - :param fdedup_id_column - integer document id column name - :param fdedup_cluster_column - cluster column name - :param fdedup_bucket_cpu - number of CPUs per bucket hash - :param fdedup_doc_cpu - number of CPUs per doc hash - :param fdedup_mhash_cpu - number of CPUs per minhash hash + :param fdedup_contents_column - document column name + :param fdedup_document_id_column - integer document id column name :param fdedup_num_permutations - number of permutations - :param fdedup_threshold - threshold - :param fdedup_shingles_size - number of words in shingle - :param fdedup_delimiters - delimiter for splitting document - :param fdedup_random_delay_limit - delay between reads to reduce S3 load. - A random number between 0 and random_delay_limit is used - :param fdedup_snapshot_delay - delay between restoring individual actors - :param fdedup_use_bucket_snapshot - flag to skip buckets building and start from existing snapshots - :param fdedup_use_doc_snapshot - flag to skip documents building and start from existing snapshots + :param fdedup_num_bands - number of bands + :param fdedup_num_minhashes_per_band - length of a band + :param fdedup_word_shingle_size - length of word shingles + :param fdedup_shingle_option - type of shingle, one of 'word', or 'char' + :param fdedup_jaccard_similarity_threshold - similarity threshold + :param fdedup_seed - seed for the random number generator + :param fdedup_operation_mode - data cleaning mode, one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate' :param fdedup_n_samples - number of samples for parameters computation :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): # compute execution params - compute_exec_params = compute_exec_params_op( + compute_common_exec_params = compute_common_params_op( worker_options=ray_worker_options, actor_options=runtime_actor_options, data_s3_config=data_s3_config, - data_max_files=data_max_files, - data_num_samples=data_num_samples, - runtime_pipeline_id=runtime_pipeline_id, - runtime_job_id=run_id, - runtime_code_location=runtime_code_location, - doc_column=fdedup_doc_column, - id_column=fdedup_id_column, - cluster_column=fdedup_cluster_column, - bucket_cpu=fdedup_bucket_cpu, - doc_cpu=fdedup_doc_cpu, - mhash_cpu=fdedup_mhash_cpu, num_permutations=fdedup_num_permutations, - threshold=fdedup_threshold, - shingles_size=fdedup_shingles_size, - delimiters=fdedup_delimiters, - random_delay_limit=fdedup_random_delay_limit, - snapshot_delay=fdedup_snapshot_delay, - use_doc_snapshot=fdedup_use_doc_snapshot, - use_bucket_snapshot=fdedup_use_bucket_snapshot, n_samples=fdedup_n_samples, ) - ComponentUtils.add_settings_to_component(compute_exec_params, ONE_HOUR_SEC * 2) - ComponentUtils.set_s3_env_vars_to_component(compute_exec_params, data_s3_access_secret) + ComponentUtils.add_settings_to_component(compute_common_exec_params, ONE_HOUR_SEC * 2) + 
ComponentUtils.set_s3_env_vars_to_component(compute_common_exec_params, data_s3_access_secret) + fdedup_num_segments = compute_common_exec_params.outputs["num_segments"] + runtime_num_actors = compute_common_exec_params.outputs["num_actors"] + runtime_actor_cpus = compute_common_exec_params.outputs["actor_cpu"] + runtime_actor_memory = compute_common_exec_params.outputs["actor_memory"] # start Ray cluster ray_cluster = create_ray_op( @@ -204,21 +239,148 @@ def fdedup( additional_params=additional_params, ) ComponentUtils.add_settings_to_component(ray_cluster, ONE_HOUR_SEC * 2) - ray_cluster.after(compute_exec_params) + ray_cluster.after(compute_common_exec_params) + + # Get the parameters for the signature calculation job + compute_signature_calc_exec_params = compute_signature_calc_exec_params_op( + runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + doc_column=fdedup_contents_column, + id_column=fdedup_document_id_column, + num_permutations=fdedup_num_permutations, + num_bands=fdedup_num_bands, + num_minhashes_per_band=fdedup_num_minhashes_per_band, + word_shingle_size=fdedup_word_shingle_size, + shingle_option=fdedup_shingle_option, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + seed=fdedup_seed, + ) + ComponentUtils.add_settings_to_component(compute_signature_calc_exec_params, ONE_HOUR_SEC * 2) + compute_signature_calc_exec_params.after(ray_cluster) + + # Execute signature calculation job + execute_signature_calc_job = execute_signature_calc_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_signature_calc_exec_params.output, + exec_script_name=SIGNATURE_CALC_EXEC_SCRIPT_NAME, + server_url=server_url, + prefix="scdata", + ) + ComponentUtils.add_settings_to_component(execute_signature_calc_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_signature_calc_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_signature_calc_job, scdata_s3_access_secret, prefix="scdata" + ) + execute_signature_calc_job.after(compute_signature_calc_exec_params) + + # Get the parameters for the cluster analysis job + compute_cluster_analysis_exec_params = compute_cluster_analysis_exec_params_op( + runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + num_bands=fdedup_num_bands, + threshold=fdedup_jaccard_similarity_threshold, + num_segments=fdedup_num_segments, + ) + ComponentUtils.add_settings_to_component(compute_cluster_analysis_exec_params, ONE_HOUR_SEC * 2) + compute_cluster_analysis_exec_params.after(execute_signature_calc_job) + # Execute job + execute_cluster_analysis_job = execute_cluster_analysis_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_cluster_analysis_exec_params.output, + 
exec_script_name=CLUSTER_ANALYSIS_EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_cluster_analysis_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_cluster_analysis_job, data_s3_access_secret) + execute_cluster_analysis_job.after(compute_cluster_analysis_exec_params) + + compute_get_duplicate_list_exec_params = compute_get_duplicate_list_exec_params_op( + runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + ) + ComponentUtils.add_settings_to_component(compute_get_duplicate_list_exec_params, ONE_HOUR_SEC * 2) + compute_get_duplicate_list_exec_params.after(execute_cluster_analysis_job) + # Execute job + execute_get_duplicate_list_job = execute_get_duplicate_list_job_op( + ray_name=ray_name, + run_id=run_id, + additional_params=additional_params, + exec_params=compute_get_duplicate_list_exec_params.output, + exec_script_name=GET_DUPLICATE_LIST_EXEC_SCRIPT_NAME, + server_url=server_url, + ) + ComponentUtils.add_settings_to_component(execute_get_duplicate_list_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_get_duplicate_list_job, data_s3_access_secret) + execute_get_duplicate_list_job.after(compute_get_duplicate_list_exec_params) + + compute_data_cleaning_exec_params = compute_data_cleaning_exec_params_op( + runtime_num_actors=runtime_num_actors, + runtime_actor_cpus=runtime_actor_cpus, + runtime_actor_memory=runtime_actor_memory, + data_s3_config=data_s3_config, + data_max_files=data_max_files, + data_num_samples=data_num_samples, + runtime_pipeline_id=runtime_pipeline_id, + runtime_job_id=run_id, + runtime_code_location=runtime_code_location, + id_column=fdedup_document_id_column, + operation_mode=fdedup_operation_mode, + ) + ComponentUtils.add_settings_to_component(compute_data_cleaning_exec_params, ONE_HOUR_SEC * 2) + compute_data_cleaning_exec_params.after(execute_get_duplicate_list_job) + # Execute job - execute_job = execute_ray_jobs_op( + execute_data_cleaning_job = execute_data_cleaning_job_op( ray_name=ray_name, run_id=run_id, additional_params=additional_params, - exec_params=compute_exec_params.output, - exec_script_name=EXEC_SCRIPT_NAME, + exec_params=compute_data_cleaning_exec_params.output, + exec_script_name=DATA_CLEANING_EXEC_SCRIPT_NAME, server_url=server_url, + prefix="dcdata", ) - ComponentUtils.add_settings_to_component(execute_job, ONE_WEEK_SEC) - ComponentUtils.set_s3_env_vars_to_component(execute_job, data_s3_access_secret) - execute_job.after(ray_cluster) + ComponentUtils.add_settings_to_component(execute_data_cleaning_job, ONE_WEEK_SEC) + # FIXME: see https://github.com/kubeflow/pipelines/issues/10914 + if os.getenv("KFPv2", "0") != "1": + ComponentUtils.set_s3_env_vars_to_component(execute_data_cleaning_job, data_s3_access_secret) + ComponentUtils.set_s3_env_vars_to_component( + execute_data_cleaning_job, dcdata_s3_access_secret, prefix="dcdata" + ) + execute_data_cleaning_job.after(compute_data_cleaning_exec_params) if __name__ == "__main__": # Compiling the pipeline - 
compiler.Compiler().compile(fdedup, __file__.replace(".py", ".yaml")) + compiler.Compiler().compile(fuzzydedup, __file__.replace(".py", ".yaml")) diff --git a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py index 726200339..15722c164 100644 --- a/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py +++ b/transforms/universal/fdedup/kfp_ray/src/fdedup_compute_execution_params.py @@ -10,10 +10,87 @@ # limitations under the License. ################################################################################ +from typing import Any, NamedTuple -def fdedup_compute_execution_params( + +def compute_common_params( worker_options: dict, # ray worker configuration - actor_options: dict, # actor's resource requirements + actor_options: dict, # actor desired configuration + data_s3_config: str, # S3 configuration + num_permutations: int, # number of permutations (minhashes) per document + n_samples: int, # files to sample for number of documents estimation +) -> NamedTuple( + "fdedup_params", [("num_segments", int), ("num_actors", str), ("actor_cpu", float), ("actor_memory", int)] +): + """ + Compute fuzzy dedup execution parameters common to all the transforms + :param worker_options: worker group configuration + :param actor_options: desired actor configuration + :param data_s3_config: s3 configuration + :param num_permutations: number of permutations + :param n_samples: number of samples used to estimate the total number of documents in the dataset + :return: fdedup_params NamedTuple: num_segments - int, num_actors - str, cpus (float) and memory (int) per actor + """ + + import sys + + from data_processing.data_access import DataAccessS3 + from data_processing.utils import GB + from runtime_utils import KFPUtils + + # get credentials + s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() + s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} + s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) + # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly + data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) + # sample input data + sampling: dict[str, Any] + sampling, _ = data_access.sample_input_data(n_samples=n_samples) + number_of_docs = int(sampling.get("estimated number of docs")) + if number_of_docs == 0: + print(f"Estimated number of documents and documents size is zero. Please verify the input path.") + sys.exit(1) + print(f"Estimated number of docs: {number_of_docs}") + actor_cpu: float = actor_options.get("num_cpus", 1) # if num_cpus not specified, request 1 CPU per actor + actor_memory: int = int(actor_options.get("memory", 16)) * GB # if memory not specified, request 16 GB per actor + # Calculate the number of segments + # Assume each document takes doc_bytes = (8 + num_permutations * 4 + 20) bytes, where: + # 8 bytes are taken by the band hash + # (num_permutations * 4) bytes are taken by the min hashes + # 20 bytes to provide some extra space for storage in a table + # The total amount of space needed by a band is number_of_docs * doc_bytes. 
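+ # Illustrative (assumed) numbers: with num_permutations = 112, doc_bytes = 8 + 112 * 4 + 20 = 476,
+ # so an estimated 1,000,000 documents give band_bytes of roughly 476 MB per band; with the default
+ # 16 GB of actor memory, the 1/6 rule below then yields num_segments = 1.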
+ # To scale band handling, divide each band into segments, each smaller than 1/6 of an actor's allocated memory + doc_bytes = 8 + num_permutations * 4 + 20 + band_bytes = number_of_docs * doc_bytes + num_segments = 1 + (band_bytes // (actor_memory // 6)) + print(f"Number of segments: {num_segments}") + + # Calculate number of actors, using KFPUtils.default_compute_execution_params() + # Create new dict with memory expressed in bytes, as expected by KFPUtils.default_compute_execution_params() + actor_config = { + "num_cpus": actor_cpu, + "memory": actor_memory, + } + num_actors = KFPUtils.default_compute_execution_params(str(worker_options), str(actor_config)) + + print(f"num_actors = {num_actors}") + from collections import namedtuple + + fdedup_params = namedtuple( + typename="fdedup_params", + field_names=["num_segments", "num_actors", "actor_cpu", "actor_memory"], + ) + print( + f"num_segments = {num_segments}, num_actors = {num_actors}, actor_cpu = {actor_cpu}, actor_memory = {actor_memory}" + ) + return fdedup_params(num_segments, num_actors, actor_cpu, actor_memory) + + +def signature_calc_compute_execution_params( + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor data_s3_config: str, # s3 configuration data_max_files: int, # max files to process data_num_samples: int, # num samples to process @@ -22,27 +99,21 @@ def fdedup_compute_execution_params( runtime_code_location: dict, # code location doc_column: str, # document column name id_column: str, # integer document id column name - cluster_column: str, # cluster column name - bucket_cpu: float, # number of CPUs per bucket hash - doc_cpu: float, # number of CPUs per doc hash - mhash_cpu: float, # number of CPUs per minhash hash num_permutations: int, # number of permutations + num_bands: int, # number of bands + num_minhashes_per_band: int, # band length + word_shingle_size: int, # number of words in shingle + shingle_option: str, # type of shingle, one of 'word' or 'char' threshold: float, # threshold, - shingles_size: int, # number of words in shingle - delimiters: str, # delimiter for splitting document - random_delay_limit: int, # delay between reads to reduce S3 load. 
- # A random number between 0 and random_delay_limit is used - snapshot_delay: int, # delay between restoring individual actors - use_doc_snapshot: bool, # flag to skip documents building and start from existing snapshots - use_bucket_snapshot: bool, # flag to skip buckets building and start from existing snapshots - n_samples: int, # number of samples to use -) -> dict: # NamedTuple( - # "Output", [("workers", int), ("preprocessors", int), ("docs", int), ("buckets", int), ("min_hashes", int)] + num_segments: int, # number of segments + seed: int, # seed for the random number generator +) -> dict: """ - Compute fuzzy dedup execution parameters - :param worker_options: cluster parameters - :param actor_options: actor request requirements + Compute fuzzy dedup execution parameters for signature calculation + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor :param data_s3_config: s3 configuration :param data_max_files: max files to process :param data_num_samples: num samples to process @@ -51,182 +122,205 @@ def fdedup_compute_execution_params( :param runtime_code_location: code location :param doc_column: document column name :param id_column: integer document id column name - :param cluster_column: cluster column name - :param bucket_cpu: number of CPUs per bucket hash - :param doc_cpu: number of CPUs per doc hash - :param mhash_cpu: number of CPUs per minhash hash :param num_permutations: number of permutations + :param num_bands: number of bands + :param num_minhashes_per_band: band length + :param word_shingle_size: number of words/chars in shingle + :param shingle_option: str: type of shingle, one of 'word' or 'char' + :param threshold: threshold, + :param num_segments: number of segments + :param seed: seed for the random number generator + :return: dictionary with Ray Job execution parameters + """ + + # fuzzy parameters for signature calculation + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "minhash_contents_column": doc_column, + "minhash_document_id_column": id_column, + "minhash_num_permutations": num_permutations, + "minhash_num_bands": num_bands, + "minhash_num_minhashes_per_band": num_minhashes_per_band, + "minhash_word_shingle_size": word_shingle_size, + "minhash_shingle_option": shingle_option, + "minhash_jaccard_similarity_threshold": threshold, + "minhash_num_segments": num_segments, + "minhash_seed": seed, + "scdata_s3_config": data_s3_config, + } + + +def cluster_analysis_compute_execution_params( + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + num_bands: 
int, # number of bands + threshold: float, # threshold, + num_segments: int, # number of segments +) -> dict: + + """ + Compute fuzzy dedup execution parameters for cluster analysis + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param num_bands: number of bands :param threshold: threshold, - :param shingles_size: number of words in shingle - :param delimiters: delimiter for splitting document - :param random_delay_limit: # delay between reads to reduce S3 load. A random number between 0 and random_delay_limit is used - :param snapshot_delay: delay between restoring individual actors - :param use_doc_snapshot: flag to skip documents building and start from existing snapshots - :param use_bucket_snapshot: flag to skip buckets building and start from existing snapshots - :param n_samples: number of samples to use + :param num_segments: number of segments :return: a dictionary with a Ray Job execution parameters """ - import math - import sys + import json + import os - from data_processing.data_access import DataAccessS3 - from data_processing.utils import GB, KB - from runtime_utils import KFPUtils - from scipy.integrate import quad as integrate - - EXECUTION_OF_KB_DOC = 0.003 - - def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, - ) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = os.path.join(base_folder, "bands") + data_s3_config_dict["output_folder"] = os.path.join(base_folder, "docs_to_remove") + data_s3_config = 
json.dumps(data_s3_config_dict).replace('"', "'") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "cluster_num_bands": num_bands, + "cluster_jaccard_similarity_threshold": threshold, + "cluster_num_segments": num_segments, + } + + +def get_duplicate_list_compute_execution_params( + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + runtime_actor_memory: int, # memory (in bytes) needed by each actor + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location +) -> dict: + """ + Compute fuzzy dedup execution parameters for get duplicate list step + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os # fuzzy parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=threshold, - num_perm=num_permutations, - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - print(f"Fuzzy parameters: num buckets {num_buckets}, bucket length {length_bucket}") + duplicate_docids_folder: str = "docs_to_remove" + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") # Get cluster parameters - cluster_cpu = worker_options["replicas"] * worker_options["cpu"] - cluster_memory = worker_options["replicas"] * worker_options["memory"] - print(f"Cluster available CPUs {cluster_cpu}, Memory {cluster_memory}") - cluster_cpu -= 1 - cluster_memory *= 0.85 - # get actor requirements - actor_cpu = actor_options["num_cpus"] - print(f"actor required cpu {actor_cpu}") - # get credentials - s3_key, s3_secret, s3_endpoint = KFPUtils.credentials() - s3_creds = {"access_key": s3_key, "secret_key": s3_secret, "url": s3_endpoint} - s3_config = KFPUtils.load_from_json(data_s3_config.replace("'", '"')) - if type(s3_config) is list: - # S3 config is list. 
take the first element - s3_config = s3_config[0] - # because S3 is the only viable version for kfp-based implementation, we are here creating DataAccess S3 directly - data_access = DataAccessS3(s3_credentials=s3_creds, s3_config=s3_config, d_sets=None, checkpoint=False, m_files=-1) - # sample input data - sampling, _ = data_access.sample_input_data(n_samples=n_samples) - avg_doc_size = sampling.get("average doc size KB") - number_of_docs = sampling.get("estimated number of docs") - avg_table_size = sampling.get("average table size MB") / KB - if number_of_docs == 0: - print(f"Estimated number of documents and documents size is zero. Please verify the input path.") - sys.exit(1) - # we are creating more buckets actors, so that we get better parallelization for bucket processing - b_actors = math.ceil(num_buckets * number_of_docs * 64 * 1.1 / GB) - d_actors = math.ceil(number_of_docs * 48 * 1.1 / GB) - m_actors = math.ceil(number_of_docs * 128 * 1.1 / GB) - # compute cpu requirements - # Define number of preprocessors. We are assuming that preprocessors and workers are using the same amount - # of CPUs - n_preprocessors = int( - (0.85 * cluster_cpu - b_actors * bucket_cpu - m_actors * mhash_cpu - d_actors * doc_cpu) / actor_cpu - ) - if n_preprocessors <= 0: - print(f"Not enough CPUs to run fuzzy de duping, computed number of workers is {n_preprocessors}") - print(f"Required bucket actors {b_actors}, minhash actors {m_actors}, document actors {d_actors}") - print("Try to increase the size of the cluster") - sys.exit(1) - # compute the amount of workers - n_workers = int((0.85 * cluster_cpu - d_actors * doc_cpu) / actor_cpu) - # Ensure that we do not overwhelm S3 - if n_workers > 2000: - n_workers = 2000 - print( - f"Number of preprocessors: {n_preprocessors}, Number of workers: {n_workers}, bucket actors {b_actors}, " - f"minhash actors {m_actors}, document actors {d_actors}" - ) + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + data_s3_config_dict["input_folder"] = base_folder + data_s3_config_dict["output_folder"] = base_folder + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} + return { + "data_s3_config": data_s3_config, + "data_max_files": data_max_files, + "data_num_samples": data_num_samples, + "runtime_num_workers": runtime_num_actors, + "runtime_worker_options": str(actor_options), + "runtime_pipeline_id": runtime_pipeline_id, + "runtime_job_id": runtime_job_id, + "runtime_code_location": str(runtime_code_location), + "fdlist_docs_to_remove": duplicate_docids_folder, + "fdlist_consolidated_filename": duplicate_list_location, + } - # Make sure that we have enough memory - r_mem = avg_table_size * 4 * n_preprocessors + 2 * (b_actors + m_actors + d_actors) - print(f"Required execution memory {r_mem} GB") - if r_mem > cluster_memory: - print(f"Not enough memory to run de duping, required {r_mem}, available {cluster_memory}") - print(f"Try to increase the size of the cluster or increase size of the cpu per worker (current {actor_cpu})") - sys.exit(1) - print( - f"Required cpu : " - f"{b_actors * bucket_cpu + m_actors * mhash_cpu + d_actors * doc_cpu + n_workers * actor_cpu}" - ) +def data_cleaning_compute_execution_params( + runtime_num_actors: str, # number of actors computed by KFPUtils.default_compute_execution_params() + runtime_actor_cpus: float, # number of CPUS needed for each actor + 
runtime_actor_memory: int, # memory (in bytes) needed by each actor + data_s3_config: str, # s3 configuration + data_max_files: int, # max files to process + data_num_samples: int, # num samples to process + runtime_pipeline_id: str, # pipeline id + runtime_job_id: str, # job id + runtime_code_location: dict, # code location + id_column: str, # integer document id column name + operation_mode: str, # filter (non-)duplicates or annotate +) -> dict: + """ + Compute fuzzy dedup execution parameters + :param runtime_num_actors: number of actors computed by KFPUtils.default_compute_execution_params() + :param runtime_actor_cpus: number of CPUS needed for each actor + :param runtime_actor_memory: memory (in bytes) needed by each actor + :param data_s3_config: s3 configuration + :param data_max_files: max files to process + :param data_num_samples: num samples to process + :param runtime_pipeline_id: pipeline id + :param runtime_job_id: job id + :param runtime_code_location: code location + :param id_column: integer document id column name + :param operation_mode: filter (non-)duplicates or annotate + :return: a dictionary with a Ray Job execution parameters + """ + import json + import os - projected_execution = EXECUTION_OF_KB_DOC * avg_doc_size * number_of_docs / n_workers / 60 - print(f"Projected execution time {projected_execution} min") + # fuzzy parameters + # Get cluster parameters + data_s3_config_dict = json.loads(data_s3_config.replace("'", '"')) + base_folder = data_s3_config_dict.get("output_folder") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + data_s3_config_dict["output_folder"] = os.path.join(base_folder, output_subfolder) + data_s3_config = json.dumps(data_s3_config_dict).replace('"', "'") + duplicate_list_location: str = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") + actor_options = {"num_cpus": runtime_actor_cpus, "memory": runtime_actor_memory} return { "data_s3_config": data_s3_config, "data_max_files": data_max_files, "data_num_samples": data_num_samples, - "runtime_num_workers": n_workers, + "runtime_num_workers": runtime_num_actors, "runtime_worker_options": str(actor_options), "runtime_pipeline_id": runtime_pipeline_id, "runtime_job_id": runtime_job_id, "runtime_code_location": str(runtime_code_location), - "fdedup_doc_column": doc_column, - "fdedup_id_column": id_column, - "fdedup_cluster_column": cluster_column, - "fdedup_bucket_cpu": bucket_cpu, - "fdedup_doc_cpu": doc_cpu, - "fdedup_mhash_cpu": mhash_cpu, - "fdedup_num_doc_actors": d_actors, - "fdedup_num_bucket_actors": b_actors, - "fdedup_num_minhash_actors": m_actors, - "fdedup_num_preprocessors": n_preprocessors, - "fdedup_num_permutations": num_permutations, - "fdedup_threshold": threshold, - "fdedup_shingles_size": shingles_size, - "fdedup_delimiters": delimiters, - "fdedup_random_delay_limit": random_delay_limit, - "fdedup_snapshot_delay": snapshot_delay, - "fdedup_use_doc_snapshot": use_doc_snapshot, - "fdedup_use_bucket_snapshot": use_bucket_snapshot, + "fdclean_document_id_column": id_column, + "fdclean_duplicate_list_location": duplicate_list_location, + "fdclean_operation_mode": operation_mode, } diff --git a/transforms/universal/fdedup/python/.dockerignore b/transforms/universal/fdedup/python/.dockerignore new file mode 100644 index 000000000..f7275bbbd --- /dev/null +++ 
b/transforms/universal/fdedup/python/.dockerignore @@ -0,0 +1 @@ +venv/ diff --git a/transforms/universal/fdedup/python/Dockerfile b/transforms/universal/fdedup/python/Dockerfile new file mode 100644 index 000000000..79c85e4ac --- /dev/null +++ b/transforms/universal/fdedup/python/Dockerfile @@ -0,0 +1,41 @@ +FROM docker.io/python:3.10.14-slim-bullseye + +RUN pip install --upgrade --no-cache-dir pip + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +# Create a user and use it to run the transform +RUN useradd -ms /bin/bash dpk +USER dpk +WORKDIR /home/dpk + +# Copy and install data processing libraries +# These are expected to be placed in the docker context before this is run (see the make image). +COPY --chown=dpk:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} + +COPY --chown=dpk:root src/ src/ +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt + +RUN pip install --no-cache-dir -e . + +# copy source data +COPY ./src/fdedup_transform_python.py fdedup_transform_python.py +COPY ./src/fdedup_transform_python.py local/ + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +# Set environment +ENV PYTHONPATH /home/dpk + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/python/Makefile b/transforms/universal/fdedup/python/Makefile new file mode 100644 index 000000000..05f6bf5ca --- /dev/null +++ b/transforms/universal/fdedup/python/Makefile @@ -0,0 +1,64 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. +include $(REPOROOT)/transforms/.make.transforms + +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.python-venv + +test:: .transforms.python-test + +clean:: .transforms.clean + +image:: .transforms.python-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-python + +setup:: .transforms.setup + +# distribution versions is the same as image version. 
+set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_PYTHON_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.python-test-image + +run-cli-sample: .transforms.run-cli-python-sample + +run-local-sample: .transforms.run-local-sample + +run-local-python-sample: .transforms.run-local-python-sample + +#run-s3-ray-sample: .transforms.run-s3-ray-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/python/README.md b/transforms/universal/fdedup/python/README.md new file mode 100644 index 000000000..295862221 --- /dev/null +++ b/transforms/universal/fdedup/python/README.md @@ -0,0 +1,244 @@ +# Fuzzy Dedup + +Please see the set of +[transform project conventions](../../../README.md) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Contributors +- Nelson Bore (kibnelson@gmail.com) +- Constantin Adam (cmadam@us.ibm.com) + +## Description +The fdedup transform eliminates documents that are highly similar to each other (but not necessarily identical) from a +set of Parquet files. This ensures that the resulting dataset contains only unique or sufficiently distinct entries. + +Fuzzy dedup is a complex process made up of a pipeline that performs four main steps: + +1. **Signature Calculation**: creates a set of minhashes for each document, and uses them to create band signatures for +the document. +2. **Cluster Analysis**: groups documents into clusters based on matching band signatures. Within each cluster, it +retains only the documents that have a Jaccard similarity above a specified threshold, and it identifies which documents +to keep as unique and which ones to mark as duplicates. +3. **Duplicate List Generation**: combines the similarity clusters identified in each band to create a single, unified +list of duplicate documents. +4. **Data Cleaning**: processes the documents by either filtering out duplicates or adding annotations to distinguish +duplicates from non-duplicates. + +Each one of these steps is described in more detail below. + +### Signature Calculation + +This transform computes `num_permutations` minhashes and `num_bands` signatures for each document in the dataset, by +following these processing steps: +1. **Shingle Generation**: create a set of character or word shingles, using a specified window length. Character +shingles are more effective at detecting similar documents, but require more computational resources compared to word +shingles. +2. **Minhash Calculation**: using the shingles as input, compute `num_permutations` minhashes for each document. +3. **Band Signature Calculation**: divide the minhashes into `num_bands`, where each band contains +`num_minhashes_per_band` minhashes. For each document, generate a unique signature for every band. + +The values for `num_bands` and `num_minhashes_per_band` determine the likelihood that documents with a certain Jaccard +similarity will be marked as duplicates. A Jupyter notebook in the [utils](../utils) folder generates a graph of this +probability function, helping users explore how different settings for `num_bands` and `num_minhashes_per_band` impact +the deduplication process. 
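+
+As a rule of thumb (a sketch based on the standard LSH banding analysis from the Mining of Massive Datasets reference
+listed in the Further Resources section below, not code that ships with this transform), the probability that a pair of
+documents with Jaccard similarity `s` shares at least one band signature, and is therefore examined as a candidate
+duplicate pair, is `1 - (1 - s^r)^b`, where `b` is `num_bands` and `r` is `num_minhashes_per_band`:
+
+```python
+# Hypothetical helper, for illustration only; `detection_probability` is not part of the transform.
+def detection_probability(s: float, b: int, r: int) -> float:
+    # A pair collides in one band with probability s**r, and is detected
+    # if it collides in at least one of the b bands.
+    return 1.0 - (1.0 - s**r) ** b
+
+# Example: with 14 bands of 8 minhashes each (an illustrative setting), documents
+# with Jaccard similarity 0.8 are caught with probability ~0.92.
+print(round(detection_probability(0.8, 14, 8), 2))
+```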
+ +To help distribute the workload and speed up processing of the next steps, the hash space of each band is divided into +`num_segments` segments. The band signatures, the minhashes, the document ids, and lengths are stored in an organized +output folder structure `bands/band=b/segment=s`, where `b` is the band number and `s` is the segment number. + +### Cluster Analysis + +This transform leverages segmented processing to analyze the data generated by the **Signature Calculation** step +efficiently and in parallel. Each worker processes a specific segment `s` of a band `b` by loading and analyzing all +Parquet files from the folder `bands/band=b/segment=s`. Each row in the Parquet files contains, for a document: +* `band_hash`, the document's band signature, and +* `document_data`, a structure with three fields: the document's unique ID (`int_id_column`), its `minhashes`, and its length (`document_length`). + +The transform runs the following processing steps: +1. **Data Loading**: combine into a single dataframe all Parquet files in `bands/band=b/segment=s`. +2. **Clustering**: run a `group_by` operation on the `band_hash` column that will group documents with the same band +signature into clusters. +3. **Similarity Analysis**: for each cluster, calculate Jaccard similarity between pairs of documents using their +minhashes, and move documents below the specified Jaccard similarity threshold into new clusters. +4. **Duplicate Identification**: in clusters with more than one document remaining, retain the largest document (breaking ties +by the smallest document ID) and mark all other documents in the cluster as duplicates. +5. **Persist Results**: save the duplicate clusters in a file. + +### Duplicate List Generation + +The **Cluster Analysis** step identifies duplicates across multiple bands, meaning a document can be marked as a +duplicate in one or more bands (e.g., if two documents are identical, one will be marked as a duplicate in all bands). +This transform consolidates all duplicate information from each band segment into a single file, providing a unified +record of duplicates detected across the dataset. + +### Data Cleaning + +This transform processes the original dataset using the list of duplicate documents generated by the **Duplicate List +Generation** step. It imports each file in the original dataset into a table and produces a new dataset. The directory +structure of the input dataset is preserved, but the contents of the output files depend on the selected operation mode: +1. **Annotate** - adds a new `duplicate` column to the dataset, which contains a `d` for documents marked as duplicates +and is empty for non-duplicates. +2. **Filter duplicates** - removes all documents identified as duplicates from the dataset. +3. **Filter non-duplicates** - removes from the dataset all documents that were not marked as duplicates, leaving only +the duplicates. + +The output dataset reflects the selected mode, providing flexibility for downstream processing.
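+
+The three operation modes map directly onto dataframe joins against the consolidated duplicate list. The snippet below
+is a simplified sketch of that idea using polars (the library the transform itself uses); the toy dataframes and the
+`int_id_column` name mirror the defaults used by this transform, while the real implementation lives in
+`data_cleaning_transform.py` and reads its inputs from Parquet files:
+
+```python
+import polars as pl
+
+# Toy stand-ins for the dataset and the consolidated duplicate list.
+dataset = pl.DataFrame({"int_id_column": [1, 2, 3, 4], "contents": ["a", "b", "a", "c"]})
+duplicates = pl.DataFrame({"int_id_column": [3]})  # document 3 was marked as a duplicate
+
+# filter_duplicates: keep only the documents that are NOT on the duplicate list
+cleaned = dataset.join(duplicates, on="int_id_column", how="anti")
+
+# filter_non_duplicates: keep only the documents that ARE on the duplicate list
+only_duplicates = dataset.join(duplicates, on="int_id_column", how="inner")
+
+# annotate: add a `duplicate` column holding "d" for duplicates and "" otherwise
+annotated = dataset.join(
+    duplicates.with_columns(pl.lit("d").alias("duplicate")), on="int_id_column", how="left"
+).with_columns(pl.col("duplicate").fill_null(""))
+```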
+ +## Input Columns Used by This Transform + +| Input Column Name | Data Type | Description | +|---------------------------------------------------------------------|-----------|----------------------------------| +| Column specified by the _contents_column_ configuration argument | str | Column that stores document text | +| Column specified by the _document_id_column_ configuration argument | int64 | Column that stores document ID | + +## Output Columns Annotated by This Transform +| Output Column Name | Data Type | Description | +|------------|-----------|---------------------------------------------------------------------------------------------------------------------| +| duplicate | str | Column added if fuzzy dedup runs in 'annotate' mode. Value is 'd' for duplicate documents, empty for non-duplicates | + +## Configuration and Usage +### Fuzzy Deduplication Transform +The set of dictionary keys holding [Fuzzy Dedup](src/fdedup_transform_python.py) configuration for values are as +follows: +```text +--input_folder INPUT_FOLDER + Input folder path +--output_folder OUTPUT_FOLDER + Output folder path +--operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents +--contents_column CONTENTS_COLUMN + name of the column that stores document text +--document_id_column DOCUMENT_ID_COLUMN + name of the column that stores document ID +--seed SEED seed of the random number generator +--num_permutations NUM_PERMUTATIONS + number of permutations to use for minhash calculation +--num_bands NUM_BANDS + number of bands to use for band hash calculation +--num_minhashes_per_band NUM_MINHASHES_PER_BAND + number of minhashes to use in each band +--word_shingle_size WORD_SHINGLE_SIZE + number of words included in one shingle +--jaccard_similarity_threshold JACCARD_SIMILARITY_THRESHOLD + jaccard similarity threshold above which two documents are similar +--num_segments NUM_SEGMENTS + the number of segments dividing the hashing space for each band (for scalability) +--duplicate_list_location DUPLICATE_LIST_LOCATION + path to the file with all the duplicate document ids +--services SERVICES Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning) +--use_s3 USE_S3 use s3 +--s3_cred S3_CRED ast string of options for s3 credentials +--shingle_option SHINGLE_OPTION + Option used for shingling + +``` + +### Signature Calculation Transform +The set of dictionary keys holding [SignatureCalcTransform](src/signature_calc_transform.py) configuration for values +are as follows: +```text +--minhash_document_id_column MINHASH_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document +--minhash_contents_column MINHASH_CONTENTS_COLUMN + name of the column storing the contents of each document +--minhash_seed MINHASH_SEED + the seed used to instantiate the random number generator +--minhash_num_permutations MINHASH_NUM_PERMUTATIONS + number of permutations (minhashes) calculated for each document +--minhash_word_shingle_size MINHASH_WORD_SHINGLE_SIZE + the size of the word shingles calculated for each document +--minhash_num_bands MINHASH_NUM_BANDS + the number of bands to use in the banding technique +--minhash_num_minhashes_per_band MINHASH_NUM_MINHASHES_PER_BAND + the number of minhashes to use in each band +--minhash_num_segments MINHASH_NUM_SEGMENTS + the number of segments across which we divide the hashing space for 
each band +--minhash_shingle_option MINHASH_SHINGLE_OPTION + Shingling option ('word' or 'char') +``` + +### Cluster Analysis Transform +The set of dictionary keys holding [ClusterAnalysisTransform](src/cluster_analysis_transform.py) configuration for values +are as follows: +```text +--cluster_jaccard_similarity_threshold CLUSTER_JACCARD_SIMILARITY_THRESHOLD + Jaccard similarity threshold above which two documents are duplicates +--cluster_num_bands CLUSTER_NUM_BANDS + The number of bands used in the banding technique +--cluster_num_segments CLUSTER_NUM_SEGMENTS + The number of segments dividing the hashing space for each band +``` + +### Get Duplicates List Transform +This transform currently has no configuration parameters. + +### Data Cleaning Transform +The set of dictionary keys holding [DataCleaningTransform](src/data_cleaning_transform.py) configuration for values +are as follows: +```text + --fdclean_document_id_column FDCLEAN_DOCUMENT_ID_COLUMN + name of the column storing the unique ID assigned to each document + --fdclean_operation_mode {filter_duplicates,filter_non_duplicates,annotate} + operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents +``` + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_python.py +``` +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_python.py +python cluster_analysis_local_python.py +python get_duplicate_list_local_python.py +python data_cleaning_local_python.py +``` +After running the transforms, execute: +```shell +ls output +``` +To see results of the transform. + +### Code example + +TBD (link to the notebook will be provided) + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a pure python runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` + +## Further Resources +The following is a list of references to research articles and github repositories that inspired the module's design: + +1. [Jure Leskovec, Anand Rajaraman, Jeff Ullman, Mining of Massive Datasets, Chapter 3: Finding Similar Items](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) +2. [G Penedo et al., The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale](https://arxiv.org/pdf/2406.17557) +3. 
[Datatrove github repo](https://github.com/huggingface/datatrove) diff --git a/data-processing-lib/spark/pyproject.toml b/transforms/universal/fdedup/python/pyproject.toml similarity index 52% rename from data-processing-lib/spark/pyproject.toml rename to transforms/universal/fdedup/python/pyproject.toml index 89b4d9bf8..08b20ed75 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/transforms/universal/fdedup/python/pyproject.toml @@ -1,31 +1,21 @@ [project] -name = "data_prep_toolkit_spark" -version = "0.2.2.dev2" -keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +name = "dpk_fdedup_transform_python" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" -description = "Data Preparation Toolkit Library for Spark" +description = "Fuzzy Dedup Transform for Python" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev2", - "pyspark>=3.5.2", - "psutil>=6.0.0", - "PyYAML>=6.0.2" -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ @@ -44,7 +34,7 @@ dev = [ package_dir = ["src","test"] [options.packages.find] -where = ["src/data_processing_spark"] +where = ["src/"] [tool.pytest.ini_options] # Currently we use low coverage since we have to run tests separately (see makefile) diff --git a/transforms/universal/fdedup/python/requirements.txt b/transforms/universal/fdedup/python/requirements.txt new file mode 100644 index 000000000..985c0b967 --- /dev/null +++ b/transforms/universal/fdedup/python/requirements.txt @@ -0,0 +1,10 @@ +data-prep-toolkit>=0.2.3.dev0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +scipy>=1.14.1, <2.0.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 diff --git a/transforms/universal/fdedup/python/src/Murmur_MH.py b/transforms/universal/fdedup/python/src/Murmur_MH.py new file mode 100644 index 000000000..03d5047ea --- /dev/null +++ b/transforms/universal/fdedup/python/src/Murmur_MH.py @@ -0,0 +1,112 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + + +import logging +import os +from typing import List, Set + +import mmh3 +import numpy as np + + +class Murmur_MH: + def __init__(self, num_perm=64, seed=42, hashfunc=None): + self.seed = seed + self.num_perm = num_perm # the number of buckets, i.e. the vector length after self.minhash() call + self.permutations = self._init_permutations(seed, num_perm) + + def _init_permutations(self, seed, num_perm): + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + max_int = np.uint64((1 << 64) - 1) + # initialize pseudo random number generator with given seed value + gen = np.random.RandomState(seed) + # get self.num_perm pseudo random numbers between 2 and max_int (excl) + permutations = np.array( + [gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], + dtype=np.uint64, + ).T + # make all even pseudo random numbers odd by adding 1 + permutations[permutations % 2 == 0] += 1 + return permutations + + def minhash(self, shingles: List[str]): + """return np.array of minhash""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + ) + + def minhash2(self, shingles: List[str], doc_len: int): + """ + for each shingle (i.e. a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0), + doc_len, + ) + + def minhash2_nosalt(self, shingles: List[str], doc_len: int, doc_id: int): + """ + for each shingle (i.e. a group of k-words) it generates a digest value based on + mmh3-hash function (32-bit) + + return tuple (A, B) + A = an array of values = np.array of minhash + B = document_length = number of characters""" + # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic + hash_values = np.array([mmh3.hash(shingle, signed=False) for shingle in shingles], dtype=np.uint64) + return ( + np.right_shift( + (hash_values * np.tile(self.permutations, (len(hash_values), 1)).T).T, + 32, + ) + .astype(np.uint32) + .min(axis=0) + .tolist(), + doc_len, + doc_id, + ) + + @staticmethod + def jaccard(mh1: np.array, mh2: np.array) -> float: + """ + The Jaccard similarity measures the similarity between two sets of data + to see which members are shared and distinct. + + The Jaccard similarity is calculated by dividing the number of observations + in both sets by the number of observations in either set. + + Developed by Paul Jaccard, the index ranges from 0 to 1. + The closer to 1, the more similar the two sets of data. + + As a document is represented by a set. We use Jaccard distance to see how similar between two documents. 
+ """ + assert len(mh1) == len(mh2) + return np.count_nonzero(mh1 == mh2) / len(mh1) diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py new file mode 100644 index 000000000..bb785021c --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_local_python.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py new file mode 100644 index 000000000..fa3ce6d28 --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform.py @@ -0,0 +1,335 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import io +import os +import re +from argparse import ArgumentParser, Namespace +from typing import Any, List + +import numpy as np +import polars as pl +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import ( + CLIArgumentProvider, + TransformUtils, + UnrecoverableException, + get_logger, +) +from Murmur_MH import Murmur_MH + + +short_name = "cluster" +cli_prefix = f"{short_name}_" + +# configuration keys +num_bands_key = "num_bands" +""" This key holds the number of bands used in the banding technique""" +num_segments_key = "num_segments" +""" This key holds the number of segments dividing the hashing space for each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" + +# command line arguments +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands used in the banding technique""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments dividing the hashing space for each band""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" + +captured_arg_keys = [ + num_bands_key, + num_segments_key, + jaccard_similarity_threshold_key, + sort_output_key, +] + +# defaults +num_bands_default = 14 +""" Default number of bands used in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments dividing the hashing space for each band""" +sort_output_default = False + + +class ClusterAnalysisTransform(AbstractFolderTransform): + """ + This is the second transform of the fuzzy dedup pipeline. It runs in parallel: + for each band, the hashing interval is divided into segments. A cluster analysis + uses as input all the parquet files from segment of a band. The `bands` output + of the signature calculation, the first transform in the fuzzy dedup pipeline + contains all the data for a given segment s of a specific band b in the + subfolder `bands/band=b/segment=s`. + The transform loads all the parquet files in the `bands/band=b/segment=s` + subfolder. Each one of these parquet files has two columns: the `band_hash` + and a `data` structure, which includes the `document_id`, the `minhashes` and + the `document_size` fields. Once all the files have been loaded in a single + dataframe, a `group_by` operation on the `band_hash` field is performed in + that dataframe. All the documents that have the same band_hash are grouped + in a cluster. Subsequently, the documents of each cluster are sorted in + descending order according to their size, and a Jaccard similarity is + calculated between the cluster documents. The documents for which the Jaccard + similarity is above the `jaccard_similarity_threshold` remain in the cluster, + the others are removed from the cluster. 
Finally, from each cluster that has + more than one document after running the Jaccard similarity, we select a doc + to keep (the largest size document), and mark the other documents as + duplicates. The resulting clusters are saved in a file for further analysis. + + The following internal variables are initialized from the config parameter: + num_bands: number of bands used in the banding technique + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + num_segments: the number of segments dividing the hashing space for each band + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. + """ + super().__init__(config) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_segments = config.get(num_segments_key, num_segments_default) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.sort_output = config.get(sort_output_key, sort_output_default) + self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.debug(f"Cluster analysis for folder {folder_name}") + metadata = {} + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + match = re.match(r"^band=(\d+)/segment=(\d+)$", folder_name) + if match: + band = int(match.group(1)) + segment = int(match.group(2)) + else: + raise ValueError(f"Wrong folder_name {folder_name}, should be band=b/segment=s") + output_folder = TransformUtils.clean_path(self.data_access.output_folder) + output_path = os.path.join(output_folder, f"band_{band}_segment_{segment}.parquet") + + # consolidate into a single data frame band hashes computed by workers + band_segment_dataframe, consolidation_stats = self._consolidate_band_segment_files(files) + metadata |= consolidation_stats + # cluster grouping by band hashes + cluster_dataframe, cluster_stats = self._get_clusters(band_segment_dataframe) + metadata |= cluster_stats + # cluster analysis using jaccard similarity + jaccard_cluster_dataframe, jaccard_stats = self._analyze_clusters(cluster_dataframe) + metadata |= jaccard_stats + # Generate the docs_to_remove dataframe + docs_to_remove_dataframe = jaccard_cluster_dataframe.explode("docs_to_remove") + output_data = TransformUtils.convert_arrow_to_binary(docs_to_remove_dataframe.to_arrow()) + self.logger.debug(f"{len(docs_to_remove_dataframe)} documents marked to remove") + metadata |= {"num_duplicate_documents": len(docs_to_remove_dataframe)} + return [(output_data, output_path)], metadata + + def _consolidate_band_segment_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + band_segment_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has 
{len(df)} rows") + band_segment_dataframe = band_segment_dataframe.vstack(df) + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": band_segment_dataframe.to_arrow().nbytes, + "consolidated_rows": len(band_segment_dataframe), + } + return band_segment_dataframe, consolidation_stats + + def _get_clusters(self, band_segment_dataframe: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + groupby_dataframe = band_segment_dataframe.group_by("band_hash").agg("document_data") + cluster_dataframe = groupby_dataframe.with_columns(cluster_length=pl.col("document_data").list.len()).filter( + pl.col("cluster_length") > 1 + ) + # self.logger.info(f"file_name = {file_name}") + num_clusters = len(cluster_dataframe) + if num_clusters > 0: + sum_cdocs = cluster_dataframe.select(pl.sum("cluster_length")).item() + max_cdocs = cluster_dataframe.select(pl.max("cluster_length")).item() + min_cdocs = cluster_dataframe.select(pl.min("cluster_length")).item() + avg_cdocs = cluster_dataframe.select(pl.mean("cluster_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.debug(f"After GroupBy: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + cluster_stats = { + "groupby_clusters": num_clusters, + "cluster_duplicate_docs": sum_cdocs, + } + return cluster_dataframe, cluster_stats + + def _analyze_clusters(self, df: pl.DataFrame) -> tuple[pl.DataFrame, dict[str, Any]]: + # Define the schema with specific data types + schema = {"first_doc": pl.Int64, "docs_to_remove": pl.List(pl.Int64), "docs_to_remove_length": pl.Int64} + doc_ids_lists = [] + docs_to_remove_lists = [] + len_of_docs2remove_lists = [] + for row in df.iter_rows(named=True): + doc_ids_list, docs_to_remove_list, len_of_docs2remove_list = self._jaccard_distance_calculation(row) + doc_ids_lists += doc_ids_list + docs_to_remove_lists += docs_to_remove_list + len_of_docs2remove_lists += len_of_docs2remove_list + jaccard_cluster_dataframe = pl.DataFrame( + { + "first_doc": doc_ids_lists, + "docs_to_remove": docs_to_remove_lists, + "docs_to_remove_length": len_of_docs2remove_lists, + }, + schema=schema, + ) + filtered_jaccard_dataframe = jaccard_cluster_dataframe.filter(pl.col("docs_to_remove_length") > 0) + num_clusters = len(filtered_jaccard_dataframe) + if num_clusters > 0: + sum_cdocs = filtered_jaccard_dataframe.select(pl.sum("docs_to_remove_length")).item() + max_cdocs = filtered_jaccard_dataframe.select(pl.max("docs_to_remove_length")).item() + min_cdocs = filtered_jaccard_dataframe.select(pl.min("docs_to_remove_length")).item() + avg_cdocs = filtered_jaccard_dataframe.select(pl.mean("docs_to_remove_length")).item() + else: + sum_cdocs = 0 + max_cdocs = 0 + min_cdocs = 0 + avg_cdocs = 0 + self.logger.debug(f"After Jaccard: {num_clusters} clusters with {sum_cdocs} total docs") + self.logger.debug(f" max/min/avg docs per cluster: {max_cdocs}/{min_cdocs}/{avg_cdocs:.2f}") + jaccard_stats = { + "jaccard_clusters": num_clusters, + "jaccard_duplicate_docs": sum_cdocs, + } + if self.sort_output: + filtered_jaccard_dataframe = filtered_jaccard_dataframe.sort(by="first_doc") + return filtered_jaccard_dataframe, jaccard_stats + + def _jaccard_distance_calculation(self, row: List[pl.Series]) -> list[list]: + # Process row and return a new list of Series or a new row + threshold 
= self.jaccard_similarity_threshold + doc_ids_list = [] + docs_to_remove_list = [] + len_of_docs2remove_list = [] + # sort documents + document_data = row["document_data"] + + # Sort the list by 'document_length' + sorted_document_data = sorted(document_data, key=lambda x: (-x["document_length"], x["int_id_column"])) + + # Extracting int_id_column values into a list + doc_list = [item["int_id_column"] for item in sorted_document_data] + + # Creating a dictionary with int_id_column as key and minhashes as value + doc_minhashes = {item["int_id_column"]: item["minhashes"] for item in sorted_document_data} + + while len(doc_list) > 1: + docs_to_remove = [] + new_doc_list = [] + # this is the document we are going to keep + first_doc = doc_list[0] + first_mh = doc_minhashes[first_doc] + for int_id_column in doc_list[1:]: + doc_mh = doc_minhashes[int_id_column] + distance = Murmur_MH.jaccard(np.array(first_mh), np.array(doc_mh)) + if distance >= threshold: + docs_to_remove.append(int_id_column) + else: + new_doc_list.append(int_id_column) + if len(docs_to_remove) > 0: + docs_to_remove = list(set(docs_to_remove)) + doc_ids_list.append(first_doc) + docs_to_remove_list.append(docs_to_remove) + len_of_docs2remove_list.append(len(docs_to_remove)) + doc_list = new_doc_list + + return doc_ids_list, docs_to_remove_list, len_of_docs2remove_list + + +class ClusterAnalysisTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=ClusterAnalysisTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) + """ + parser.add_argument( + f"--{jaccard_similarity_threshold_cli_param}", + type=float, + default=jaccard_similarity_threshold_default, + help="Jaccard similarity threshold above which two documents are duplicates", + ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="The number of bands used in the banding technique", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="The number of segments dividing the hashing space for each band", + ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort the similarity clusters by the document ID of the kept doc (used primarily for testing)", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. 
+ :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py new file mode 100644 index 000000000..c35c5a711 --- /dev/null +++ b/transforms/universal/fdedup/python/src/cluster_analysis_transform_python.py @@ -0,0 +1,76 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class ClusterAnalysisPythonRuntime(DefaultPythonTransformRuntime): + """ + Cluster analysis runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup ClusterAnalysis + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis python transform") + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_local_python.py b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py new file mode 100644 index 000000000..aa4aabb90 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_local_python.py @@ -0,0 +1,60 @@ +# (C) Copyright IBM Corp. 2024. 
+# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform.py b/transforms/universal/fdedup/python/src/data_cleaning_transform.py new file mode 100644 index 000000000..cb07923ae --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform.py @@ -0,0 +1,179 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any + +import polars as pl +import pyarrow as pa +from data_processing.data_access import DataAccessFactory +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, get_logger + + +short_name = "fdclean" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +duplicate_list_location_key = "duplicate_list_location" +""" This key holds the location of the list of duplicate documents marked for removal""" +operation_mode_key = "operation_mode" +""" This key holds the operation mode: 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +duplicate_list_location_cli_param = f"{cli_prefix}{duplicate_list_location_key}" +""" Location of the list of duplicate documents marked for removal""" +operation_mode_cli_param = f"{cli_prefix}{operation_mode_key}" +""" Operation mode, can be one of 'filter_duplicates', 'filter_non_duplicates', or 'annotate'""" + +captured_arg_keys = [ + document_id_column_key, + duplicate_list_location_key, + operation_mode_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +duplicate_list_location_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default location of the list of duplicate documents marked for removal""" +operation_mode_default = "filter_duplicates" +""" Default value for operation mode, will filter out all the duplicate documents""" + +dataclean_data_factory_key = "dc_data_factory" +dataclean_data_access_key = "dc_data_access" + + +class DataCleaningTransform(AbstractTableTransform): + """ + This is the third transform of the fuzzy dedup pipeline. It takes as input + the list of the documents to remove (identified as duplicates during the + cluster analysis phase, and the original dataset. Each dataset file is + imported into a table, and the documents that are in the documents to remove + list are filtered out from that table. The output is a new dataset, which + keeps the directory structure of the input dataset, but has all the fuzzy + duplicates removed. + + The following internal variables are initialized from the config dictionary: + duplicate_list_location: location (local or s3) of the duplicate document list + operation_mode: one of annotate, filter_duplicates, or filter_non_duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. 
+ """ + super().__init__(config) + self.logger = get_logger(__name__) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.duplicate_list_location = config.get(duplicate_list_location_key, duplicate_list_location_default) + self.operation_mode = config.get(operation_mode_key, operation_mode_default) + contents = config.get("df") + self.docs_to_remove_df = pl.read_parquet(io.BytesIO(contents)) + self.logger.debug(f"Got docs_to_remove_df with {len(self.docs_to_remove_df)} rows") + self.docs_to_remove_df = self.docs_to_remove_df.rename({"docs_to_remove": self.document_id_column}) + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") + input_df = pl.from_arrow(table) + # handle the case when the doc_id columns in the input dataframe and the + # docs_to_remove_df have different types, i.e. one is int32 and the + # other is int64 + input_doc_id_type = input_df[self.document_id_column].dtype + if input_doc_id_type != self.docs_to_remove_df[self.document_id_column].dtype: + self.docs_to_remove_df = self.docs_to_remove_df.select( + pl.col(self.document_id_column).cast(input_doc_id_type) + ) + if self.operation_mode == "filter_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="anti") + elif self.operation_mode == "filter_non_duplicates": + result_df = input_df.join(self.docs_to_remove_df, on=self.document_id_column, how="inner") + else: # self.operation_mode == "annotation" + duplicates_df = self.docs_to_remove_df.with_columns(pl.lit("d").alias("duplicate")) + result_df = input_df.join(duplicates_df, on=self.document_id_column, how="left").with_columns( + pl.col("duplicate").fill_null("") + ) + result_table = result_df.to_arrow() + metadata = { + "input_files": 1, + "input_docs": table.num_rows, + "input_bytes": table.nbytes, + "output_files": 1, + "output_docs": result_table.num_rows, + "output_bytes": result_table.nbytes, + "filtered_docs": (table.num_rows - result_table.num_rows), + "filtered_bytes": (table.nbytes - result_table.nbytes), + } + return [result_table], metadata + + +class DataCleaningTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self, transform_class: type[AbstractTableTransform] = DataCleaningTransform): + super().__init__( + name=short_name, + transform_class=transform_class, + remove_from_metadata=[dataclean_data_factory_key], + ) + self.daf = DataAccessFactory(cli_arg_prefix="dcdata_") + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{duplicate_list_location_cli_param}", + type=str, + default=duplicate_list_location_default, + help="location of duplicate document list that are marked for removal", + ) + parser.add_argument( + f"--{operation_mode_cli_param}", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + default=operation_mode_default, + help="operation mode: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) + self.daf.add_input_params(parser=parser) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + self.params[dataclean_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py new file mode 100644 index 000000000..edef8b9c5 --- /dev/null +++ b/transforms/universal/fdedup/python/src/data_cleaning_transform_python.py @@ -0,0 +1,103 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + DefaultPythonTransformRuntime, + PythonTransformRuntimeConfiguration, +) +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger + + +logger = get_logger(__name__) + + +class DataCleaningPythonRuntime(DefaultPythonTransformRuntime): + """ + Data cleaning runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics, files: list[str] + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. 
This may include the addition of new + configuration data such as ray shared memory, new actors, etc., that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the RayOrchestrator. + :param statistics - reference to statistics actor + :param files - list of files to process + :return: dictionary of transform init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for fuzzy dedup data cleaning step + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param: transform_configuration - transform configuration class + :param: runtime_class - name of the runtime configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(DataCleaningTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/fdedup_transform_python.py b/transforms/universal/fdedup/python/src/fdedup_transform_python.py new file mode 100644 index 000000000..def3590e4 --- /dev/null +++ b/transforms/universal/fdedup/python/src/fdedup_transform_python.py @@ -0,0 +1,271 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import argparse +import ast +import os +import sys + +import cluster_analysis_transform +import data_cleaning_transform +import get_duplicate_list_transform +import signature_calc_transform +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils, get_logger, str2bool +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +SERVICE_DICT = { + "SignatureCalculation": "minhash", + "ClusterAnalysis": "cluster", + "GetDuplicateList": "fdlist", + "DataCleaning": "fdclean", +} + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + +ARGS_MAP = { + "minhash": signature_calc_transform.captured_arg_keys, + "cluster": cluster_analysis_transform.captured_arg_keys, + "fdlist": get_duplicate_list_transform.captured_arg_keys, + "fdclean": data_cleaning_transform.captured_arg_keys, +} + + +class ServiceOrchestrator: + def __init__(self, global_params: argparse.Namespace = None): + self.global_params = global_params + self.logger = get_logger(__name__) + + def orchestrate(self): + service_list = self.global_params.services.split(",") + for service in service_list: + self.logger.info(f"Starting {service} step") + if service not in SERVICE_DICT: + err_msg = f"Unknown service {service} specified. 
Must be one of {SERVICE_DICT.keys()}" + self.logger.error(err_msg) + raise ValueError(err_msg) + service_short_name = SERVICE_DICT[service] + service_params = self.get_arguments(self.global_params, service_short_name) + self.logger.info(f"Got parameters for {service}") + status = self.execute_service(service_short_name, service_params) + if status == 0: + self.logger.info(f"{service} completed successfully") + else: + self.logger.error(f"{service} failed with status {status}, aborting ...") + break + + def get_arguments(self, in_args: argparse.Namespace, service_name: str) -> list: + sys_argv = ["python"] + in_args_dict = vars(in_args) + all_module_arguments = ARGS_MAP.get(service_name, []) + passed_args = {k: v for k, v in in_args_dict.items() if k in all_module_arguments and v is not None} + for k, v in passed_args.items(): + sys_argv.append(f"--{service_name}_{k}") + sys_argv.append(str(v)) + if service_name == "minhash": + input_folder = in_args_dict["input_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "cluster": + input_folder = os.path.join(in_args_dict["output_folder"], "bands") + output_folder = os.path.join(in_args_dict["output_folder"], "docs_to_remove") + elif service_name == "fdlist": + input_folder = in_args_dict["output_folder"] + output_folder = in_args_dict["output_folder"] + elif service_name == "fdclean": + input_folder = in_args_dict["input_folder"] + operation_mode = in_args_dict.get("operation_mode", "filter_duplicates") + if operation_mode == "filter_duplicates": + output_subfolder = "cleaned" + elif operation_mode == "filter_non_duplicates": + output_subfolder = "duplicates" + else: # operation_mode == "annotate" + output_subfolder = "annotated" + output_folder = os.path.join(in_args_dict["output_folder"], output_subfolder) + else: + self.logger.error(f"Unknown service name: {service_name}") + data_io = { + "input_folder": input_folder, + "output_folder": output_folder, + } + if in_args.use_s3: + if in_args.s3_cred is not None: + s3_cred_ast = ParamsUtils.convert_to_ast(in_args.s3_cred) + sys_argv.append("--data_s3_cred") + sys_argv.append(s3_cred_ast) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(s3_cred_ast) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(s3_cred_ast) + elif ( + s3_creds.get("access_key") is not None + and s3_creds.get("secret_key") is not None + and s3_creds.get("url") is not None + ): + ast_s3_cred = ParamsUtils.convert_to_ast(s3_creds) + sys_argv.append("--data_s3_cred") + sys_argv.append(ast_s3_cred) + if service_name == "minhash": + sys_argv.append("--scdata_s3_cred") + sys_argv.append(ast_s3_cred) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_cred") + sys_argv.append(ast_s3_cred) + sys_argv.append("--data_s3_config") + else: + sys_argv.append("--data_local_config") + ast_data_io = ParamsUtils.convert_to_ast(data_io) + sys_argv.append(ast_data_io) + if in_args.use_s3: + if service_name == "minhash": + sys_argv.append("--scdata_s3_config") + sys_argv.append(ast_data_io) + if service_name == "fdclean": + sys_argv.append("--dcdata_s3_config") + sys_argv.append(ast_data_io) + if in_args.run_locally: + sys_argv.append(f"--run_locally={in_args.run_locally}") + return sys_argv + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + elif 
service_short_name == "cluster": + launcher = PythonTransformLauncher(runtime_config=ClusterAnalysisPythonTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = PythonTransformLauncher(runtime_config=DataCleaningPythonTransformConfiguration()) + else: + err_msg = f"Unknown service {service_short_name} specified. Must be one of {SERVICE_DICT.values()}" + self.logger.error(err_msg) + raise ValueError(err_msg) + status = launcher.launch() + return status + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Service Orchestrator") + + # Define command line arguments + parser.add_argument("--input_folder", type=str, required=True, help="Input folder path") + parser.add_argument("--output_folder", type=str, required=True, help="Output folder path") + + parser.add_argument( + "--operation_mode", + choices=["filter_duplicates", "filter_non_duplicates", "annotate"], + required=False, + help="operation mode for data cleanup: filter out duplicates/non-duplicates, or annotate duplicate documents", + ) + parser.add_argument( + "--contents_column", type=str, required=False, help="name of the column that stores document text" + ) + parser.add_argument( + "--document_id_column", type=str, required=False, help="name of the column that stores document ID" + ) + parser.add_argument("--seed", type=int, required=False, help="seed of the random number generator") + parser.add_argument( + "--num_permutations", type=int, required=False, help="number of permutations to use for minhash calculation" + ) + parser.add_argument( + "--num_bands", type=int, required=False, help="number of bands to use for band hash calculation" + ) + parser.add_argument( + "--num_minhashes_per_band", type=int, required=False, help="number of minhashes to use in each band" + ) + parser.add_argument( + "--word_shingle_size", type=int, required=False, help="number of words included in one shingle" + ) + parser.add_argument( + "--jaccard_similarity_threshold", + type=float, + required=False, + help="jaccard similarity threshold above which two documents are similar", + ) + parser.add_argument( + "--num_segments", + type=int, + required=False, + help="the number of segments dividing the hashing space for each band (for scalability)", + ) + parser.add_argument( + "--duplicate_list_location", + type=str, + required=False, + help="path to the file with all the duplicate document ids", + ) + + # Single argument for service execution + parser.add_argument( + "--services", + type=str, + required=False, + default="SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning", + help="Comma-separated list of services to run (e.g., SignatureCalculation,ClusterAnalysis,GetDuplicateList,DataCleaning)", + ) + + parser.add_argument( + "--use_s3", + type=lambda x: bool(str2bool(x)), + default=False, + help="use s3", + ) + + parser.add_argument( + "--s3_cred", + type=ast.literal_eval, + default=None, + help="ast string of options for s3 credentials", + ) + + parser.add_argument( + "--shingle_option", + type=str, + required=False, + default="word", + help="Option used for shingling", + ) + + parser.add_argument( + "--run_locally", + type=lambda x: bool(str2bool(x)), + default=False, + help="run locally or connect to a remote machine", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + + # Parse command line arguments + args = 
parse_args() + # Initialize the orchestrator + orchestrator = ServiceOrchestrator(global_params=args) + # Launch python fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py new file mode 100644 index 000000000..c14c4bdce --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform.py @@ -0,0 +1,173 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ +import io +import os +from argparse import ArgumentParser, Namespace +from typing import Any + +import polars as pl +from data_processing.transform import AbstractFolderTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger + + +short_name = "fdlist" +cli_prefix = f"{short_name}_" + +# configuration keys +subfolder_key = "docs_to_remove" +""" This key holds the name of the subfolder with the duplicate records""" +consolidated_filename_key = "consolidated_filename" +""" This key holds the name of the file with the consolidated list of duplicates""" +sort_output_key = "sort_output" +""" This key is used to sort""" + +# command line arguments +subfolder_cli_param = f"{cli_prefix}{subfolder_key}" +""" The name of the subfolder with the duplicate records""" +consolidated_filename_cli_param = f"{cli_prefix}{consolidated_filename_key}" +""" The name of the file with the consolidated list of duplicates""" +sort_output_cli_param = f"{cli_prefix}{sort_output_key}" +""" Sort the output""" + +captured_arg_keys = [ + subfolder_key, + consolidated_filename_key, + sort_output_key, +] + +# defaults +subfolder_default = "docs_to_remove" +""" Default name of the subfolder with the duplicate records""" +consolidated_filename_default = os.path.join("docs_to_remove_consolidated", "docs_to_remove_consolidated.parquet") +""" Default name of the file with the consolidated list of duplicates""" +sort_output_default = False + + +class GetDuplicateListTransform(AbstractFolderTransform): + """ + This is an intermediate step of the fuzzy dedup pipeline. It runs in a single + location and consolidates in a single file all the duplicates found for each + band segment. + These internal variables are initialized from the config dictionary: + subfolder: name of the subfolder with the duplicate records + consolidated_filename: name of the file with the consolidated list of duplicates + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments + defined by the companion runtime, ClusterAnalysisTransformRuntime. 
+ """ + super().__init__(config) + self.subfolder = config.get(subfolder_key, subfolder_default) + self.consolidated_filename = config.get(consolidated_filename_key, consolidated_filename_default) + self.sort_output = config.get(sort_output_key, sort_output_default) + self.data_access = config.get("data_access") + self.logger = get_logger(__name__) + + def transform(self, folder_name: str) -> tuple[list[tuple[bytes, str]], dict[str, Any]]: + self.logger.info(f"Get Duplicate List for folder {folder_name}") + metadata = {} + input_folder = TransformUtils.clean_path(os.path.join(self.data_access.input_folder, folder_name)) + files, retries = self.data_access.get_folder_files( + path=input_folder, + extensions=[".parquet"], + return_data=True, + ) + if retries > 0: + metadata |= {"data_access_retries": retries} + output_folder = TransformUtils.clean_path(self.data_access.output_folder) + output_path = os.path.join(output_folder, self.consolidated_filename) + + # consolidate into a single data frame band hashes computed by workers + consolidated_dataframe, consolidation_stats = self._consolidate_docs_to_remove_files(files) + self.logger.info(f"{len(consolidated_dataframe)} documents marked as duplicates") + metadata |= consolidation_stats + output_data = TransformUtils.convert_arrow_to_binary(consolidated_dataframe.to_arrow()) + return [(output_data, output_path)], metadata + + def _consolidate_docs_to_remove_files(self, files: dict[str, bytes]) -> tuple[pl.DataFrame, dict[str, Any]]: + consolidated_dataframe = pl.DataFrame() + total_input_rows = 0 + for fname, contents in files.items(): + df = pl.read_parquet(io.BytesIO(contents)) + total_input_rows += len(df) + self.logger.debug(f"{fname} has {len(df)} rows") + consolidated_dataframe = consolidated_dataframe.vstack(df) + consolidated_dataframe = consolidated_dataframe.select("docs_to_remove").unique() + + consolidation_stats = { + "input_files": len(files), + "input_bytes": sum(len(v) for v in files.values()), + "input_rows": total_input_rows, + "consolidated_files": 1, + "consolidated_bytes": consolidated_dataframe.to_arrow().nbytes, + "consolidated_rows": len(consolidated_dataframe), + } + if self.sort_output: + consolidated_dataframe = consolidated_dataframe.sort(by="docs_to_remove") + + return consolidated_dataframe, consolidation_stats + + +class GetDuplicateListTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=GetDuplicateListTransform, + remove_from_metadata=[], + ) + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the GetDuplicateListTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{subfolder_cli_param}", + type=str, + default=subfolder_default, + help="The name of the subfolder with the duplicate records", + ) + parser.add_argument( + f"--{consolidated_filename_cli_param}", + type=str, + default=consolidated_filename_default, + help="The name of the file with the consolidated list of duplicates", + ) + parser.add_argument( + f"--{sort_output_cli_param}", + type=bool, + default=sort_output_default, + help="Sort", + ) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + return True diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py new file mode 100644 index 000000000..34b18ab04 --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_local_python.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "cluster_analysis") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} + +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + print(sys.argv) + # create launcher + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py new file mode 100644 index 000000000..703ef630e --- /dev/null +++ b/transforms/universal/fdedup/python/src/get_duplicate_list_transform_python.py @@ -0,0 +1,71 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import time +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.runtime.pure_python import ( + DefaultPythonTransformRuntime, + PythonTransformLauncher, + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListPythonRuntime(DefaultPythonTransformRuntime): + """ + Get duplicate list runtime support for Python + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for Fuzzy Dedup GetDuplicateList + as required by the PythonTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListPythonRuntime, + ) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list python transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_local_python.py b/transforms/universal/fdedup/python/src/signature_calc_local_python.py new file mode 100644 index 000000000..be395ed4d --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_local_python.py @@ -0,0 +1,51 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
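The GetDuplicateList step above reduces to stacking the per-band docs_to_remove tables and keeping the distinct document ids. Below is a minimal polars sketch of that consolidation with made-up ids; the column name and the accumulate-then-unique pattern come from the transform, while the band labels and id values are illustrative.

    import polars as pl

    # Illustrative outputs of the cluster analysis step: one small table per band
    # segment, each with a "docs_to_remove" column of flagged document ids.
    band_outputs = {
        "band_0_segment_0": pl.DataFrame({"docs_to_remove": [11, 42]}),
        "band_1_segment_0": pl.DataFrame({"docs_to_remove": [42, 77]}),
    }

    consolidated = None
    for name, df in band_outputs.items():
        # stack the tables, as _consolidate_docs_to_remove_files does
        consolidated = df if consolidated is None else consolidated.vstack(df)
    # keep one row per duplicate document id
    consolidated = consolidated.select("docs_to_remove").unique()

    # optional ordering, the effect of enabling --fdlist_sort_output
    print(consolidated.sort(by="docs_to_remove"))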
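For orientation, the ServiceOrchestrator in fdedup_transform_python.py above wires the four steps together purely through folder conventions derived from the top-level --input_folder and --output_folder arguments. The sketch below condenses the mapping performed by get_arguments; the root paths are placeholders, and only the folder wiring is reproduced, not the credential or CLI plumbing.

    import os

    # Condensed view of the per-step input/output folders chosen by
    # ServiceOrchestrator.get_arguments; root_in/root_out stand in for the
    # user-supplied --input_folder/--output_folder values.
    def step_io(step: str, root_in: str, root_out: str, operation_mode: str = "filter_duplicates") -> tuple[str, str]:
        if step == "minhash":  # signature calculation reads the raw dataset
            return root_in, root_out
        if step == "cluster":  # cluster analysis reads the band signatures
            return os.path.join(root_out, "bands"), os.path.join(root_out, "docs_to_remove")
        if step == "fdlist":   # duplicate-list consolidation stays inside the output tree
            return root_out, root_out
        if step == "fdclean":  # data cleaning reads the raw dataset again
            subfolder = {
                "filter_duplicates": "cleaned",
                "filter_non_duplicates": "duplicates",
            }.get(operation_mode, "annotated")
            return root_in, os.path.join(root_out, subfolder)
        raise ValueError(f"unknown service {step}")


    for step in ("minhash", "cluster", "fdlist", "fdclean"):
        print(step, step_io(step, "/data/input", "/data/output"))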
+################################################################################ + +import os +import sys +from ast import Param + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + + # create launcher + launcher = PythonTransformLauncher(runtime_config=SignatureCalculationPythonTransformConfiguration()) + # Launch python to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform.py b/transforms/universal/fdedup/python/src/signature_calc_transform.py new file mode 100644 index 000000000..4e64bcb5a --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform.py @@ -0,0 +1,517 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
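The parameters used here and in the defaults that follow (112 permutations split into 14 bands of 8 minhashes each, with a Jaccard similarity threshold of 0.75) reflect the usual minhash-LSH trade-off: two documents with Jaccard similarity s share at least one band hash with probability 1 - (1 - s^r)^b. The sketch below only evaluates that standard estimate for the configured b and r; it is background math, not code from this transform.

    # Probability that two documents with Jaccard similarity s collide in at least
    # one band, for b bands of r minhashes each (standard minhash-LSH estimate).
    def collision_probability(s: float, b: int = 14, r: int = 8) -> float:
        return 1.0 - (1.0 - s**r) ** b


    if __name__ == "__main__":
        # b * r = 112 matches minhash_num_permutations in the parameters above
        for s in (0.6, 0.75, 0.9):
            print(f"similarity {s:.2f}: caught in some band with p = {collision_probability(s):.2f}")
        # At the default threshold of 0.75 this estimate is roughly 0.77; the cluster
        # analysis step additionally applies its own jaccard_similarity_threshold when
        # confirming candidate duplicates.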
+################################################################################ +import os +import re +import unicodedata +from argparse import ArgumentParser, Namespace +from pathlib import Path +from typing import Any + +import mmh3 +import numpy as np +import polars as pl +import pyarrow as pa +from data_processing.data_access import DataAccessFactory +from data_processing.transform import AbstractTableTransform, TransformConfiguration +from data_processing.utils import CLIArgumentProvider, UnrecoverableException +from Murmur_MH import Murmur_MH + + +short_name = "minhash" +cli_prefix = f"{short_name}_" + +# configuration keys +document_id_column_key = "document_id_column" +""" This key holds the name of the column storing the unique ID assigned to each document""" +contents_column_key = "contents_column" +""" This key holds the name of the column storing the contents of each document""" +seed_key = "seed" +""" This key holds the seed used to instantiate the random number generator""" +num_permutations_key = "num_permutations" +""" This key holds the number of permutations that determine how many minhashes to calculate for each document""" +num_bands_key = "num_bands" +""" This key holds the number of bands to use in the banding technique""" +num_minhashes_per_band_key = "num_minhashes_per_band" +""" This key holds the number of minhashes to use in each band""" +jaccard_similarity_threshold_key = "jaccard_similarity_threshold" +""" This key holds the Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_key = "word_shingle_size" +""" This key holds the size of the word shingles calculated for each document""" +num_segments_key = "num_segments" +""" This key holds the number of segments across which we divide the hashing space for each band""" +shingle_option_key = "shingle_option" +""" This key holds the option that is used to do shingles calculation for each document""" + +# command line arguments +document_id_column_cli_param = f"{cli_prefix}{document_id_column_key}" +""" Name of the column storing the unique ID assigned to each document""" +contents_column_cli_param = f"{cli_prefix}{contents_column_key}" +""" Name of the column storing the contents of each document""" +seed_cli_param = f"{cli_prefix}{seed_key}" +""" The seed used to instantiate the random number generator""" +num_permutations_cli_param = f"{cli_prefix}{num_permutations_key}" +""" Number of permutations that determine how many minhashes to calculate for each document""" +num_bands_cli_param = f"{cli_prefix}{num_bands_key}" +""" The number of bands to use in the banding technique""" +num_minhashes_per_band_cli_param = f"{cli_prefix}{num_minhashes_per_band_key}" +""" The number of minhashes to use in each band""" +jaccard_similarity_threshold_cli_param = f"{cli_prefix}{jaccard_similarity_threshold_key}" +""" Jaccard similarity threshold above which two documents are duplicates""" +word_shingle_size_cli_param = f"{cli_prefix}{word_shingle_size_key}" +""" The size of the word shingles calculated for each document""" +num_segments_cli_param = f"{cli_prefix}{num_segments_key}" +""" The number of segments across which we divide the hashing space for each band""" +shingle_option_cli_param = f"{cli_prefix}{shingle_option_key}" +""" The option (word/char) used to do shingles calculation for each document""" + +captured_arg_keys = [ + document_id_column_key, + contents_column_key, + seed_key, + num_bands_key, + num_minhashes_per_band_key, + num_permutations_key, + 
jaccard_similarity_threshold_key, + word_shingle_size_key, + num_segments_key, + shingle_option_key, +] + +# defaults +document_id_column_default = "int_id_column" +""" Default name of the column storing the unique ID assigned to each document""" +contents_column_default = "contents" +""" Default name of the column storing the contents of each document""" +seed_default = 42 +""" Default seed used to instantiate the random number generator""" +num_permutations_default = 112 +""" Default number of minhashes used for each document (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_bands_default = 14 +""" Default number of bands to use in the banding technique (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_minhashes_per_band_default = 8 +""" Default number of minhashes to use in each band (from FineWeb https://arxiv.org/pdf/2406.17557)""" +word_shingle_size_default = 5 +""" Default size of the word shingles (from FineWeb https://arxiv.org/pdf/2406.17557)""" +jaccard_similarity_threshold_default = 0.75 +""" Default Jaccard similarity threshold (from FineWeb https://arxiv.org/pdf/2406.17557)""" +num_segments_default = 1 +""" Default number of segments across which we divide the hashing space for each band""" +shingle_option_default = "word" +""" Default option of doing shingling""" + + +sigcalc_data_factory_key = "sc_data_factory" +sigcalc_data_access_key = "sc_data_access" + + +NUMBERS_PATTERN = re.compile(r"\d+(\.\d+)?") +WHITESPACE_PATTERN = re.compile(r"\s+") +PUNCTUATION = "!/—”:%1〈&(、━\\【#%「」,】;+^]~“《„';’{|∶´[=-`*.(–?!:$~«〉,><》)?)。…@_.\"}►»" + "".join( + map( + chr, + (x for a, b in ((0, 9), (11, 13), (13, 32), (127, 160)) for x in range(a, b)), + ) +) +PUNCTUATION_SET = set(PUNCTUATION) +PUNCTUATION_TRANS = str.maketrans(PUNCTUATION, " " * len(PUNCTUATION)) + + +class SignatureCalculationTransform(AbstractTableTransform): + """ + This is the first transform of the fuzzy dedup pipeline. First, it calculates, + for each document in a dataset, `num_permutations` minhashes. It accepts as + input the number of bands and the length (number of minhashes used for) each + band. The band signatures, the minhashes and the document lengths are + then saved in the output folder, under a folder structure `bands/band=b/segment=s`. + To improve scalability of the next step of fuzzy dedup, the hash space of + each band is divided into `num_segments` segments. + + The following internal variables are retrieved from the config parameter: + document_id_column: name of the column storing the unique ID assigned to each document + contents_column_cli_param: name of the column storing the contents of each document + seed: the seed used to instantiate the random number generator + num_permutations: number of minhashes to calculate for each document + num_bands: number of bands to use for banding technique + num_minhashes_per_band: number of minhashes to use in each band + jaccard_similarity_threshold: Jaccard similarity threshold above which two documents are duplicates + word_shingle_size: the size of the word shingles calculated for each document + num_segments: the number of segments across which we divide the hashing space for each band + """ + + def __init__(self, config: dict[str, Any]): + """ + Initialize based on the dictionary of configuration information. + This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, SignatureCalculationTransformRuntime. 
If running inside the RayMutatingDriver, + these will be provided by that class with help from the RayMutatingDriver. + """ + super().__init__(config) + self.document_id_column = config.get(document_id_column_key, document_id_column_default) + self.contents_column = config.get(contents_column_key, contents_column_default) + self.seed = config.get(seed_key, seed_default) + self.num_permutations = config.get(num_permutations_key, num_permutations_default) + self.jaccard_similarity_threshold = config.get( + jaccard_similarity_threshold_key, jaccard_similarity_threshold_default + ) + self.word_shingle_size = config.get(word_shingle_size_key, word_shingle_size_default) + self.num_segments = config.get(num_segments_key, num_segments_default) + self.num_bands = config.get(num_bands_key, num_bands_default) + self.num_rows = config.get(num_minhashes_per_band_key, num_minhashes_per_band_default) + self.shingle_option = config.get(shingle_option_key, shingle_option_default) + # use this dataframe to store the minhashes and size for each document + self.all_minhashes = None + # use this dataframe to store the band hashes for each document + self.all_band_hashes = None + # this variable keeps track of how many files were processed since last + # data write to properly update metadata + self.files_processed = 0 + self.bytes_processed = 0 + self.data_access = config.get("data_access") + if self.data_access is None: + raise UnrecoverableException("Could not get a pointer to the data access object inside the transform.") + self.last_file_name = None + + self.sc_data_access = config.get(sigcalc_data_access_key, None) + self.sc_daf = config.get(sigcalc_data_factory_key, None) + if self.sc_daf is None: + raise RuntimeError(f"Missing configuration value for key {sigcalc_data_factory_key}") + + def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: + """ + Put Transform-specific to convert one Table to 0 or more tables. It also returns + a dictionary of execution statistics - arbitrary dictionary + This implementation makes no modifications so effectively implements a copy of the + input parquet to the output folder, without modification. 
+ """ + self.logger.debug(f"Transforming table with {table.num_rows} rows from file {file_name}") + self.logger.debug("----minhash---") + self.last_file_name = file_name + self.files_processed += 1 + self.bytes_processed += table.nbytes + # instantiate with same seed so every worker use same hash functions + mm_min_hash = Murmur_MH(num_perm=self.num_permutations, seed=self.seed) + + # load the data from pyarrow table + df = pl.from_arrow(table) + # read the target columns + df = df.select(self.contents_column, self.document_id_column) + + # generate minhash values + minhashes = df.map_rows( + lambda row: mm_min_hash.minhash2_nosalt( + *self._generate_word_shingles(row, self.shingle_option, window_size=self.word_shingle_size) + ) + ) + # rename columns, cast minhashes to list(uint32) + minhashes = minhashes.select( + pl.col("column_2").alias(self.document_id_column), + pl.col("column_0").cast(pl.List(pl.UInt32)).alias("minhashes"), + pl.col("column_1").alias("document_length"), + ) + # store the minhash calculations to send out at the end of execution + if self.all_minhashes is None: + self.all_minhashes = minhashes + else: + self.all_minhashes = self.all_minhashes.vstack(minhashes) + + # Calculate band hashes + band_hashes_list = self._process_rows_into_bands( + minhashes, + self.num_bands, + self.num_rows, + ) + band_hash_schema = pl.Schema( + { + "band_hash": pl.UInt64, + "band_index": pl.Int32, + self.document_id_column: pl.Int64, + } + ) + band_hashes = pl.DataFrame(band_hashes_list, schema=band_hash_schema) + + # store the band hash calculations to send out at the end of execution + if self.all_band_hashes is None: + self.all_band_hashes = band_hashes + else: + self.all_band_hashes = self.all_band_hashes.vstack(band_hashes) + + if len(self.all_minhashes) > 750000: + tables, metadata = self._write_band_signatures() + else: + tables = [] + metadata = {} + # update metadata stats and return the stats (no tables are returned in transform) + return tables, metadata + + def flush(self) -> tuple[list[pa.Table], dict[str, Any]]: + """ + This is supporting method for transformers, that implement buffering of tables, for example coalesce. + These transformers can have buffers containing tables that were not written to the output. Flush is + the hook for them to return back locally stored tables and their statistics. The majority of transformers + should use default implementation. + If there is an error, an exception must be raised - exit()ing is not generally allowed when running in Ray. 
+ :return: a tuple of a list of 0 or more converted tables and a dictionary of statistics that will be + propagated to metadata + """ + self.logger.info(f"Starting flush()") + if self.all_band_hashes is not None and self.all_minhashes is not None: + tables, metadata = self._write_band_signatures() + else: + tables = [] + metadata = {} + return tables, metadata + + def _write_band_signatures(self): + # define the upper and lower bounds of each band segment + if self.sc_data_access is None: + self.sc_data_access = self.sc_daf.create_data_access() + segment_bounds_list = [] + upper_bound = np.uint64(np.iinfo(np.uint64).max) + segment_len = np.uint64(upper_bound // self.num_segments) + for segment_index in range(self.num_segments): + segment_bounds_list.append(np.uint64(segment_index) * segment_len) + segment_bounds_list.append(upper_bound) + segment_bounds = np.array(segment_bounds_list, dtype=np.uint64) + self.logger.debug(f"Calculated {len(segment_bounds)} segment_bounds") + # output stats for the metadata + num_tables_written = 0 + num_docs_written = 0 + num_bytes_written = 0 + self.logger.debug(f"dataframe self.all_band_hashes has {len(self.all_band_hashes)} rows") + self.logger.debug(f"dataframe self.all_minhashes has {len(self.all_minhashes)} rows") + # iterate through the bands, get the band hashes for each band, divide + # them into segments, join with minhashes, and upload to storage + for band_ix in range(self.num_bands): + # Filtering on, then dropping the `band_index` column + band_df = self.all_band_hashes.filter(pl.col("band_index") == band_ix).drop("band_index") + # assign each band hash to a segment of the hashing space + self.logger.debug(f"band {band_ix} band_df has {len(band_df)} rows") + for segment_index in range(self.num_segments): + segment_band_df = band_df.filter( + (pl.col("band_hash") > segment_bounds[segment_index]) + & (pl.col("band_hash") <= segment_bounds[segment_index + 1]) + ) + self.logger.debug( + f"band {band_ix} segment {segment_index} segment_band_df has {len(segment_band_df)} rows" + ) + # join the band hash dataframe with the minihash and doc length dataframe + segment_band_minhash_df = segment_band_df.join( + self.all_minhashes, + on=self.document_id_column, + how="inner", + ) + self.logger.debug(f"band {band_ix} segment {segment_index} joined segment_band_df and minhashes") + + # encapsulate document info in a structure + segment_band_minhash_df = segment_band_minhash_df.select( + pl.col("band_hash"), + pl.struct( + [ + pl.col(self.document_id_column), + pl.col("minhashes"), + pl.col("document_length"), + ] + ).alias("document_data"), + ) + self.logger.debug(f"band {band_ix} segment {segment_index} encapsulated document info in a structure") + + # append the table to the result list, and the path to metadata + last_file_name_path = Path(self.last_file_name) + suffix_path = last_file_name_path.relative_to(self.data_access.input_folder) + if self.sc_data_access.output_folder is None: + self.sc_data_access.output_folder = self.data_access.output_folder + save_path = os.path.join( + self.sc_data_access.output_folder, + "bands", + f"band={band_ix}", + f"segment={segment_index}", + suffix_path, + ) + segment_band_minhash_table = segment_band_minhash_df.to_arrow() + bytes_written, _, _ = self.sc_data_access.save_table(save_path, segment_band_minhash_table) + if bytes_written > 0: + num_tables_written += 1 + num_docs_written += segment_band_minhash_table.num_rows + num_bytes_written += bytes_written + self.logger.debug(f"Uploaded table for band {band_ix} and 
segment {segment_index}") + # add the stats to metadata + metadata = { + "input_files": self.files_processed, + "input_docs": len(self.all_minhashes), + "input_bytes": self.bytes_processed, + "output_files": num_tables_written, + "output_docs": num_docs_written, + "output_bytes": num_bytes_written, + } + self.logger.info(f"Wrote {num_tables_written} tables with a total size of {num_bytes_written:,d} bytes") + self.files_processed = 0 + self.bytes_processed = 0 + self.all_minhashes = None + self.all_band_hashes = None + return [], metadata + + # define shingles generation function + def _generate_word_shingles( + self, row: tuple, shingling_option: str, window_size: int = 5, delimiter: str = " " + ) -> tuple[list, int, int]: + text = row[0] + # lower case + text = text.lower() + # replace numbers with '0' + text = NUMBERS_PATTERN.sub("0", text) + # convert punctuation to spaces + text = text.translate(PUNCTUATION_TRANS) + # remove consecutive spaces, newlines, tabs in the middle and in the beginning / end + text = WHITESPACE_PATTERN.sub(" ", text.strip()) + # diacritics/unicode normalization + text = "".join(c for c in unicodedata.normalize("NFD", text) if unicodedata.category(c) != "Mn") + text = text.strip() + self.logger.debug(shingling_option) + if shingling_option == "char": + words = list(text) + else: + words = text.split() + document_id = row[1] + doc_len = len(row[0]) + word_count = len(words) + k_shingles = [] + for i in range(0, max(1, word_count - window_size + 1)): + k_shingles.append(delimiter.join(words[i : i + window_size])) + return k_shingles, doc_len, document_id + + def _emit_bands(self, int_id_column: str, minhashes: np.array, b: int, r: int, seed: int = 42): + num_minhashes = len(minhashes) + assert b * r <= num_minhashes, f"b*r must be <= num minhashes, was b={b}, r={r}, num_minhashes={num_minhashes}" + results = [] + for band_index in range(b): + band_hash, _ = mmh3.hash64( + minhashes[band_index * r : (band_index + 1) * r], + seed=seed, + signed=False, + ) + results.append((band_hash, band_index, int_id_column)) + return results + + # Apply the function + def _process_rows_into_bands(self, df, minhashlsh_num_bands, minhashlsh_length_band): + result = [] + for row in df.iter_rows(): + bands = self._emit_bands( + row[0], # document id + np.array(row[1], dtype=np.uint32), # minhashes + minhashlsh_num_bands, + minhashlsh_length_band, + ) + for band in bands: + result.append(band) + return result + + +class SignatureCalculationTransformConfiguration(TransformConfiguration): + + """ + Provides support for configuring and using the associated Transform class include + configuration with CLI args. + """ + + def __init__(self): + super().__init__( + name=short_name, + transform_class=SignatureCalculationTransform, + remove_from_metadata=[sigcalc_data_factory_key], + ) + self.daf = DataAccessFactory(cli_arg_prefix="scdata_") + + from data_processing.utils import get_logger + + self.logger = get_logger(__name__, level="INFO") + + def add_input_params(self, parser: ArgumentParser) -> None: + """ + Add Transform-specific arguments to the given parser. + This will be included in a dictionary used to initialize the NOOPTransform. + By convention a common prefix should be used for all transform-specific CLI args + (e.g, noop_, pii_, etc.) 
+ """ + parser.add_argument( + f"--{document_id_column_cli_param}", + type=str, + default=document_id_column_default, + help="name of the column storing the unique ID assigned to each document", + ) + parser.add_argument( + f"--{contents_column_cli_param}", + type=str, + default=contents_column_default, + help="name of the column storing the contents of each document", + ) + parser.add_argument( + f"--{seed_cli_param}", + type=int, + default=seed_default, + help="the seed used to instantiate the random number generator", + ) + parser.add_argument( + f"--{num_permutations_cli_param}", + type=int, + default=num_permutations_default, + help="number of permutations (minhashes) calculated for each document", + ) + parser.add_argument( + f"--{jaccard_similarity_threshold_cli_param}", + type=float, + default=jaccard_similarity_threshold_default, + help="Jaccard similarity threshold above which two documents are duplicates", + ) + parser.add_argument( + f"--{word_shingle_size_cli_param}", + type=int, + default=word_shingle_size_default, + help="the size of the word shingles calculated for each document", + ) + parser.add_argument( + f"--{num_bands_cli_param}", + type=int, + default=num_bands_default, + help="the number of bands to use in the banding technique", + ) + parser.add_argument( + f"--{num_minhashes_per_band_cli_param}", + type=int, + default=num_minhashes_per_band_default, + help="the number of minhashes to use in each band", + ) + parser.add_argument( + f"--{num_segments_cli_param}", + type=int, + default=num_segments_default, + help="the number of segments across which we divide the hashing space for each band", + ) + parser.add_argument( + f"--{shingle_option_cli_param}", + type=str, + default=shingle_option_default, + help="Shingling option", + ) + self.daf.add_input_params(parser=parser) + + def apply_input_params(self, args: Namespace) -> bool: + """ + Validate and apply the arguments that have been parsed + :param args: user defined arguments. + :return: True, if validate pass or False otherwise + """ + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + self.logger.info(f"{short_name} parameters are : {self.params}") + self.params[sigcalc_data_factory_key] = self.daf + return self.daf.apply_input_params(args=args) diff --git a/transforms/universal/fdedup/python/src/signature_calc_transform_python.py b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py new file mode 100644 index 000000000..40e0e97e3 --- /dev/null +++ b/transforms/universal/fdedup/python/src/signature_calc_transform_python.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import time + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.runtime.pure_python.runtime_configuration import ( + PythonTransformRuntimeConfiguration, +) +from data_processing.utils import get_logger +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationPythonTransformConfiguration(PythonTransformRuntimeConfiguration): + """ + Implements the PythonTransformConfiguration for NOOP as required by the PythonTransformLauncher. + NOOP does not use a RayRuntime class so the superclass only needs the base + python-only configuration. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + launcher = PythonTransformLauncher(SignatureCalculationTransformConfiguration()) + logger.info("Launching fuzzy dedup signature calculation transform") + launcher.launch() diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json new file mode 100644 index 000000000..de47f367b --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:34:04", + "end_time": "2024-10-18 10:34:04", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 96.1, + "gpus": 0, + "memory": 23.82, + "object_store": 0, + "execution time, min": 0.006 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.341, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + 
"name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 000000000..f5da05a10 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet new file mode 100644 index 000000000..0e089dee3 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 000000000..4b0fecb15 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file mode 100644 index 000000000..8e1fe121e Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + "target": { + "name": 
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..edbd80b43 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/metadata.json b/transforms/universal/fdedup/python/test-data/expected/metadata.json new file mode 100644 index 000000000..ba1f5b0a6 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 13:22:42", + "end_time": "2024-10-18 13:22:42", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 32.5, + "gpus": 0, + "memory": 13.31, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.047, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and 
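The `get_list_transform` metadata above summarizes the consolidation step: 28 `band_*_segment_*` files holding 44 rows of flagged documents collapse into a single `docs_to_remove_consolidated.parquet` with 8 unique ids (the same document is typically flagged in several bands). A rough sketch of that kind of consolidation, with illustrative paths and pandas assumed available:

```python
import glob

import pandas as pd

# Gather every band/segment docs_to_remove file produced by cluster analysis
# and keep one row per unique duplicate id.
files = glob.glob("cluster_analysis/docs_to_remove/band_*_segment_*.parquet")
frames = [pd.read_parquet(f) for f in files]
consolidated = pd.concat(frames, ignore_index=True).drop_duplicates()
consolidated.to_parquet("docs_to_remove_consolidated.parquet")
```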
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=0/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=1/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=10/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=11/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..842ce2caa Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=12/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=13/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..e67164596 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=2/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=3/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=4/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=5/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=6/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=7/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and 
b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=8/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=0/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/expected/signature_calc/bands/band=9/segment=1/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..8a62a81b2 --- /dev/null +++ b/transforms/universal/fdedup/python/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:08:23", + "end_time": "2024-10-18 10:08:23", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.8, + "gpus": 0, + "memory": 24.15, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.006, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet new file mode 100644 index 000000000..c9220bf39 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/input/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet new file mode 100644 index 000000000..23fac4c72 Binary files /dev/null and b/transforms/universal/fdedup/python/test-data/input/data_2/df2.parquet 
differ diff --git a/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py new file mode 100644 index 000000000..cecd224fe --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_cluster_analysis_transform_python.py @@ -0,0 +1,48 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_python import ( + ClusterAnalysisPythonTransformConfiguration, +) +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + basedir + "/expected/signature_calc/bands", + basedir + "/expected/cluster_analysis/docs_to_remove", + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py new file mode 100644 index 000000000..8c4debed9 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_data_cleaning_transform_python.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
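The cluster-analysis test above feeds the bands written by signature calculation into the transform and compares the result with `expected/cluster_analysis/docs_to_remove`. Outside of pytest, the same configuration can be handed to the launcher directly; a sketch with illustrative paths, following the `ParamsUtils.dict_to_req` pattern used by the local-run scripts elsewhere in this change set:

```python
import sys

from cluster_analysis_transform_python import ClusterAnalysisPythonTransformConfiguration
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils

local_conf = {
    "input_folder": "test-data/expected/signature_calc/bands",  # illustrative path
    "output_folder": "output/docs_to_remove",                   # illustrative path
}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "cluster_num_bands": 14,
    "cluster_num_segments": 2,
    "cluster_jaccard_similarity_threshold": 0.7,
}
# Simulate the command line the launcher expects, then run the transform
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = PythonTransformLauncher(ClusterAnalysisPythonTransformConfiguration())
launcher.launch()
```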
+################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_python import DataCleaningPythonTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) + + +class TestPythonDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + } + launcher = PythonTransformLauncher(DataCleaningPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input", basedir + "/expected/data_cleaning/cleaned")] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_get_duplicate_list_transform_python.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py new file mode 100644 index 000000000..9ad8a32d7 --- /dev/null +++ b/transforms/universal/fdedup/python/test/test_signature_calc_transform_python.py @@ -0,0 +1,40 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from signature_calc_transform_python import ( + SignatureCalculationPythonTransformConfiguration, +) + + +class TestPythonSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = PythonTransformLauncher(SignatureCalculationPythonTransformConfiguration()) + fixtures = [(launcher, config, basedir + "/input/", basedir + "/expected/signature_calc/")] + return fixtures diff --git a/transforms/universal/fdedup/ray/Dockerfile b/transforms/universal/fdedup/ray/Dockerfile index 0b2e9cf1a..4bfe32a9e 100644 --- a/transforms/universal/fdedup/ray/Dockerfile +++ b/transforms/universal/fdedup/ray/Dockerfile @@ -1,5 +1,4 @@ -ARG BASE_IMAGE=docker.io/rayproject/ray:2.24.0-py310 - +ARG BASE_IMAGE=docker.io/rayproject/ray:2.36.1-py310 FROM ${BASE_IMAGE} RUN pip install --upgrade --no-cache-dir pip @@ -14,24 +13,31 @@ COPY --chown=ray:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] ## Copy the python version of the tansform +COPY --chown=ray:users python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . # Install ray project source COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml COPY --chown=ray:users README.md README.md -COPY --chown=ray:users images/ images/ +COPY --chown=ray:users requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
-# copy the main() entry point to the image -COPY ./src/fdedup_transform_ray.py . - -# copy some of the samples in -COPY src/fdedup_local_ray.py local/ +# copy source files needed by test-image +COPY --chown=ray:users ./src/fdedup_transform_ray.py fdedup_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_transform_ray.py signature_calc_transform_ray.py +COPY --chown=ray:users ./src/cluster_analysis_transform_ray.py cluster_analysis_transform_ray.py +COPY --chown=ray:users ./src/get_duplicate_list_transform_ray.py get_duplicate_list_transform_ray.py +COPY --chown=ray:users ./src/data_cleaning_transform_ray.py data_cleaning_transform_ray.py +COPY --chown=ray:users ./src/signature_calc_local_ray.py local/fdedup_local_ray.py # copy test COPY test/ test/ COPY test-data/ test-data/ +USER root +RUN chmod a+rwx /home/ray +USER ray # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/fdedup/ray/Makefile b/transforms/universal/fdedup/ray/Makefile index f5f06c3c3..ec193b6c3 100644 --- a/transforms/universal/fdedup/ray/Makefile +++ b/transforms/universal/fdedup/ray/Makefile @@ -43,7 +43,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/fdedup/ray/README.md b/transforms/universal/fdedup/ray/README.md index 41be44301..d93be3a4a 100644 --- a/transforms/universal/fdedup/ray/README.md +++ b/transforms/universal/fdedup/ray/README.md @@ -1,185 +1,45 @@ # Fuzzy Dedup -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +configuration, testing and IDE set up. ## Summary -The basic implementation of the fuzzy dedup is based on [MinHash](https://en.wikipedia.org/wiki/MinHash). Also see -[here](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) for more details. The architecture of the implementation is presented here: +This project wraps the [Fuzzy Dedup transform](../python) with a Ray runtime. -![](images/fuzzy.png) +## Configuration and command line Options -The main components of implementation are driver, processors (implemented as actor pools) - table processor, table -filter and bucket hash processor, and hash actors - minhash, buckets and docs. - -The complication of mapping this model to transform model is the fact that in this model assumes a two pass processing, -while a transform model is a single pass. The solution to this mismatch is to use transform runtime to implement the -first path and use the native transform pipeline to implement filtering. 
- -## Transform runtime -The [transform runtime](src/fdedup_transform_ray.py) is implementing complete first path of the fuzzy deduping: -* creates bucket and minhash collectors -* implements initial file processing to populate bucket and minhash caches -* creates doc collectors -* implement bucket processing -* Clean up everything except for doc collectors in preparation to filter, that is implemented by the framework proper -The main components of runtime are described below - -### TableProcessor Actor - -[Table processing actor](src/fdedup_transform_ray.py) is implemented following framework itself is implemented as a pair - -`FdedupTransform` implementing the actual transformation and and -[transform table processor](../../../../data-processing-lib/src/data_processing/runtime/ray/transform_table_processor.py) -(from the framework itself). - -### DocsMinHash Actor - -This [actor](src/fdedup_support.py) stores MInHashes - -### BucketsHash Actor - -This actor [actor](src/fdedup_support.py) - -### BucketHashProcessor - -BucketHash [actor](src/fdedup_support.py) implement the actual buckets processing, removing duplicates. -Implementation of this actor allows to better manage this "expensive" process, by using Actor pool load balancing -thus minimizing overall time for this operation. Instead of pre partitioning buckets, it is using dynamic load -partitioning. We also are processing "longest" buckets first thus further improving performance. To further improve -the overall performance we can in future implement bucket splitting - its faster to process more smaller buckets -then the long ones - -### BucketHashProcessor - -This [actor](src/fdedup_support.py) is queueing up requests to the `BucketHashProcessor` actor pool, which load -balances their execution - -### DocCollector Actor - -This [actor](src/fdedup_support.py) is a collector for unique documents - -## Transformer - -In the fuzzy dedup implementation, the [transformer](src/fdedup_transform_ray.py) only implements filtering. For every -table, it checks document ids with the `DocumentsCollector` cache and removes all of the rows which do not have ids in -the hash - -## Snapshotting - -Fuzzy dedup often runs on very large data sets and implements three very distinct phases: -* Building buckets -* Processing buckets -* Filtering data -To improve recoverability of fuzzy dedup, current implementation includes snapshotting - at the end of the first two -phases we snapshot the current state of execution - bucket and minhash actors after the first phase and document actors -after the second. This snapshotting provide code with the ability to restart from the existing snapshot. You can use one -of two configuration flags (assuming snapshots exist): -* `use_bucket_snapshot` to start from the second phase -* `use_doc_snapshot` to start from the third phase - -## Building - -A [docker file](Dockerfile) that can be used for building docker image. 
You can use - -```shell -make build to build it -``` - -### Configuration and command line Options - -The set of dictionary keys holding [BlockListTransform](src/blocklist_transform.py) -configuration for values are as follows: - -* _bucket_cpu_ - specifies number of CPUs for bucket actor -* _doc_cpu_ - specifies number of CPUs for doc actor -* _mhash_cpu_ - specifies number of CPUs for minhash actor -* _num_doc_actors_ - specifies number of doc actors -* _num_bucket_actors_ - specifies number of bucket actors -* _num_minhash_actors_ - specifies number of minhash actors -* _num_preprocessors_ - specifies number of preprocessors -* _num_permutations_ - specifies number of permutations -* _threshold_ - specifies threshold -* _shingles_size_ - specifies shingles size -* _japanese_data_ - specifies whether to use japanese specific document splitting -* _delimiters_ - specifies delimiter for non japanese document splitting -* _snapshot_delay_ - delay between different actors reading/writing snapshot not to overwhelm storage -* -use_bucket_snapshot_ - run from the existing buckets snapshot (bypass building buckets) -* -use_doc_snapshot_ - run from the existing docs snapshot (bypass building and processing buckets) - -Above you see both parameters and their values for small runs (tens of files). We also provide an -[estimate](src/cluster_estimator.py) to roughly determine cluster size for running transformer. +Fuzzy Dedup configuration and command line options are the same as for the base python transform. ## Running - - -### Launched Command Line Options +### Launched Command Line Options When running the transform with the Ray launcher (i.e. TransformLauncher), -the following command line arguments are available in addition to -[the options provided by the launcher](../../../../data-processing-lib/doc/ray-launcher-options.md). - -```shell - --fdedup_doc_column FDEDUP_DOC_COLUMN - document column name - --fdedup_id_column FDEDUP_ID_COLUMN - integer document id column name - --fdedup_cluster_column FDEDUP_CLUSTER_COLUMN - cluster column name - --fdedup_bucket_cpu FDEDUP_BUCKET_CPU - number of CPUs per bucket hash - --fdedup_mhash_cpu FDEDUP_MHASH_CPU - number of CPUs per minhash hash - --fdedup_doc_cpu FDEDUP_DOC_CPU - number of CPUs per doc hash - --fdedup_num_doc_actors FDEDUP_NUM_DOC_ACTORS - number of doc actors to use - --fdedup_num_minhash_actors FDEDUP_NUM_MINHASH_ACTORS - number of minhash actors to use - --fdedup_num_bucket_actors FDEDUP_NUM_BUCKET_ACTORS - number of bucket actors to use - --fdedup_num_preprocessors FDEDUP_NUM_PREPROCESSORS - number of preprocessors to use - --fdedup_num_permutations FDEDUP_NUM_PERMUTATIONS - number of permutations - --fdedup_threshold FDEDUP_THRESHOLD - threshold - --fdedup_shingles_size FDEDUP_SHINGLES_SIZE - number of words in shingle - --fdedup_delimiters FDEDUP_DELIMITERS - delimiter for splitting document - --fdedup_snapshot_delay FDEDUP_SNAPSHOT_DELAY - snapshot delay time - --fdedup_use_bucket_snapshot FDEDUP_USE_BUCKET_SNAPSHOT - flag to continue with bucket snapshot - --fdedup_use_doc_snapshot FDEDUP_USE_DOC_SNAPSHOT - flag to continue with doc snapshot - --fdedup_random_delay_limit FDEDUP_RANDOM_DELAY_LIMIT - maximum delay between read -``` - -These correspond to the configuration keys described above. +In addition to those available to the transform as defined in [here](../python/README.md), +the set of +[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. 
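As an illustration of how the transform options combine with the Ray launcher options, the sketch below condenses the `cluster_analysis_local_ray.py` sample added in this change set; paths and worker sizing are illustrative, and the `cluster_*` values match those used by the python tests:

```python
import sys

from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration
from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher

local_conf = {
    "input_folder": "test-data/expected/signature_calc/bands",  # illustrative path
    "output_folder": "output/docs_to_remove",                   # illustrative path
}
params = {
    # Ray launcher options
    "run_locally": True,
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "runtime_worker_options": ParamsUtils.convert_to_ast({"num_cpus": 0.8}),
    "runtime_num_workers": 3,
    # transform options, identical to the python runtime
    "cluster_num_bands": 14,
    "cluster_num_segments": 2,
    "cluster_jaccard_similarity_threshold": 0.7,
}
sys.argv = ParamsUtils.dict_to_req(d=params)
RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()).launch()
```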
### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/fdedup_transform_ray.py using command line args -* `run-local-sample` - runs src/fdedup_local_ray.py -* `run-s3-sample` - runs src/fdedup_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. +To run the samples, use the following `make` target to create a virtual environment: -For example, -```shell -make run-cli-sample -... +```commandline +make venv +``` +Subsequently, the main orchestration program can run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_ray.py ``` -Then +Alternatively the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_ray.py +python cluster_analysis_local_ray.py +python get_duplicate_list_local_ray.py +python data_cleaning_local_ray.py +``` +After running the transforms, execute: ```shell ls output ``` @@ -190,3 +50,18 @@ To see results of the transform. To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a ray runtime, use the following `make` targets. To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` \ No newline at end of file diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 923cbdf82..485d6de21 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,25 +1,21 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ - { name = "David Wood", email = "dawood@us.ibm.com" }, - { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, -] -dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "mmh3>=4.1.0", - "xxhash==3.4.1", - "tqdm==4.66.3", - "scipy>=1.12.0, <2.0.0" + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [project.optional-dependencies] dev = [ diff --git a/transforms/universal/fdedup/ray/requirements.txt b/transforms/universal/fdedup/ray/requirements.txt new file mode 100644 index 000000000..23e0a8b75 --- /dev/null +++ 
b/transforms/universal/fdedup/ray/requirements.txt @@ -0,0 +1,6 @@ +data-prep-toolkit[ray]>=0.2.3.dev0 +dpk_fdedup_transform_python==0.2.3.dev0 +mmh3>=4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py new file mode 100644 index 000000000..c54ba85c2 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_local_ray.py @@ -0,0 +1,53 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a0e8e7de2 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/cluster_analysis_transform_ray.py @@ -0,0 +1,74 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisRayRuntime(DefaultRayTransformRuntime): + """ + Cluster analysis runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/compute_shingles.py b/transforms/universal/fdedup/ray/src/compute_shingles.py deleted file mode 100644 index 2db75ebe2..000000000 --- a/transforms/universal/fdedup/ray/src/compute_shingles.py +++ /dev/null @@ -1,50 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import string - - -""" -This implements the most simplistic splitting of document based on the white spaces -that can be overwritten by a different document splitter (tokenizer). 
This method is -build in the library and can be overwritten using approach described at -https://stackoverflow.com/questions/37553545/how-do-i-override-a-function-of-a-python-library - -import compute_shingles -compute_shingles.compute_shingles = my_local_compute_shingles -""" - - -def _find(s: str, ch: str) -> list[int]: - """ - Get indexes of all locations of character in string - :param s: string - :param ch: character - :return: list of locations - """ - return [i for i, ltr in enumerate(s) if ltr == ch] - - -def compute_shingles(txt: str, word_shingle_size: int, delimiter: str = " ") -> list[str]: - """ - Generate word shingles - :param txt: document - :param delimiter: delimiter to split document - :param word_shingle_size: size of shingle in words - :return: list of shingles - """ - text = txt.replace("\n", "").lower().translate(str.maketrans("", "", string.punctuation)) - separators = _find(text, delimiter) - if len(separators) + 1 <= word_shingle_size: - return [text] - bounds = [-1] + separators + [len(text)] - return [text[bounds[i] + 1 : bounds[i + word_shingle_size]] for i in range(0, len(bounds) - word_shingle_size)] diff --git a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py similarity index 59% rename from transforms/universal/fdedup/ray/src/fdedup_local_ray.py rename to transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py index af7bec71c..b951e2fc8 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_local_ray.py +++ b/transforms/universal/fdedup/ray/src/data_cleaning_local_ray.py @@ -13,59 +13,57 @@ import os import sys +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data/input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../output")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, } +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) worker_options = {"num_cpus": 0.8} + code_location = {"github": "github", "commit_hash": "12345", "path": "path"} params = { # where to run "run_locally": True, # Data access. 
Only required parameters are specified "data_local_config": ParamsUtils.convert_to_ast(local_conf), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 1, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info "runtime_pipeline_id": "pipeline_id", "runtime_job_id": "job_id", "runtime_creation_delay": 0, "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, } -sys.argv = ParamsUtils.dict_to_req(d=params) -# launch -launcher.launch() + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py new file mode 100644 index 000000000..88171e260 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/data_cleaning_transform_ray.py @@ -0,0 +1,138 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +import ray +from data_cleaning_transform import ( + DataCleaningTransform, + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, +) +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from ray.actor import ActorHandle + + +logger = get_logger(__name__) + + +class DataCleaningRayTransform(DataCleaningTransform): + """ """ + + def __init__(self, config: dict): + """ + Initialize based on the dictionary of configuration information. 
+ This is generally called with configuration parsed from the CLI arguments defined + by the companion runtime, DataCleaningRuntime, which loads the list of duplicate documents + into the Ray object store and passes its reference to this class via the configuration. + """ + docs2removedf = config.get("df", None) + if docs2removedf is not None: + # This is the recommended approach for production. The duplicate list is built by the + # runtime once, loaded into the object store and accessed by the actors without additional reads + try: + config["df"] = ray.get(config.get("df")) + except Exception as e: + logger.warning(f"Exception loading docs2remove list from ray object storage {e}") + raise RuntimeError(f"exception loading from object storage for key {docs2removedf}") + super().__init__(config) + + +class DataCleaningRuntime(DefaultRayTransformRuntime): + """ + Data cleaning runtime support + """ + + def __init__(self, params: dict[str, Any]): + """ + Create data cleaning runtime + :param params: parameters, that should include + dataclean_data_access_key: data access used to read the duplicate list + dataclean_data_factory_key: data access factory used to read the duplicate list + duplicate_list_location_key: location of the consolidated list of duplicate document ids + """ + super().__init__(params) + from data_processing.utils import get_logger + + self.logger = get_logger(__name__) + + def get_transform_config( + self, + data_access_factory: DataAccessFactoryBase, + statistics: ActorHandle, + files: list[str], + ) -> dict[str, Any]: + """ + Set environment for data cleaning execution + :param data_access_factory - data access factory + :param statistics - reference to the statistics object + :param files - list of files to process + :return: dictionary of data cleaning init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + docs_to_remove_list = ray.put(duplicate_list) + return {"df": docs_to_remove_list} | self.params + + +class DataCleaningRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformRuntimeConfiguration for the fuzzy dedup data cleaning transform + as required by the RayTransformLauncher, wiring DataCleaningRayTransform to the + DataCleaningRuntime.
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(transform_class=DataCleaningRayTransform), + runtime_class=DataCleaningRuntime, + ) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + logger.info("Launching transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py b/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py deleted file mode 100644 index 285fcfa22..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_s3_ray.py +++ /dev/null @@ -1,76 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -import sys - -from data_processing.utils import ParamsUtils -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -# create launcher -launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) -# create parameters -s3_cred = { - "access_key": "localminioaccesskey", - "secret_key": "localminiosecretkey", - "url": "http://localhost:9000", -} - -s3_conf = { - "input_folder": "test/fdedup/input", - "output_folder": "test/fdedup/output", -} -worker_options = {"num_cpus": 0.8} -code_location = {"github": "github", "commit_hash": "12345", "path": "path"} -params = { - # where to run - "run_locally": True, - # Data access. 
Only required parameters are specified - "data_s3_config": ParamsUtils.convert_to_ast(s3_conf), - "data_s3_cred": ParamsUtils.convert_to_ast(s3_cred), - # Orchestration parameters - "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), - "runtime_num_workers": 5, - "runtime_pipeline_id": "pipeline_id", - "runtime_job_id": "job_id", - "runtime_creation_delay": 0, - "runtime_code_location": ParamsUtils.convert_to_ast(code_location), - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 2, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 2, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, -} -sys.argv = ParamsUtils.dict_to_req(d=params) - - -# launch -launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/fdedup_support.py b/transforms/universal/fdedup/ray/src/fdedup_support.py deleted file mode 100644 index 60afb84bf..000000000 --- a/transforms/universal/fdedup/ray/src/fdedup_support.py +++ /dev/null @@ -1,621 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import pickle -import time -from typing import Any, Iterator, Union - -import numpy as np -import ray -from data_processing.data_access import SnapshotUtils -from data_processing.utils import GB, RANDOM_SEED, TransformUtils, get_logger -from data_processing_ray.runtime.ray import RayUtils -from ray.actor import ActorHandle -from ray.util import ActorPool -from scipy.integrate import quad as integrate - - -NO_SIMILARITY = -1 -REQUEST_LEN = 4096 -LONG_BUCKET = 5000 -LONG_BUCKET_PRINT = 1000 - - -def fuzzy_optimal_param( - threshold: float, - num_perm: int, - false_positive_weight: float, - false_negative_weight: float, -) -> tuple[int, int]: - """ - Computes parameters for fuzzy dedup - :param threshold: filtering threshold - :param num_perm: number of permutations - :param false_positive_weight: false positive weight - :param false_negative_weight: false negative weight - :return: number of buckets and bucket length - """ - - def _false_positive_probability(ths: float, b: int, r: int) -> float: - """ - Compute false positive probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - s ** float(r)) ** float(b) - a, err = integrate(_probability, 0.0, ths) - return a - - def _false_negative_probability(ths: float, b: int, r: int) -> float: - """ - Compute false negative probability - :param ths: filtering threshold - :param b: permutation - :param r: rel permutation - :return: probability - """ - _probability = lambda s: 1 - (1 - (1 - s ** float(r)) ** float(b)) - a, err = integrate(_probability, ths, 1.0) - return a - - min_error = float("inf") - opt = (0, 0) - for perm in range(1, num_perm + 1): - max_r = int(num_perm / perm) - for rel in range(1, max_r + 1): - fp = _false_positive_probability(threshold, perm, rel) - fn = _false_negative_probability(threshold, perm, rel) - error = fp * false_positive_weight + fn * false_negative_weight - if error < min_error: - min_error = error - opt = (perm, rel) - return opt - - -class MurmurMH: - def __init__(self, num_perm: int, seed: int = RANDOM_SEED): - self.seed = seed - self.num_perm = num_perm - self.permutations = self._init_permutations(seed, num_perm) - - def minhash(self, shingle_count: int, shingles: Iterator[str]) -> np.array: - def generator(): - for shingle in shingles: - yield TransformUtils.str_to_int(shingle) - - hash_values = np.fromiter(generator(), dtype=np.uint64, count=shingle_count) - - result = np.zeros(self.permutations.shape, dtype=np.uint32) - for i, perm in enumerate(self.permutations): - result[i] = np.right_shift((perm * hash_values).T, 32).astype(np.uint32).min(axis=0, keepdims=False) - return result - - @staticmethod - def _init_permutations(seed: int, num_perm: int) -> np.array: - # see https://en.wikipedia.org/wiki/Universal_hashing#Avoiding_modular_arithmetic - max_int = np.uint64((1 << 64) - 1) - gen = np.random.RandomState(seed) - # get self.num_perm pseudo random numbers between 2 and max_int (excl) - permutations = np.array([gen.randint(0, max_int, dtype=np.uint64) for _ in range(num_perm)], dtype=np.uint64).T - # make all even pseudo random numbers odd by adding 1 - permutations[permutations % 2 == 0] += 1 - return permutations - - @staticmethod - def jaccard(mh1: np.array, mh2: np.array) -> float: - return np.count_nonzero(mh1 == mh2) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocCollector: - """ - An actor 
collecting de duped document IDs - """ - - def __init__(self, params: dict[str, Any]): - """ - Initializer - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - self.removed = set() - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.ids = {} - else: - try: - bids, _ = self.data_access.get_file(snapshot) - self.ids = pickle.loads(bids) - except Exception as e: - self.logger.warning(f"Failed to load doc collector {self.actor_id} with exception {e}") - raise e - - def add_documents(self, dr: tuple[list[tuple[int, int]], list[int]]) -> None: - """ - Add documents and removed document - :param dr: documents to keep and documents to remove - :return: - """ - docs = dr[0] - rm = dr[1] - # process documents to remove - for did in rm: - self.ids.pop(did, None) - self.removed.update(rm) - # process documents to keep - for key, val in docs: - if key in self.removed: - continue - if key in self.ids and val == NO_SIMILARITY: - # Do not update existing docs with NO_SIMILARITY - continue - else: - self.ids[key] = val - - def filter(self, docs: list[int]) -> dict[int, int]: - """ - Filter documents - :param docs: documents to filter - :return: documents to keep - """ - result = {} - for doc_id in docs: - r = self.ids.get(doc_id, None) - if r is not None: - result[doc_id] = r - return result - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.ids) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}docs/doc_collector_{self.actor_id}", b_doc - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot doc collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float, int, float]: - """ - get sizes - :return: number of ids, its memory utilization, number of removed, its memory utilization - """ - return ( - len(self.ids), - TransformUtils.deep_get_size(self.ids) / GB, - len(self.removed), - TransformUtils.deep_get_size(self.removed) / GB, - ) - - -@ray.remote(scheduling_strategy="SPREAD") -class DocsMinHash: - """ - An actor storing min hashes for a doc id - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialize - :param params: parameters - """ - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.docs = {} - else: - try: - bdocs, _ = self.data_access.get_file(snapshot) - self.docs = pickle.loads(bdocs) - except Exception as e: - self.logger.warning(f"Failed to load minhash collector {self.actor_id} with exception {e}") - raise e - - def add_minhashes(self, updates: list[tuple[int, int, np.array]]) -> None: - """ - Add minhashes - :param updates: minhash for doc_id a tuple of doc len and array of hashes - :return: None - """ - for doc_id, length, minhash in updates: - self.docs[doc_id] = np.concatenate(([length], minhash)) - - def get_minhashes(self, doc_ids: list[int]) -> list[tuple[int, int, np.array]]: - """ - Get minhashes for a list of documents - :param doc_ids: list of doc ids - :return: doc id, len, minhashes - """ - result = [] - for doc_id in doc_ids: - info = self.docs.get(doc_id) - if info is not None: - result.append((doc_id, info[0], info[1:])) - return result - - def snapshot(self) -> None: - 
""" - Snapshotting itself - """ - try: - b_doc = pickle.dumps(self.docs) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}minhash/minhash_collector_{self.actor_id}", - b_doc, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot minhash collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get size of used min hashes - :return: number of docs, its memory utilization - """ - return len(self.docs), TransformUtils.deep_get_size(self.docs) / GB - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHash: - """ - Actor storing buckets information - """ - - def __init__(self, params: dict[str, Any]): - """ - Initialization - """ - from ray.util.metrics import Counter - - self.submitter = None - self.n_buckets = 0 - self.bucket_memory = 0 - self.logger = get_logger(__name__) - self.actor_id = params.get("id") - data_access_factory = params.get("data_access") - self.data_access = data_access_factory.create_data_access() - snapshot = params.get("snapshot", None) - if snapshot is None: - self.buckets = {} - else: - try: - b_buckets, _ = self.data_access.get_file(snapshot) - self.buckets = pickle.loads(b_buckets) - except Exception as e: - self.logger.warning(f"Failed to load buckets collector {self.actor_id} with exception {e}") - raise e - self.bucket_created_counter = Counter("bucket_created", "Amount of buckets created") - self.long_bucket_submit_counter = Counter("long_bucket_submitted", "Amount of long buckets submitted") - self.short_bucket_submit_counter = Counter("short_bucket_submitted", "Amount of short buckets submitted") - - def add_buckets(self, bck: list[tuple[int, list[int]]]) -> None: - """ - Add additional buckets to hash - :param bck: bucket information - :return: None - """ - for bucket in bck: - b_hash = bucket[0] - buckets_for_hash = self.buckets.get(b_hash) - if buckets_for_hash: - if type(buckets_for_hash) == int: - self.buckets[b_hash] = [buckets_for_hash] + bucket[1] - else: - buckets_for_hash.extend(bucket[1]) - else: - if len(bucket[1]) == 1: - self.buckets[b_hash] = bucket[1][0] - else: - self.buckets[b_hash] = bucket[1] - self.bucket_created_counter.inc(1) - - def add_processing_submitter(self, submitter: ActorHandle) -> None: - """ - Add process submitter - :param submitter: reference to submitter - :return: - """ - self.submitter = submitter - - def process_buckets(self) -> None: - """ - Process buckets to generate documents - :return: None - """ - - # Remember usage - self.n_buckets = len(self.buckets) - self.bucket_memory = TransformUtils.deep_get_size(self.buckets) / GB - - # split buckets into short and long. 
Long buckets can take very long to process - long_buckets = [] - short_buckets = [] - while len(self.buckets) > 0: - doc_id, bucket = self.buckets.popitem() - if type(bucket) == list and len(bucket) > LONG_BUCKET: - # Its long - long_buckets.append(bucket) - else: - short_buckets.append(bucket) - self.logger.info(f"processing buckets {len(long_buckets)} long, {len(short_buckets)} short") - - # process long buckets first - we are submitting them one at a time - for bucket in long_buckets: - if len(bucket) > 2 * LONG_BUCKET: - # For very long buckets, split them - self.logger.info(f"Splitting bucket of length len(bucket) into chunks") - smaller_bucket = [ - bucket[i * LONG_BUCKET : (i + 1) * LONG_BUCKET] - for i in range((len(bucket) + LONG_BUCKET - 1) // LONG_BUCKET) - ] - for b in smaller_bucket: - ray.get(self.submitter.submit_for_processing.remote([b])) - self.long_bucket_submit_counter.inc(1) - else: - ray.get(self.submitter.submit_for_processing.remote([bucket])) - self.long_bucket_submit_counter.inc(1) - self.logger.info("Done submitting long buckets") - - # And now the rest of buckets - bucket_chunks = [short_buckets[i * 100 : (i + 1) * 100] for i in range((len(short_buckets) + 99) // 100)] - for b in bucket_chunks: - ray.get(self.submitter.submit_for_processing.remote(b)) - self.short_bucket_submit_counter.inc(len(b)) - - def snapshot(self) -> None: - """ - Snapshotting itself - """ - try: - b_buckets = pickle.dumps(self.buckets) - self.data_access.save_file( - f"{SnapshotUtils.get_snapshot_folder(self.data_access)}buckets/buckets_collector_{self.actor_id}", - b_buckets, - ) - except Exception as e: - self.logger.warning(f"Failed to snapshot buckets collector {self.actor_id} with exception {e}") - raise e - - def get_size(self) -> tuple[int, float]: - """ - Get buckets resource utilization - :return: number of buckets and memory utilization - """ - return self.n_buckets, self.bucket_memory - - -@ray.remote(scheduling_strategy="SPREAD") -class BucketsHashProcessor: - """ - Actor for processing buckets - """ - - def __init__(self, params: dict[str, Any]): - """ - Init method - :param params - dictionary of parameters containing the following keys - remote_docs - handles to the remote docs - remote_minhashes - handles to the remote minhashes - mn_min_hash - MurmurMH class - threshold - threshold - statistics - statistics actor - """ - from ray.util.metrics import Counter - - self.threshold = params["threshold"] - self.mn_min_hash = params["mn_min_hash"] - self.remote_docs = params["remote_docs"] - self.remote_minhashes = params["remote_minhashes"] - self.stats = params["statistics"] - self.logger = get_logger(__name__) - self.bucket_processed_counter = Counter("bucket_processed", "Amount of buckets processed") - - def _submit_generated_docs(self, docs: dict[int, int], removed: set[int]) -> None: - """ - Submit generated documents - :param docs: docs to submit - :param removed: removed documents - :return: None - """ - # Remove doc ids that are already removed - for did in removed: - docs.pop(did, None) - # Build remote requests - request = [([], []) for _ in range(len(self.remote_docs))] - for key, value in docs.items(): - req_tuple = request[key % len(self.remote_docs)] - req_tuple[0].append((key, value)) - for did in removed: - req_tuple = request[did % len(self.remote_docs)] - req_tuple[1].append(did) - # Submit requests and wait for replies - remote_replies = [] - i = 0 - for req in request: - if len(req[0]) > 0 or len(req[1]) > 0: # Only submit if the request has data - 
remote_replies.append(self.remote_docs[i].add_documents.remote(req)) - i += 1 - # Process replies - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - # get minhashes and length for docs in the bucket - def _get_minhashes_docs(self, doc_ids: list[int]) -> dict[int, tuple[int, list[int]]]: - """ - Get minhashes for documents by submitting requests to an appropriate doc collectors - :param doc_ids: doc ids - :return: doc ids with hashes - """ - request = [[] for _ in range(len(self.remote_minhashes))] - for value in doc_ids: - request[value % len(self.remote_minhashes)].append(value) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.remote_minhashes[i].get_minhashes.remote(req)) - i += 1 - # Process replies - hashes = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - for r in reply: - hashes[r[0]] = (r[1], r[2]) - remote_replies = not_ready - return hashes - - def process_buckets(self, buckets: list[Union[int, list[int]]]) -> None: - """ - process buckets to generate documents - :param buckets: buckets - :return: none - """ - t_start = time.time() - docs = {} - removed = set() - for bucket in buckets: - if type(bucket) == int: - # This hash has a single document - if bucket not in docs: - docs[bucket] = NO_SIMILARITY - self.bucket_processed_counter.inc(1) - continue - # multiple documents - start = time.time() - bucket_len = len(bucket) - very_long = bucket_len > LONG_BUCKET - - hashes = self._get_minhashes_docs(bucket) - set_list = [] - unvisited = set(bucket) - - # combine similar documents - index = 0 - while len(unvisited) > 0: - current_doc_id = unvisited.pop() - current_mh = hashes[current_doc_id][1] - current_set = set() - for other_doc_id in bucket: - if other_doc_id in unvisited: - other_mh = hashes[other_doc_id][1] - if self.mn_min_hash.jaccard(current_mh, other_mh) >= self.threshold: - current_set.add(current_doc_id) - current_set.add(other_doc_id) - unvisited.discard(other_doc_id) - if len(current_set) > 0: - set_list.append(current_set) - index += 1 - if index % LONG_BUCKET_PRINT == 0: - self.logger.info(f"processing very long {bucket_len} bucket, {index} documents so far") - if index > LONG_BUCKET_PRINT: - self.logger.info(f"done processing very long {bucket_len}") - - # process created sets - for current_set in set_list: - for d in current_set: - bucket.remove(d) - removed.update(current_set) - for i, doc_id in enumerate(current_set): - if i == 0: - cluster_id = doc_id - remaining = doc_id - min_len = hashes[doc_id][0] - max_len = min_len - continue - c_len = hashes[doc_id][0] - if c_len > max_len: - max_len = c_len - remaining = doc_id - continue - if c_len <= min_len: - min_len = c_len - cluster_id = doc_id - docs[remaining] = cluster_id - removed.discard(remaining) - - # if we did not find docs in connections, submit them as NO_SIMILARITY - for d in bucket: - if d not in docs: - docs[d] = NO_SIMILARITY - if very_long: - self.logger.info( - f"Processed long ({bucket_len}) bucket in {round((time.time() - start) / 60.,3)} " - f"min; " - f"docs chains {len(set_list)}" - ) - self.bucket_processed_counter.inc(1) - # Submit docs - self._submit_generated_docs(docs, removed) - # peg stats - self.stats.add_stats.remote({"generated doc_ids": len(docs), "bucket processing time": time.time() - t_start}) - - -@ray.remote(scheduling_strategy="SPREAD") -class 
BucketsHashProcessorInvoker(object): - """ - Bucket hash processing coordinator (singleton) - """ - - def __init__(self, bucket_processors: list[ActorHandle]) -> None: - self.n_processors = len(bucket_processors) - self.pool = ActorPool(bucket_processors) - self.submitted = 0 - self.processed = 0 - self.logger = get_logger(__name__) - self.start = time.time() - - def submit_for_processing(self, buckets: list[Union[int, list[int]]]) -> None: - # Get completed results - if self.submitted < self.n_processors: # still have room - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.logger.debug("Submitted bucket processing request") - self.submitted += 1 - return - else: - while True: - # we can have several workers fail here - try: - self.pool.get_next_unordered() - break - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") - self.logger.debug("Completed bucket processing request") - self.pool.submit(lambda a, v: a.process_buckets.remote(v), buckets) - self.submitted += 1 - self.logger.debug("Submitted bucket processing request") - return - - def wait_for_completion(self) -> None: - self.logger.info(f"Waiting bucket processing completion. Submitted requests {self.submitted}") - while self.pool.has_next(): - try: - self.pool.get_next_unordered() - except Exception as e: - self.logger.error(f"Failed to process request worker exception {e}") - self.processed += 1 - if self.processed % 100 == 0: - self.logger.info(f"processed {self.processed} buckets in {(time.time() - self.start)/60} min") diff --git a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py index 6c6c02bb3..be1bf5fcb 100644 --- a/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py +++ b/transforms/universal/fdedup/ray/src/fdedup_transform_ray.py @@ -10,794 +10,67 @@ # limitations under the License. 
################################################################################ -import random -import time -from argparse import ArgumentParser, Namespace -from typing import Any - -import mmh3 -import numpy as np -import pyarrow as pa -import ray -from data_processing.data_access import DataAccessFactoryBase, SnapshotUtils -from data_processing.transform import AbstractTableTransform, TransformConfiguration -from data_processing.utils import ( - RANDOM_SEED, - CLIArgumentProvider, - TransformUtils, - str2bool, -) -from data_processing_ray.runtime.ray import ( - DefaultRayTransformRuntime, - RayTransformFileProcessor, - RayTransformLauncher, - RayUtils, +import argparse +import os +import sys + +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from fdedup_transform_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, ) -from data_processing_ray.runtime.ray.runtime_configuration import ( - RayTransformRuntimeConfiguration, +from get_duplicate_list_transform_ray import ( + GetDuplicateListRayRuntime, + GetDuplicateListRayTransformConfiguration, ) -from fdedup_support import ( - REQUEST_LEN, - BucketsHash, - BucketsHashProcessor, - BucketsHashProcessorInvoker, - DocCollector, - DocsMinHash, - MurmurMH, - fuzzy_optimal_param, -) -from ray.actor import ActorHandle -from ray.util import ActorPool - - -short_name = "fdedup" -cli_prefix = f"{short_name}_" - - -class FdedupTransform(AbstractTableTransform): - """ - Implements fuzzy dedup data preprocessor (building tables and minhashes). - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. 
- :param config: initialization parameters, with the following keys - doc_column - name of doc column - doc_id_int_column - name of int doc id column - word_shingle_size - word shingle size - mn_min_hash - MurmurMH class - num_bands - number of bands - length_band band length - remote_buckets - bucket actors - remote_minhashes - minhash actors - delimiter - delimiter - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.word_shingle_size = config.get("word_shingle_size", 1) - self.delimiter = config.get("delimiter", " ") - self.mn_min_hash = config.get("mn_min_hash", None) - self.num_bands = config.get("num_bands", 1) - self.length_band = config.get("length_band", 1) - self.buckets = config.get("remote_buckets", []) - self.minhashes = config.get("remote_minhashes", []) - self.random_delay_limit = config.get("random_delay_limit", 10) - - def _generate_minhashes(self, shingles: list[str]) -> np.array: - """ - Generate minhashes - :param shingles: - :return: generated minhashes - """ - min_hashes = self.mn_min_hash.minhash(len(shingles), shingles) - num_min_hashes = len(min_hashes) - assert self.num_bands * self.length_band <= num_min_hashes, ( - f"num_bans*band_len must be <= num min hashes, was num_bands={self.num_bands}, " - f"bands_len={self.length_band}, num_min hashes={num_min_hashes}" - ) - return min_hashes - - def _generate_buckets(self, min_hashes: np.array) -> list[int]: - """ - Generate buckets - :param min_hashes: array of minhashes - :return: - """ - return [ - mmh3.hash64(min_hashes[i * self.length_band : (i + 1) * self.length_band], seed=RANDOM_SEED, signed=False)[ - 0 - ] - for i in range(self.num_bands) - ] - - def _submit_buckets_minhashes( - self, buckets: dict[int, list[int]], minhashes: list[tuple[int, int, np.array]] - ) -> None: - """ - Submit buckets to hash - :param buckets: buckets - :param minhashes: minhashes - :return: None - """ - # bucket requests - request = [[] for _ in range(len(self.buckets))] - for key, value in buckets.items(): - request[key % len(self.buckets)].append((key, value)) - # Submit requests to appropriate bucket collectors - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.buckets[i].add_buckets.remote(req)) - i += 1 - # Minhashes - request = [[] for _ in range(len(self.minhashes))] - for minh in minhashes: - request[minh[0] % len(self.minhashes)].append(minh) - # Submit requests to appropriate minhash collectors - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.minhashes[i].add_minhashes.remote(req)) - i += 1 - # wait for completion - RayUtils.wait_for_execution_completion(logger=self.logger, replies=remote_replies) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - Preprocessing table content. 
- :param table: table - :param file_name - name of currently processed file - :return: resulting table, statistics - """ - from compute_shingles import compute_shingles - - def flush(limit: int) -> None: - """ - flushing buckets and minhashes to dedicated actors - :param limit: number of buckets to flush - :return: None - """ - if len(buckets) >= limit: # time to submit - nonlocal num_buckets - nonlocal num_minhashes - self._submit_buckets_minhashes(buckets, minhashes) - num_buckets = num_buckets + len(buckets) - num_minhashes = num_minhashes + len(minhashes) - buckets.clear() - minhashes.clear() - - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # Inner variables - buckets = {} - minhashes = [] - num_buckets = 0 - num_minhashes = 0 - docs = table[self.doc_column] - doc_ids = table[self.doc_id_column] - # for every document/its integer id - for n in range(table.num_rows): - doc = docs[n].as_py() - doc_id = doc_ids[n].as_py() - shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter) - if len(shingles) > 0: - mh = self._generate_minhashes(shingles) - minhashes.append((doc_id, len(doc), mh)) - candidates = self._generate_buckets(mh) - - for b_hash in candidates: - bucket_array = buckets.get(b_hash) - if bucket_array is None: - buckets[b_hash] = [doc_id] - else: - bucket_array.append(doc_id) - flush(REQUEST_LEN) - flush(0) - # peg stats - stats = {"generated buckets": num_buckets, "generated minhashes": num_minhashes} - time.sleep(int(random.random() * self.random_delay_limit)) - return [], stats - - -class FdedupFilter(AbstractTableTransform): - """ - Filtering documents - """ - - def __init__(self, config: dict): - """ - Initialize based on the dictionary of configuration information. - The dictionary should contain the following: - doc_column - name of doc column - doc_id_int_column - name of int doc id column - cluster_column - name of the cluster column - remote_docs - list of remote doc collectors - random_delay_limit - random delay limit - """ - super().__init__(config) - self.doc_column = config.get("doc_column", "") - self.doc_id_column = config.get("doc_id_int_column", "") - self.cluster_column = config.get("cluster_column", "") - self.docs = config.get("remote_docs", "") - self.random_delay_limit = config.get("random_delay_limit", 10) - - def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]: - """ - De duping (filtering) table content. 
- :param table: table - :param file_name: name of the currently processing file - :return: resulting table, statistics - """ - # make sure that the doc column exists - TransformUtils.validate_columns(table=table, required=[self.doc_column, self.doc_id_column]) - # inner variables - ids = table.column(self.doc_id_column) - # Submit requests to an appropriate doc collectors - request = [[] for _ in range(len(self.docs))] - for value in ids: - doc_id = value.as_py() - request[doc_id % len(self.docs)].append(doc_id) - remote_replies = [] - i = 0 - for req in request: - if len(req) > 0: # Only submit if the length is greater then 0 - remote_replies.append(self.docs[i].filter.remote(req)) - i += 1 - # Process replies - unique = {} - while remote_replies: - # Wait for replies - ready, not_ready = ray.wait(remote_replies) - reply = ray.get(ready)[0] - unique.update(reply) - remote_replies = not_ready - # Filter out table - mask = [] - clusters = [] - # Actual filtering - for n in range(table.num_rows): - doc_id = ids[n].as_py() - if doc_id in unique: - mask.append(True) - clusters.append(unique.pop(doc_id)) - else: - mask.append(False) - # build out table - out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters) - # build execution statistics - stats = {"source_documents": table.num_rows, "result_documents": out_table.num_rows} - time.sleep(int(random.random() * self.random_delay_limit)) - return [out_table], stats - - -class FdedupRuntime(DefaultRayTransformRuntime): - """ - Fuzzy dedup runtime support. Here we are using set environment to implement first two steps of fuzzy dedup - processing - preprocessing and bucket hash processing - """ - - def __init__(self, params: dict[str, Any]): - """ - Create filter runtime - :param params: parameters, that should include - doc_column - name of the document column - id_column - name of the integer doc id column - cluster_column - name of the cluster column - worker_options - start options for preprocessor - from the orchestrator configuration - bucket_cpu - number of cpus for bucket actor - doc_cpu - number of cpus for doc actor - mhash_cpu - number of cpus for minhash actor - num_doc_actors - number of document actors - num_bucket_actors - number of bucket actors - num_minhash_actors - number of minhash actors - num_preprocessors - number of preprocessors - snapshot_delay - delay (sec) in sending snapshot requests to actors - use_bucket_snapshot - use bucket snapshot - use_doc_snapshot - use doc snapshot - random_delay_limit - random_delay limit - # fuzzy specific parameters - num_permutations - number of permutations - threshold - threshold - world_shingle_size - word shingles size - delimiters - delimiter - """ - from data_processing.utils import get_logger - - super().__init__(params) - self.logger = get_logger(__name__) - self.sum_buckets = 0 - self.sum_buckets_mem = 0 - self.sum_mh = 0 - self.sum_mh_mem = 0 - self.document_collectors = [] - self.snapshot_delay = self.params.get("snapshot_delay", 1) - self.random_delay_limit = self.params.get("random_delay_limit", 10) - - def get_transform_config( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> dict[str, Any]: - """ - Set environment for filter execution - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: dictionary of filter init params - """ - if self.params.get("use_doc_snapshot", 
False): - self.logger.info("continuing from the document actors snapshot") - data_access = data_access_factory.create_data_access() - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}docs" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.info(f"Found the following snapshot files {files.keys()}") - self.document_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - self.document_collectors[i] = DocCollector.options( - **{"num_cpus": self.params.get("doc_cpu", 0.5)} - ).remote({"id": i, "data_access": data_access_factory, "snapshot": file}) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(self.document_collectors)} document collectors to continue processing") - else: - self.logger.info("starting run from the beginning") - self._create_doc_actors(data_access_factory=data_access_factory, statistics=statistics, files=files) - return { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "cluster_column": self.params.get("cluster_column", ""), - "remote_docs": self.document_collectors, - "random_delay_limit": self.random_delay_limit, - } - - def _create_doc_actors( - self, data_access_factory: DataAccessFactoryBase, statistics: ActorHandle, files: list[str] - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param files - list of files to process - :return: None - """ - mn_min_hash = MurmurMH(num_perm=self.params.get("num_permutations", 64), seed=RANDOM_SEED) - if self.params.get("use_bucket_snapshot", False): - self.logger.info("continuing from the bucket actors snapshot") - data_access = data_access_factory.create_data_access() - # recreate bucket collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}buckets" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following bucket snapshot files {files.keys()}") - bucket_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self.logger.info(f"Created {len(bucket_collectors)} bucket collectors to continue processing") - # recreate minhash collectors - path = f"{SnapshotUtils.get_snapshot_folder(data_access)}minhash" - files, retries = data_access.get_folder_files(path=path) - if retries > 0: - statistics.add_stats.remote({"data access retries": retries}) - self.logger.debug(f"Found the following minhash snapshot files {files.keys()}") - minhash_collectors = [None] * len(files) - for file in files.keys(): - i = int(file[file.rfind("_") + 1 :]) - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory, "snapshot": file} - ) - time.sleep(self.snapshot_delay) - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - self.logger.info(f"Created {len(minhash_collectors)} minhash collectors to 
continue processing") - else: - self.logger.info("continuing from the very beginning") - self._create_doc_actors_internal( - data_access_factory=data_access_factory, statistics=statistics, mn_min_hash=mn_min_hash, files=files - ) - - def _create_doc_actors_internal( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - mn_min_hash: MurmurMH, - files: list[str], - ) -> None: - """ - Create document actors - :param data_access_factory - data access factory - :param statistics - reference to the statistics object - :param mn_min_hash - MurmurMH class - :param files - list of files to process - :return: None - """ - # compute fuzzy dedup parameters - num_buckets, length_bucket = fuzzy_optimal_param( - threshold=self.params.get("threshold", 0.8), - num_perm=self.params.get("num_permutations", 64), - false_positive_weight=0.5, - false_negative_weight=0.5, - ) - self.logger.info(f"Fuzzy: num buckets {num_buckets}, bucket length {length_bucket}") - # Build bucket and minhash collectors - bucket_collectors = [None] * self.params.get("num_bucket_actors", 1) - for i in range(self.params.get("num_bucket_actors", 1)): - bucket_collectors[i] = BucketsHash.options(**{"num_cpus": self.params.get("bucket_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(bucket_collectors)} bucket actors") - minhash_collectors = [None] * self.params.get("num_minhash_actors", 1) - for i in range(self.params.get("num_minhash_actors", 1)): - minhash_collectors[i] = DocsMinHash.options(**{"num_cpus": self.params.get("mhash_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(minhash_collectors)} minhash actors") - self._preprocess_tables( - data_access_factory=data_access_factory, - statistics=statistics, - files=files, - mn_min_hash=mn_min_hash, - num_buckets=num_buckets, - length_bucket=length_bucket, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - random_delay_limit=self.random_delay_limit, - ) - # At this point we can snapshot both bucket and minhash collectors for potential restart - self.logger.info("creating minhash snapshots") - minhash_replies = [None] * len(minhash_collectors) - index = 0 - for collector in minhash_collectors: - minhash_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while minhash_replies: - ready, not_ready = ray.wait(minhash_replies) - minhash_replies = not_ready - self.logger.info("minhash snapshots created") - self.logger.info("creating bucket snapshots") - bucket_replies = [None] * len(bucket_collectors) - index = 0 - for collector in bucket_collectors: - bucket_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while bucket_replies: - ready, not_ready = ray.wait(bucket_replies) - bucket_replies = not_ready - self.logger.info("bucket snapshots created") - self._process_buckets( - data_access_factory=data_access_factory, - statistics=statistics, - bucket_collectors=bucket_collectors, - minhash_collectors=minhash_collectors, - mn_min_hash=mn_min_hash, - ) - - def _process_buckets( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - mn_min_hash: MurmurMH, - ) -> None: - """ - Process buckets - :param data_access_factory - data access factory - :param statistics - statistics actor - :param bucket_collectors - bucket collectors - 
:param minhash_collectors - minhash collectors - :param mn_min_hash - MMurmurMH class - :return: None - """ - # Create document collectors - self.document_collectors = [None] * self.params.get("num_doc_actors", 1) - for i in range(self.params.get("num_doc_actors", 1)): - self.document_collectors[i] = DocCollector.options(**{"num_cpus": self.params.get("doc_cpu", 0.5)}).remote( - {"id": i, "data_access": data_access_factory} - ) - self.logger.info(f"created {len(self.document_collectors)} document actors") - # create bucket processors - bucket_processors_list = RayUtils.create_actors( - clazz=BucketsHashProcessor, - params={ - "remote_docs": self.document_collectors, - "remote_minhashes": minhash_collectors, - "mn_min_hash": mn_min_hash, - "threshold": self.params.get("threshold", 0.8) * self.params.get("num_permutations", 64), - "statistics": statistics, - }, - actor_options=self.params.get("worker_options", None), - n_actors=self.params.get("num_preprocessors", 1), - ) - self.logger.info(f"created {len(bucket_processors_list)} bucket processor actors") - # create bucket processors invoker - bucket_processor_invoker = BucketsHashProcessorInvoker.options( - num_cpus=self.params.get("bucket_cpu", 0.5) - ).remote(bucket_processors=bucket_processors_list) - self.logger.info(f"created bucket processor invoker") - # Add invoker to the buckets - bucket_replies = [ - collector.add_processing_submitter.remote(submitter=bucket_processor_invoker) - for collector in bucket_collectors - ] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - self.logger.info(f"added invoker to bucket collectors") - # start bucket processing and wait for completion - start = time.time() - bucket_replies = [collector.process_buckets.remote() for collector in bucket_collectors] - RayUtils.wait_for_execution_completion(logger=self.logger, replies=bucket_replies) - # Wait for pool to complete - ray.get(bucket_processor_invoker.wait_for_completion.remote()) - self.logger.info(f"Done processing buckets in {round((time.time() - start) / 60.,3)} min") - # At this point we can save doc actors, in case we would want to restart here - self.logger.info(f"creating document snapshots") - doc_replies = [None] * len(self.document_collectors) - index = 0 - for collector in self.document_collectors: - doc_replies[index] = collector.snapshot.remote() - index += 1 - time.sleep(self.snapshot_delay) - while doc_replies: - ready, not_ready = ray.wait(doc_replies) - doc_replies = not_ready - self.logger.info(f"document snapshots created") - # At this point we do not need bucket and minhash actors, remove them - # but first get usage information - # Bucket collector - replies = [collector.get_size.remote() for collector in bucket_collectors] - while replies: - ready, not_ready = ray.wait(replies) - b_amount, b_memory = ray.get(ready)[0] - self.sum_buckets += b_amount - self.sum_buckets_mem += b_memory - replies = not_ready - for collector in bucket_collectors: - ray.kill(actor=collector, no_restart=True) - # minhash collector - replies = [collector.get_size.remote() for collector in minhash_collectors] - while replies: - ready, not_ready = ray.wait(replies) - m_amount, m_memory = ray.get(ready)[0] - self.sum_mh += m_amount - self.sum_mh_mem += m_memory - replies = not_ready - for collector in minhash_collectors: - ray.kill(actor=collector, no_restart=True) - # Clean up processors - for processor in bucket_processors_list: - ray.kill(actor=processor, no_restart=True) - ray.kill(bucket_processor_invoker) - - def 
_preprocess_tables( - self, - data_access_factory: DataAccessFactoryBase, - statistics: ActorHandle, - files: list[str], - mn_min_hash: MurmurMH, - num_buckets: int, - length_bucket: int, - bucket_collectors: list[ActorHandle], - minhash_collectors: list[ActorHandle], - random_delay_limit: int, - ) -> None: - """ - Preprocess tables - build, run and cleanup - :param data_access_factory - data access factory - :param statistics - statistics actor - :param files - list of files to process - :param mn_min_hash - MurmurMH class - :param num_buckets - number of buckets - :param length_bucket - bucket length - :param bucket_collectors - bucket collector actors - :param minhash_collectors - minhash_collector actors - :param random_delay_limit - max for random dalay limit - :return: None - """ - from ray.util.metrics import Gauge - - worker_options = self.params.get("worker_options", None) - # Here we are limiting the number of readers not to overwhelm COS - n_readers = self.params.get("num_preprocessors", 1) - if n_readers > 1000: - n_readers = 1000 - self.logger.info(f"Table preprocessing uses {n_readers} readers") - # Create preprocessing actors - processor_params = { - "data_access_factory": data_access_factory, - "transform_class": FdedupTransform, - "statistics": statistics, - "transform_params": { - "doc_column": self.params.get("doc_column", ""), - "doc_id_int_column": self.params.get("id_column", ""), - "word_shingle_size": self.params.get("world_shingle_size", 1), - "mn_min_hash": mn_min_hash, - "num_bands": num_buckets, - "length_band": length_bucket, - "remote_buckets": bucket_collectors, - "remote_minhashes": minhash_collectors, - "delimiter": self.params.get("delimiter", " "), - "random_delay_limit": random_delay_limit, - }, - "base_table_stats": False, - } - processors_list = RayUtils.create_actors( - clazz=RayTransformFileProcessor, - params=processor_params, - actor_options=worker_options, - n_actors=n_readers, - ) - self.logger.info(f"created {len(processors_list)} table processor actors") - # Execute preprocessing - # create gauges - files_in_progress_gauge = Gauge( - "preprocessing_files_in_progress", "Number of files in progress, preprocessing" - ) - files_completed_gauge = Gauge( - "preprocessing_files_processed_total", "Number of files completed, preprocessing" - ) - available_cpus_gauge = Gauge("preprocessing_available_cpus", "Number of available CPUs, preprocessing") - available_gpus_gauge = Gauge("preprocessing_available_gpus", "Number of available GPUs, preprocessing") - available_memory_gauge = Gauge("preprocessing_available_memory", "Available memory, preprocessing") - available_object_memory_gauge = Gauge( - "preprocessing_available_object_store", "Available object store, preprocessing" - ) - print_interval = int(len(files) / 100) - if print_interval == 0: - print_interval = 1 - # process data - processors = ActorPool(processors_list) - failures = RayUtils.process_files( - executors=processors, - files=files, - print_interval=print_interval, - files_in_progress_gauge=files_in_progress_gauge, - files_completed_gauge=files_completed_gauge, - available_cpus_gauge=available_cpus_gauge, - available_gpus_gauge=available_gpus_gauge, - available_memory_gauge=available_memory_gauge, - object_memory_gauge=available_object_memory_gauge, - logger=self.logger, - ) - if failures > 0: - statistics.add_stats.remote({"actor failures": failures}) - # Clean up processors - for processor in processors_list: - ray.kill(actor=processor, no_restart=True) - del processors - - def 
compute_execution_stats(self, stats: dict[str, Any]) -> dict[str, Any]: - """ - Compute execution statistics - :param stats: output of statistics - :return: job execution statistics - """ - # Get document collector statistics - sum_docs = 0 - sum_docs_mem = 0 - sum_removed = 0 - sum_removed_mem = 0 - replies = [collector.get_size.remote() for collector in self.document_collectors] - while replies: - ready, not_ready = ray.wait(replies) - d_amount, d_memory, r_amount, r_memory = ray.get(ready)[0] - sum_docs += d_amount - sum_docs_mem += d_memory - sum_removed += r_amount - sum_removed_mem += r_memory - replies = not_ready - overall_hash_memory = self.sum_buckets_mem + self.sum_mh_mem + sum_docs_mem + sum_docs_mem + sum_removed_mem - dedup_prst = 100 * (1.0 - stats.get("result_documents", 1) / stats.get("source_documents", 1)) - return { - "number of buckets": self.sum_buckets, - "number of docs": sum_docs, - "number of removed docs": sum_removed, - "number of min hashes": self.sum_mh, - "overall hash memory GB": overall_hash_memory, - "de duplication %": dedup_prst, - } | stats - +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration -class FdedupTableTransformConfiguration(TransformConfiguration): - """ - Provides support for configuring and using the associated Transform class include - configuration with CLI args and combining of metadata. - """ - def __init__(self): - super().__init__( - name=short_name, - transform_class=FdedupFilter, - ) - from data_processing.utils import get_logger +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} - self.logger = get_logger(__name__) - def add_input_params(self, parser: ArgumentParser) -> None: - """ - Add Transform-specific arguments to the given parser. 
- """ - parser.add_argument(f"--{cli_prefix}doc_column", type=str, default="contents", help="document column name") - parser.add_argument( - f"--{cli_prefix}id_column", type=str, default="int_document_id", help="integer document id column name" - ) - parser.add_argument(f"--{cli_prefix}cluster_column", type=str, default="cluster", help="cluster column name") - parser.add_argument( - f"--{cli_prefix}bucket_cpu", type=float, default=0.5, help="number of CPUs per bucket hash" - ) - parser.add_argument( - f"--{cli_prefix}mhash_cpu", type=float, default=0.5, help="number of CPUs per minhash hash" - ) - parser.add_argument(f"--{cli_prefix}doc_cpu", type=float, default=0.5, help="number of CPUs per doc hash") - parser.add_argument(f"--{cli_prefix}num_doc_actors", type=int, default=1, help="number of doc actors to use") - parser.add_argument( - f"--{cli_prefix}num_minhash_actors", type=int, default=1, help="number of minhash actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_bucket_actors", type=int, default=1, help="number of bucket actors to use" - ) - parser.add_argument( - f"--{cli_prefix}num_preprocessors", type=int, default=1, help="number of preprocessors to use" - ) - parser.add_argument(f"--{cli_prefix}num_permutations", type=int, default=64, help="number of permutations") - parser.add_argument(f"--{cli_prefix}threshold", type=float, default=0.8, help="threshold") - parser.add_argument(f"--{cli_prefix}shingles_size", type=int, default=5, help="number of words in shingle") - parser.add_argument( - f"--{cli_prefix}delimiters", type=str, default=" ", help="delimiter for splitting document" - ) - parser.add_argument(f"--{cli_prefix}snapshot_delay", type=int, default=1, help="snapshot delay time") - parser.add_argument( - f"--{cli_prefix}use_bucket_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with bucket snapshot", - ) - parser.add_argument( - f"--{cli_prefix}use_doc_snapshot", - type=lambda x: bool(str2bool(x)), - default=False, - help="flag to continue with doc snapshot", - ) - parser.add_argument( - f"--{cli_prefix}random_delay_limit", type=int, default=10, help="maximum delay between read" - ) +ray_worker_options = {"num_cpus": 0.8} +ray_params = { + # where to run + "run_locally": True, + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(ray_worker_options), + "runtime_num_workers": 3, +} - def apply_input_params(self, args: Namespace) -> bool: - """ - Validate and apply the arguments that have been parsed - :param args: user defined arguments. - :return: True, if validate pass or False otherwise - """ - captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) - self.params = self.params | captured - self.params["worker_options"] = args.runtime_worker_options - if self.params["use_bucket_snapshot"] and self.params["use_doc_snapshot"]: - self.logger.warning("both bucket and doc snapshot are specified. 
Only one allowed") - return False +ray_params_argv = ParamsUtils.dict_to_req(ray_params) - self.logger.info(f"fuzzy dedup params are {self.params}") - return True +class RayServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) -class FdedupRayTransformConfiguration(RayTransformRuntimeConfiguration): - def __init__(self): - super().__init__(transform_config=FdedupTableTransformConfiguration(), runtime_class=FdedupRuntime) + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params if service_short_name == "fdlist" else ray_params_argv + params[1:] + if service_short_name == "minhash": + launcher = RayTransformLauncher(runtime_config=SignatureCalculationRayTransformConfiguration()) + elif service_short_name == "cluster": + launcher = RayTransformLauncher(runtime_config=ClusterAnalysisRayTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = RayTransformLauncher(runtime_config=GetDuplicateListRayTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = RayTransformLauncher(runtime_config=DataCleaningRayTransformConfiguration()) + status = launcher.launch() + return status if __name__ == "__main__": - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - launcher.launch() + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = RayServiceOrchestrator(global_params=args) + # Launch ray fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..40081e658 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/get_duplicate_list_transform_ray.py @@ -0,0 +1,69 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from data_processing.data_access import DataAccess +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray import ( + DefaultRayTransformRuntime, + RayTransformLauncher, + RayTransformRuntimeConfiguration, +) +from get_duplicate_list_transform import ( + GetDuplicateListTransformConfiguration, + subfolder_key, +) + + +logger = get_logger(__name__) + + +class GetDuplicateListRayRuntime(DefaultRayTransformRuntime): + """ + Get duplicate list runtime support for Ray + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + return [self.params[subfolder_key]] + + +class GetDuplicateListRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for Fuzzy Dedup Get Duplicate List + as required by the RayTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=GetDuplicateListTransformConfiguration(), + runtime_class=GetDuplicateListRayRuntime, + ) + + +if __name__ == "__main__": + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + logger.info("Launching fuzzy dedup get duplicate list ray transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py new file mode 100644 index 000000000..cb87b56af --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_local_ray.py @@ -0,0 +1,54 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +worker_options = {"num_cpus": 0.8} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # where to run + "run_locally": True, + # Data access. 
Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # orchestrator + "runtime_worker_options": ParamsUtils.convert_to_ast(worker_options), + "runtime_num_workers": 3, + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_creation_delay": 0, + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + # execution info + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + # Launch the ray actor(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py new file mode 100644 index 000000000..678d953f2 --- /dev/null +++ b/transforms/universal/fdedup/ray/src/signature_calc_transform_ray.py @@ -0,0 +1,43 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.utils import CLIArgumentProvider, get_logger +from data_processing_ray.runtime.ray.runtime_configuration import ( + RayTransformRuntimeConfiguration, +) +from data_processing_ray.runtime.ray.transform_launcher import RayTransformLauncher +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationRayTransformConfiguration(RayTransformRuntimeConfiguration): + """ + Implements the RayTransformConfiguration for the fuzzy dedup SignatureCalculation transform, + as required by the RayTransformLauncher. This transform does not use a RayRuntime class, + so the superclass only needs the base python-only configuration.
+ """ + + def __init__(self): + """ + Initialization + :param base_configuration - base configuration class + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + # launcher = NOOPRayLauncher() + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + logger.info("Launching transform") + launcher.launch() diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 000000000..f5da05a10 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet new file mode 100644 index 000000000..0e089dee3 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 000000000..4b0fecb15 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 
Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file mode 100644 index 000000000..8e1fe121e Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet new file mode 100644 index 000000000..03a0c321a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + 
"path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..8aa870c00 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": 
"data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/metadata.json index 4a1b54395..a0b26f931 100644 --- a/transforms/universal/fdedup/ray/test-data/expected/metadata.json +++ b/transforms/universal/fdedup/ray/test-data/expected/metadata.json @@ -2,86 +2,48 @@ "pipeline": "pipeline_id", "job details": { "job category": "preprocessing", - "job name": "fdedup", - "job type": "ray", + "job name": "fdlist", + "job type": "pure python", "job id": "job_id", - "start_time": "2024-06-24 19:39:44", - "end_time": "2024-06-24 19:39:57", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", "status": "success" }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, + "code": null, "job_input_params": { - "doc_column": "contents", - "id_column": "int_id_column", - "cluster_column": "cluster", - "bucket_cpu": 0.5, - "mhash_cpu": 0.5, - "doc_cpu": 0.5, - "num_doc_actors": 1, - "num_minhash_actors": 1, - "num_bucket_actors": 1, - "num_preprocessors": 2, - "num_permutations": 64, - "threshold": 0.8, - "shingles_size": 5, - "delimiters": " ", - "snapshot_delay": 1, - "use_bucket_snapshot": false, - "use_doc_snapshot": false, - "random_delay_limit": 5, - "worker_options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, "checkpointing": false, "max_files": -1, "random_samples": -1, "files_to_use": [".parquet"], - "number of workers": 1, - "worker options": { - "num_cpus": 0.8, - "max_restarts": -1 - }, - "actor creation delay": 0 + "num_processors": 0 }, "execution_stats": { - "cpus": 16, + "cpus": 4.5, "gpus": 0, - "memory": 14.396823502145708, - "object_store": 2.0, - "execution time, min": 0.22008283535639445 + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 }, "job_output_stats": { - "number of buckets": 15, - "number of docs": 3, - "number of removed docs": 2, - "number of min hashes": 5, - "overall hash memory GB": 7.152557373046875e-6, - "de duplication %": 40.0, - "source_files": 2, - "source_size": 73126, - "generated buckets": 15, - "generated minhashes": 5, - "source_doc_count": 10, - "generated doc_ids": 3, - "bucket processing time": 0.04204988479614258, "result_files": 1, - "result_size": 36941, - "processing_time": 2.286285161972046, - "source_documents": 5, - "result_documents": 3, - "result_doc_count": 3 + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 }, "source": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/test-data/input", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", "type": "path" }, "target": { - "name": "/Users/boris/Projects/data-prep-kit/transforms/universal/fdedup/ray/output", + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", "type": "path" } } diff --git a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet b/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet deleted file mode 100644 index 92b4e58c7..000000000 Binary files 
a/transforms/universal/fdedup/ray/test-data/expected/sample1.parquet and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 
100644 index 000000000..842ce2caa Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 000000000..e67164596 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet 
b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet differ diff --git 
a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/ray/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 deleted file mode 100644 index c92d73bfb..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/buckets/buckets_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 deleted file mode 100644 index c3966bec2..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/docs/doc_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 b/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 deleted file mode 100644 index e419c9516..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/expected/snapshot/minhash/minhash_collector_0 and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test-data/input/df1.parquet b/transforms/universal/fdedup/ray/test-data/input/df1.parquet new file mode 100644 
index 000000000..2584725bb Binary files /dev/null and b/transforms/universal/fdedup/ray/test-data/input/df1.parquet differ diff --git a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet b/transforms/universal/fdedup/ray/test-data/input/sample1.parquet deleted file mode 100644 index 58387d07d..000000000 Binary files a/transforms/universal/fdedup/ray/test-data/input/sample1.parquet and /dev/null differ diff --git a/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py new file mode 100644 index 000000000..a3771fbd8 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_cluster_analysis_transform_ray.py @@ -0,0 +1,52 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import ( + jaccard_similarity_threshold_cli_param, + num_bands_cli_param, + num_segments_cli_param, + sort_output_cli_param, +) +from cluster_analysis_transform_ray import ClusterAnalysisRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + jaccard_similarity_threshold_cli_param: 0.7, + sort_output_cli_param: True, + } + launcher = RayTransformLauncher(ClusterAnalysisRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py new file mode 100644 index 000000000..a62105b2c --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_data_cleaning_transform_ray.py @@ -0,0 +1,61 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_ray import DataCleaningRayTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher + + +class TestRayDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "get_list_transform", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + "run_locally": True, + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = RayTransformLauncher(DataCleaningRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_fdedup.py b/transforms/universal/fdedup/ray/test/test_fdedup.py deleted file mode 100644 index fa46fb071..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup.py +++ /dev/null @@ -1,18 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -################################################################################ - -# There is no local test for fdedup -# This is just a place holder t satisfy overall framework - - -def test_fdedup(): - pass diff --git a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py b/transforms/universal/fdedup/ray/test/test_fdedup_ray.py deleted file mode 100644 index 78ee7cc04..000000000 --- a/transforms/universal/fdedup/ray/test/test_fdedup_ray.py +++ /dev/null @@ -1,60 +0,0 @@ -# (C) Copyright IBM Corp. 2024. -# Licensed under the Apache License, Version 2.0 (the “License”); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-################################################################################ - -import os - -from data_processing.test_support.launch.transform_test import ( - AbstractTransformLauncherTest, -) -from data_processing_ray.runtime.ray import RayTransformLauncher -from fdedup_transform_ray import FdedupRayTransformConfiguration - - -class TestRayFdedupTransform(AbstractTransformLauncherTest): - """ - Extends the super-class to define the test data for the tests defined there. - The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. - """ - - def get_test_transform_fixtures(self) -> list[tuple]: - basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) - config = { - "run_locally": True, - # When running in ray, our Runtime's get_transform_config() method will load the domains using - # the orchestrator's DataAccess/Factory. So we don't need to provide the bl_local_config configuration. - # columns used - "fdedup_doc_column": "contents", - "fdedup_id_column": "int_id_column", - "fdedup_cluster_column": "cluster", - # infrastructure - "fdedup_bucket_cpu": 0.5, - "fdedup_doc_cpu": 0.5, - "fdedup_mhash_cpu": 0.5, - "fdedup_num_doc_actors": 1, - "fdedup_num_bucket_actors": 1, - "fdedup_num_minhash_actors": 1, - "fdedup_num_preprocessors": 1, - # fuzzy parameters - "fdedup_num_permutations": 64, - "fdedup_threshold": 0.8, - "fdedup_shingles_size": 5, - "fdedup_delimiters": " ", - # Random delay between reads - "fdedup_random_delay_limit": 5, - # snapshotting - "fdedup_snapshot_delay": 1, - "fdedup_use_doc_snapshot": False, - "fdedup_use_bucket_snapshot": False, - } - launcher = RayTransformLauncher(FdedupRayTransformConfiguration()) - fixtures = [(launcher, config, basedir + "/input", basedir + "/expected")] - return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py new file mode 100644 index 000000000..55869598c --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_get_duplicate_list_transform_ray.py @@ -0,0 +1,44 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_ray.runtime.ray import RayTransformLauncher +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_ray import GetDuplicateListRayTransformConfiguration + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + sort_output_cli_param: True, + } + launcher = RayTransformLauncher(GetDuplicateListRayTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py new file mode 100644 index 000000000..34f3ee403 --- /dev/null +++ b/transforms/universal/fdedup/ray/test/test_signature_calc_transform_ray.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_ray.runtime.ray import RayTransformLauncher +from signature_calc_transform import ( + num_bands_cli_param, + num_permutations_cli_param, + num_segments_cli_param, +) +from signature_calc_transform_ray import SignatureCalculationRayTransformConfiguration + + +class TestRaySignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "run_locally": True, + num_permutations_cli_param: 112, + num_bands_cli_param: 14, + num_segments_cli_param: 2, + } + launcher = RayTransformLauncher(SignatureCalculationRayTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/Dockerfile b/transforms/universal/fdedup/spark/Dockerfile new file mode 100644 index 000000000..b04994d46 --- /dev/null +++ b/transforms/universal/fdedup/spark/Dockerfile @@ -0,0 +1,51 @@ +ARG BASE_IMAGE=data-prep-kit-spark-3.5.2:0.3.0 +FROM ${BASE_IMAGE} + +# install pytest +RUN pip install --no-cache-dir pytest +ARG DPK_WHEEL_FILE_NAME + +WORKDIR ${SPARK_HOME}/work-dir + +# Copy in the data processing framework source/project and install it +# This is expected to be placed in the docker context before this is run (see the make image). 
+COPY --chown=spark:root data-processing-dist data-processing-dist +RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[spark] + +## Copy the python version of the transform +COPY --chown=spark:root python-transform/ python-transform/ +RUN cd python-transform && pip install --no-cache-dir -e . + +# Install spark project source +COPY --chown=spark:root src/ src/ +COPY --chown=spark:root pyproject.toml pyproject.toml +COPY --chown=spark:root README.md README.md +RUN mkdir -p /opt/spark/work-dir/src/templates && \ + mkdir -p /opt/spark/work-dir/config +COPY --chown=spark:root deployment/kubernetes/spark-executor-pod-template.yml /opt/spark/work-dir/src/templates/ +COPY --chown=spark:root deployment/kubernetes/spark_profile.yml /opt/spark/work-dir/config/ + +# install requirements from requirements.txt +COPY requirements.txt . +RUN pip3 install -r requirements.txt + +RUN pip install --no-cache-dir -e . + +# copy the main() entry point to the image +COPY ./src/fdedup_transform_spark.py . + +# copy test +COPY test/ test/ +COPY test-data/ test-data/ + +USER spark + +# Set environment +ENV PYTHONPATH=${SPARK_HOME}/work-dir/:${SPARK_HOME}/work-dir/src/:${PYTHONPATH} +ENV PATH=${SPARK_HOME}/work-dir/.local/bin/:${PATH} + +# Put these at the end since they seem to upset the docker cache. +ARG BUILD_DATE +ARG GIT_COMMIT +LABEL build-date=$BUILD_DATE +LABEL git-commit=$GIT_COMMIT diff --git a/transforms/universal/fdedup/spark/Makefile b/transforms/universal/fdedup/spark/Makefile new file mode 100644 index 000000000..ac2735e7d --- /dev/null +++ b/transforms/universal/fdedup/spark/Makefile @@ -0,0 +1,57 @@ +# Define the root of the local git clone for the common rules to be able to +# know where they are running from. +REPOROOT=../../../.. + +# Set this, before including .make.defaults, to +# 1 if requirements reference the latest code in the data processing library +# in this repo (that is not yet published to pypi). This is the default setting. +# 0 if the transforms DPK dependencies are on wheels published to +# pypi (e.g. data-prep-toolkit=0.2.1) +#USE_REPO_LIB_SRC=1 + +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below.
+include $(REPOROOT)/transforms/.make.transforms + +# Include the common configuration for this transform +include ../transform.config + +venv:: .transforms.spark-venv + +test:: .transforms.spark-test + +clean:: .transforms.clean + +image:: .transforms.spark-image + +test-src:: .transforms.test-src + +setup:: .transforms.setup + +build:: build-dist image + +publish: publish-image + +publish-image:: .transforms.publish-image-spark + +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(FDEDUP_PYTHON_VERSION) TOML_VERSION=$(FDEDUP_SPARK_VERSION) .transforms.set-versions + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +test-image:: .transforms.spark-test-image + +run-cli-sample: .transforms.run-cli-spark-sample + +run-local-sample: .transforms.run-local-sample + +minio-start: .minio-start + +kind-load-image:: .transforms.kind-load-image + +docker-load-image: .defaults.docker-load-image + +docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/fdedup/spark/README.md b/transforms/universal/fdedup/spark/README.md new file mode 100644 index 000000000..dd0294aed --- /dev/null +++ b/transforms/universal/fdedup/spark/README.md @@ -0,0 +1,67 @@ +# Fuzzy Dedup + +Please see the set of [transform project conventions](../../../README.md) for details on general project conventions, transform +configuration, testing and IDE setup. + +## Summary + +This project wraps the [Fuzzy Dedup transform](../python) with a Spark runtime. + +## Configuration and Command Line Options + +Fuzzy Dedup configuration and command line options are the same as for the base python transform. + +## Running +### Launched Command Line Options +When running the transform with the Spark launcher (i.e. TransformLauncher), +in addition to those available to the transform as defined [here](../python/README.md), +the set of +[Spark launcher](../../../../data-processing-lib/doc/spark-launcher-options.md) options is also available. + +### Running the samples +To run the samples, use the following `make` target to create a virtual environment: + +```commandline +make venv +``` +Subsequently, the main orchestration program can be run with: +```commandline +source venv/bin/activate +cd src +python fdedup_transform_spark.py +``` +Alternatively, the transforms included in fuzzy dedup can be launched independently: +```commandline +source venv/bin/activate +cd src +python signature_calc_local_spark.py +python cluster_analysis_local_spark.py +python get_duplicate_list_local_spark.py +python data_cleaning_local_spark.py +``` +After running the transforms, execute: +```shell +ls output +``` +to see the results of the transform. + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +For testing fuzzy deduplication in a Spark runtime, use the following `make` targets.
To launch integration tests +for all the component transforms of fuzzy dedup (signature calculation, cluster analysis, get duplicate list and data +cleaning) use: +```commandline +make test-src +``` + +To test the creation of the Docker image for fuzzy dedup transform and the capability to run a local program inside that +image, use: +```commandline +make test-image +``` \ No newline at end of file diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml new file mode 100644 index 000000000..d9579e0c7 --- /dev/null +++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark-executor-pod-template.yml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Pod +metadata: +spec: + imagePullSecrets: + - name: prod-all-icr-io + securityContext: + fsGroup: 0 diff --git a/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml new file mode 100644 index 000000000..eeddbd694 --- /dev/null +++ b/transforms/universal/fdedup/spark/deployment/kubernetes/spark_profile.yml @@ -0,0 +1,14 @@ +spark.app.name: ${APP_NAME} +spark.driver.memory: ${DRIVER_MEMORY} +spark.executor.instances: ${NUM_EXECUTORS} +spark.executor.memory: ${EXECUTOR_MEMORY} +spark.executor.cores: ${EXECUTOR_CORES} +spark.sql.shuffle.partitions: ${NUM_TASKS} +spark.task.cpus: ${TASK_CPUS} +spark.sql.legacy.parquet.nanosAsLong: true +spark.executor.decommission.forceKillTimeout: "10h" +# spark.sql.files.ignoreCorruptFiles: true +# configuration needed when running in kubernetes +spark.kubernetes.authenticate.driver.serviceAccountName: ${SERVICE_ACCOUNT} +spark.kubernetes.container.image: ${EXECUTOR_DOCKER_IMAGE} +spark.kubernetes.namespace: ${EXECUTOR_NAMESPACE} diff --git a/transforms/universal/fdedup/spark/pyproject.toml b/transforms/universal/fdedup/spark/pyproject.toml new file mode 100644 index 000000000..8a072b31b --- /dev/null +++ b/transforms/universal/fdedup/spark/pyproject.toml @@ -0,0 +1,45 @@ +[project] +name = "dpk_fdedup_transform_spark" +version = "0.2.3.dev0" +requires-python = ">=3.10,<3.13" +description = "Fuzzy Dedup Spark Transform" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Nelson Bore", email = "k.nelsonbore@gmail.com" }, + { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, +] +dynamic = ["dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/transforms/universal/fdedup/spark/requirements.txt b/transforms/universal/fdedup/spark/requirements.txt new file mode 100644 index 000000000..653b94256 --- /dev/null +++ 
b/transforms/universal/fdedup/spark/requirements.txt @@ -0,0 +1,11 @@ +dpk_fdedup_transform_python==0.2.3.dev0 +data-prep-toolkit[spark]>=0.2.3.dev0 +pyyaml>=6.0.2 +boto3>=1.34.69 +kubernetes>=30.1.0 +polars==1.9.0 +disjoint-set>=0.8.0 +numpy<1.29.0 +sentencepiece>=0.2.0 +mmh3>=4.1.0 +scipy>=1.12.0, <2.0.0 diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py new file mode 100644 index 000000000..c9950657c --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_local_spark.py @@ -0,0 +1,49 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +import sys + +import polars as pl +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +# create parameters +input_folder = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..", "test-data", "expected", "signature_calc", "bands") +) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "docs_to_remove")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, +} +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py new file mode 100644 index 000000000..feeb3241e --- /dev/null +++ b/transforms/universal/fdedup/spark/src/cluster_analysis_transform_spark.py @@ -0,0 +1,75 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os +from typing import Any + +from cluster_analysis_transform import ( + ClusterAnalysisTransformConfiguration, + num_bands_key, + num_segments_key, +) +from data_processing.data_access import DataAccess +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class ClusterAnalysisSparkRuntime(DefaultSparkTransformRuntime): + """ + Cluster analysis runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_folders(self, data_access: DataAccess) -> list[str]: + """ + Return the set of folders that will be processed by this transform + :param data_access - data access object + :return: list of folder paths + """ + bands = self.params[num_bands_key] + segments = self.params[num_segments_key] + folders = [os.path.join(f"band={b}", f"segment={s}") for b in range(bands) for s in range(segments)] + return folders + + +class ClusterAnalysisSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Cluster Analysis + as required by the SparkTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=ClusterAnalysisTransformConfiguration(), + runtime_class=ClusterAnalysisSparkRuntime, + ) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup cluster analysis spark transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py new file mode 100644 index 000000000..eb1e61845 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_local_spark.py @@ -0,0 +1,61 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +import polars as pl +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = { + "input_folder": input_folder, + "output_folder": output_folder, +} +duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected", + "docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) +) +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), +} + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py new file mode 100644 index 000000000..2ff0df8bf --- /dev/null +++ b/transforms/universal/fdedup/spark/src/data_cleaning_transform_spark.py @@ -0,0 +1,124 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +from typing import Any + +from data_cleaning_transform import ( + DataCleaningTransformConfiguration, + dataclean_data_access_key, + dataclean_data_factory_key, + duplicate_list_location_default, + duplicate_list_location_key, +) +from data_processing.data_access import DataAccessFactoryBase +from data_processing.transform import TransformStatistics +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + DefaultSparkTransformRuntime, + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) + + +logger = get_logger(__name__) + + +class DataCleaningSparkRuntime(DefaultSparkTransformRuntime): + """ + Data cleaning runtime support for Spark + """ + + def __init__(self, params: dict[str, Any]): + super().__init__(params=params) + self.logger = get_logger(__name__) + + def get_transform_config( + self, partition: int, data_access_factory: DataAccessFactoryBase, statistics: TransformStatistics + ) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method. This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data, such as data to be broadcast to the workers, that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param partition - partition number + :param data_access_factory - data access factory class being used by the Spark orchestrator. + :param statistics - reference to the statistics object + :return: dictionary of transform init params + """ + data_access = data_access_factory.create_data_access() + dc_data_access = self.params.get(dataclean_data_access_key, None) + if dc_data_access is None: + dc_daf = self.params.get(dataclean_data_factory_key, None) + if dc_daf is None: + raise RuntimeError(f"Missing configuration value for key {dataclean_data_factory_key}") + dc_data_access = dc_daf.create_data_access() + if dc_data_access.output_folder is None: + dc_data_access.output_folder = data_access.output_folder + duplicate_list_location = self.params.get(duplicate_list_location_key, duplicate_list_location_default) + if not duplicate_list_location.startswith("/"): + out_paths = dc_data_access.output_folder.rstrip("/").split("/") + dupl_list_paths = duplicate_list_location.split("/") + paths = out_paths[:-1] + dupl_list_paths + duplicate_list_location = "/".join([p.strip("/") for p in paths]) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = dc_data_access.get_file(duplicate_list_location) + return self.params | {"df": self.duplicate_list} + + +class DataCleaningSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Data Cleaning + as required by the SparkTransformLauncher. + """ + + def __init__(self): + """ + Initialization + """ + super().__init__( + transform_config=DataCleaningTransformConfiguration(), + runtime_class=DataCleaningSparkRuntime, + ) + + def get_bcast_params(self, data_access_factory: DataAccessFactoryBase) -> dict[str, Any]: + """ + Download the table of duplicate document ids that will be provided to the + filtering/annotation method.
This is the opportunity for this runtime to + create a new set of configuration based on the config/params provided to + this instance's initializer. This may include the addition of new + configuration data, such as the table of duplicates, that + might be needed and expected by the transform in its initializer and/or + transform() methods. + :param data_access_factory - data access factory class being used by the Spark orchestrator. + :return: dictionary of parameters to be broadcast + """ + data_access = data_access_factory.create_data_access() + duplicate_list_location = os.path.abspath( + os.path.join(data_access.output_folder, "..", self.transform_config.params["duplicate_list_location"]) + ) + if duplicate_list_location.startswith("s3://"): + _, duplicate_list_location = duplicate_list_location.split("://") + self.duplicate_list, retries = data_access.get_file(duplicate_list_location) + return {"df": self.duplicate_list} + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup data cleaning transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py new file mode 100644 index 000000000..82767f849 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/fdedup_transform_spark.py @@ -0,0 +1,62 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+################################################################################ + +import argparse +import os +import sys + +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing_spark.runtime.spark import SparkTransformLauncher +from fdedup_transform_python import ServiceOrchestrator, parse_args +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +s3_creds = { + "access_key": os.getenv("AWS_ACCESS_KEY_ID"), + "secret_key": os.getenv("AWS_SECRET_ACCESS_KEY"), + "url": os.getenv("AWS_ENDPOINT_URL"), +} + + +class SparkServiceOrchestrator(ServiceOrchestrator): + def __init__(self, global_params: argparse.Namespace = None): + super().__init__(global_params=global_params) + + def execute_service(self, service_short_name: str, params: list) -> int: + sys.argv = params + if service_short_name == "minhash": + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + elif service_short_name == "cluster": + launcher = SparkTransformLauncher(runtime_config=ClusterAnalysisSparkTransformConfiguration()) + elif service_short_name == "fdlist": + launcher = PythonTransformLauncher(runtime_config=GetDuplicateListPythonTransformConfiguration()) + elif service_short_name == "fdclean": + launcher = SparkTransformLauncher(runtime_config=DataCleaningSparkTransformConfiguration()) + status = launcher.launch() + return status + + +if __name__ == "__main__": + + # Parse command line arguments + args = parse_args() + # Initialize the orchestrator + orchestrator = SparkServiceOrchestrator(global_params=args) + # Launch spark fuzzy dedup execution + orchestrator.orchestrate() diff --git a/transforms/universal/fdedup/spark/src/requirements.txt b/transforms/universal/fdedup/spark/src/requirements.txt new file mode 100644 index 000000000..c1a1f2c3d --- /dev/null +++ b/transforms/universal/fdedup/spark/src/requirements.txt @@ -0,0 +1,8 @@ +pyspark +pyarrow +pyyaml +boto3 +kubernetes +disjoint_set +mmh3 +scipy diff --git a/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py new file mode 100644 index 000000000..2db884346 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_local_spark.py @@ -0,0 +1,50 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +import os +import sys + +import polars as pl +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +# create parameters +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output")) +local_conf = {"input_folder": input_folder, "output_folder": output_folder} +code_location = {"github": "github", "commit_hash": "12345", "path": "path"} + +params = { + # Data access. Only required parameters are specified + "data_local_config": ParamsUtils.convert_to_ast(local_conf), + "scdata_local_config": ParamsUtils.convert_to_ast(local_conf), + # execution info + "runtime_pipeline_id": "pipeline_id", + "runtime_job_id": "job_id", + "runtime_code_location": ParamsUtils.convert_to_ast(code_location), + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, +} + + +if __name__ == "__main__": + # Set the simulated command line args + sys.argv = ParamsUtils.dict_to_req(d=params) + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py new file mode 100644 index 000000000..4e39810c6 --- /dev/null +++ b/transforms/universal/fdedup/spark/src/signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from data_processing.utils import get_logger +from data_processing_spark.runtime.spark import ( + SparkTransformLauncher, + SparkTransformRuntimeConfiguration, +) +from signature_calc_transform import SignatureCalculationTransformConfiguration + + +logger = get_logger(__name__) + + +class SignatureCalculationSparkTransformConfiguration(SparkTransformRuntimeConfiguration): + """ + Implements the SparkTransformConfiguration for Fuzzy Dedup Signature Calculation + as required by the SparkTransformLauncher.
+ """ + + def __init__(self): + """ + Initialization + """ + super().__init__(transform_config=SignatureCalculationTransformConfiguration()) + + +if __name__ == "__main__": + # create launcher + launcher = SparkTransformLauncher(runtime_config=SignatureCalculationSparkTransformConfiguration()) + logger.info("Launching fuzzy dedup signature calculation transform") + # Launch the spark worker(s) to process the input + launcher.launch() diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_0_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet new file mode 100644 index 000000000..f5da05a10 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet new file mode 100644 index 000000000..0e089dee3 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_10_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet new file mode 100644 index 000000000..4b0fecb15 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_11_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_0.parquet differ diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet new file mode 100644 index 000000000..5601f5cb0 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_12_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet new file mode 100644 index 000000000..02bedff1c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_13_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet new file mode 100644 index 000000000..bf131f43c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet new file mode 100644 index 000000000..d41b35de2 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_1_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_2_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet new file mode 100644 index 000000000..2838dd972 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_3_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet new file mode 100644 index 000000000..7cb2cbac4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_4_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet new file mode 100644 index 000000000..79fe53b62 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_5_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet new file mode 100644 index 000000000..57642d199 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet new file mode 100644 index 000000000..9de625746 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_6_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet new file mode 100644 index 000000000..9df2f3bd5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet new file 
mode 100644 index 000000000..8e1fe121e Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_7_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet new file mode 100644 index 000000000..37aea5168 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet new file mode 100644 index 000000000..3d1f158e9 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_8_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet new file mode 100644 index 000000000..ca5323db5 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_0.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet new file mode 100644 index 000000000..06b4b7467 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/band_9_segment_1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json new file mode 100644 index 000000000..c08326355 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/cluster_analysis/docs_to_remove/metadata.json @@ -0,0 +1,58 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "cluster", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:32:15", + "end_time": "2024-10-18 10:32:15", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "jaccard_similarity_threshold": 0.7, + "num_bands": 14, + "num_segments": 2, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 91.7, + "gpus": 0, + "memory": 24.01, + "object_store": 0, + "execution time, min": 0.001 + }, + "job_output_stats": { + "result_files": 28, + "result_size": 38040, + "processing_time": 0.061, + "input_files": 28, + "input_bytes": 115324, + "input_rows": 168, + "consolidated_files": 28, + "consolidated_bytes": 80640, + "consolidated_rows": 168, + "groupby_clusters": 35, + "cluster_duplicate_docs": 79, + "jaccard_clusters": 35, + "jaccard_duplicate_docs": 44, + "num_duplicate_documents": 44 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/signature_calc/bands", + "type": "path" + }, + 
"target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet new file mode 100644 index 000000000..03a0c321a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json new file mode 100644 index 000000000..047921334 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/annotated/metadata.json @@ -0,0 +1,56 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "spark", + "job id": "job_id", + "start_time": "2024-10-14 10:43:38", + "end_time": "2024-10-14 10:43:55", + "status": "success" + }, + "code": null, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "annotate", + "RDD parallelization": -1, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"] + }, + "execution_stats": { + "num partitions": 20, + "execution time, min": 0.284, + "cpus": 20, + "gpus": 0, + "memory": 0.36, + "object_store": 0 + }, + "job_output_stats": { + "source_size": 4111, + "output_bytes": 8856, + "processing_time": 0.46729254722595215, + "input_bytes": 8753, + "result_size": 6923, + "input_files": 1, + "source_files": 1, + "input_docs": 12, + "output_docs": 12, + "filtered_docs": 0, + "output_files": 1, + "result_files": 1, + "source_doc_count": 12, + "filtered_bytes": -103, + "result_doc_count": 12 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/test-data/input", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1/annotated", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet new file mode 100644 index 000000000..d67b5bcf8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet new file mode 100644 index 000000000..267e78385 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/data_2/df2.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json new file mode 100644 index 000000000..717d9bbe9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/data_cleaning/cleaned/metadata.json @@ -0,0 +1,59 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdclean", + "job type": "pure python", + "job id": 
"job_id", + "start_time": "2024-10-18 10:10:22", + "end_time": "2024-10-18 10:10:23", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "document_id_column": "int_id_column", + "duplicate_list_location": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "operation_mode": "filter_duplicates", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 112.7, + "gpus": 0, + "memory": 24.17, + "object_store": 0, + "execution time, min": 0.005 + }, + "job_output_stats": { + "source_files": 2, + "source_size": 4490, + "result_files": 2, + "result_size": 18001, + "processing_time": 0.308, + "input_files": 2, + "input_docs": 12, + "input_bytes": 8753, + "output_files": 2, + "output_docs": 4, + "output_bytes": 4650, + "filtered_docs": 8, + "filtered_bytes": 4103, + "source_doc_count": 12, + "result_doc_count": 4 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/input", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cleaned", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..8aa870c00 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet new file mode 100644 index 000000000..34b15a76c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/docs_to_remove_consolidated/docs_to_remove_consolidated.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json new file mode 100644 index 000000000..d4cd3e362 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/get_list_transform/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 10:49:10", + "end_time": "2024-10-18 10:49:10", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 101.1, + "gpus": 0, + "memory": 24.02, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.007, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 
64, + "consolidated_rows": 8 + }, + "source": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2/cluster_analysis", + "type": "path" + }, + "target": { + "name": "data-prep-kit/transforms/universal/fdedup/python/test-data/expected2", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/metadata.json new file mode 100644 index 000000000..a0b26f931 --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/metadata.json @@ -0,0 +1,49 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-18 11:36:37", + "end_time": "2024-10-18 11:36:37", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "sort_output": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 4.5, + "gpus": 0, + "memory": 15.91, + "object_store": 0, + "execution time, min": 0.0 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.024, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected/cluster_analysis", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/python/test-data/expected", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet new file mode 100644 index 000000000..c7d3d8072 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet new file mode 100644 index 000000000..c355b299a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=0/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet new file mode 100644 index 000000000..ad59ee31c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet new file mode 100644 index 000000000..fb2a0b13d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=1/segment=1/df1.parquet differ diff --git 
a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet new file mode 100644 index 000000000..aca2026d8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet new file mode 100644 index 000000000..1a46cb40f Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=10/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet new file mode 100644 index 000000000..56934cab8 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet new file mode 100644 index 000000000..f82d9daca Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=11/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet new file mode 100644 index 000000000..842ce2caa Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet new file mode 100644 index 000000000..fcb03c17a Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=12/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet new file mode 100644 index 000000000..84c399e67 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet new file mode 100644 index 000000000..79a6f24b3 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=13/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet new file mode 100644 index 000000000..e67164596 Binary 
files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet new file mode 100644 index 000000000..cd2e75eaa Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=2/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet new file mode 100644 index 000000000..5212dff6d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet new file mode 100644 index 000000000..d0f1bd9b4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=3/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet new file mode 100644 index 000000000..1cc7b2c26 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet new file mode 100644 index 000000000..f892d384d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=4/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet new file mode 100644 index 000000000..1a786300b Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet new file mode 100644 index 000000000..bc20a7699 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=5/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet new file mode 100644 index 000000000..151008dc4 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet 
b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet new file mode 100644 index 000000000..b485d3882 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=6/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet new file mode 100644 index 000000000..0da33db3c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet new file mode 100644 index 000000000..1e1b4765c Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=7/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet new file mode 100644 index 000000000..7e9af93b0 Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet new file mode 100644 index 000000000..d112e179e Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=8/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet new file mode 100644 index 000000000..f3f7d2a7d Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=0/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet new file mode 100644 index 000000000..06444accf Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/bands/band=9/segment=1/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json new file mode 100644 index 000000000..f7f0fe9df --- /dev/null +++ b/transforms/universal/fdedup/spark/test-data/expected/signature_calc/metadata.json @@ -0,0 +1,48 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "fdlist", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-10-14 10:43:37", + "end_time": "2024-10-14 10:43:38", + "status": "success" + }, + "code": null, + "job_input_params": { + "docs_to_remove": "docs_to_remove", + "consolidated_filename": "docs_to_remove_consolidated/docs_to_remove_consolidated.parquet", + "checkpointing": false, + 
"max_files": -1, + "random_samples": -1, + "files_to_use": [".parquet"], + "num_processors": 0 + }, + "execution_stats": { + "cpus": 31.7, + "gpus": 0, + "memory": 15.83, + "object_store": 0, + "execution time, min": 0.003 + }, + "job_output_stats": { + "result_files": 1, + "result_size": 663, + "processing_time": 0.2, + "input_files": 28, + "input_bytes": 38040, + "input_rows": 44, + "consolidated_files": 1, + "consolidated_bytes": 64, + "consolidated_rows": 8 + }, + "source": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + }, + "target": { + "name": "/home/cma/de/data-prep-kit/transforms/universal/fdedup/spark/output/test_1", + "type": "path" + } +} diff --git a/transforms/universal/fdedup/spark/test-data/input/df1.parquet b/transforms/universal/fdedup/spark/test-data/input/df1.parquet new file mode 100644 index 000000000..2584725bb Binary files /dev/null and b/transforms/universal/fdedup/spark/test-data/input/df1.parquet differ diff --git a/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py new file mode 100644 index 000000000..294c86f25 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_cluster_analysis_transform_spark.py @@ -0,0 +1,46 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from cluster_analysis_transform import sort_output_cli_param +from cluster_analysis_transform_spark import ClusterAnalysisSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkClusterAnalysisTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "cluster_num_bands": 14, + "cluster_num_segments": 2, + "cluster_jaccard_similarity_threshold": 0.7, + sort_output_cli_param: True, + } + launcher = SparkTransformLauncher(ClusterAnalysisSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "signature_calc", "bands"), + os.path.join(basedir, "expected", "cluster_analysis", "docs_to_remove"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py new file mode 100644 index 000000000..919857e23 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_data_cleaning_transform_spark.py @@ -0,0 +1,58 @@ +# (C) Copyright IBM Corp. 
2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_cleaning_transform import ( + document_id_column_cli_param, + duplicate_list_location_cli_param, + operation_mode_cli_param, +) +from data_cleaning_transform_spark import DataCleaningSparkTransformConfiguration +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing_spark.runtime.spark import SparkTransformLauncher + + +class TestSparkDataCleaningTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + duplicate_location = os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "test-data", + "expected/get_list_transform/docs_to_remove_consolidated", + "docs_to_remove_consolidated.parquet", + ) + ) + config = { + document_id_column_cli_param: "int_id_column", + duplicate_list_location_cli_param: duplicate_location, + operation_mode_cli_param: "annotate", + } + launcher = SparkTransformLauncher(DataCleaningSparkTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "input"), + os.path.join(basedir, "expected", "data_cleaning", "annotated"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py new file mode 100644 index 000000000..4b59e3a7a --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_get_duplicate_list_transform_spark.py @@ -0,0 +1,45 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
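The cluster-analysis and data-cleaning Spark tests added above both follow the AbstractTransformLauncherTest fixture pattern. As a rough, hedged sketch (not part of this patch), they could presumably be run directly with pytest from the transform's spark directory once its virtual environment is set up; the file names below are the ones introduced in this diff:

```python
# Hypothetical convenience script (not included in the patch): run the new
# fdedup Spark tests with pytest. Assumes the working directory is
# transforms/universal/fdedup/spark and that the spark runtime plus test
# dependencies are already installed in the active virtual environment.
import sys

import pytest

if __name__ == "__main__":
    exit_code = pytest.main(
        [
            "-v",
            "test/test_cluster_analysis_transform_spark.py",
            "test/test_data_cleaning_transform_spark.py",
        ]
    )
    sys.exit(int(exit_code))
```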
+################################################################################ + +import os + +from data_processing.runtime.pure_python import PythonTransformLauncher +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from get_duplicate_list_transform import sort_output_cli_param +from get_duplicate_list_transform_python import ( + GetDuplicateListPythonTransformConfiguration, +) + + +class TestPythonGetDuplicateListTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + sort_output_cli_param: True, + } + launcher = PythonTransformLauncher(GetDuplicateListPythonTransformConfiguration()) + fixtures = [ + ( + launcher, + config, + os.path.join(basedir, "expected", "cluster_analysis"), + os.path.join(basedir, "expected", "get_list_transform"), + ) + ] + return fixtures diff --git a/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py new file mode 100644 index 000000000..6d93dc7a9 --- /dev/null +++ b/transforms/universal/fdedup/spark/test/test_signature_calc_transform_spark.py @@ -0,0 +1,42 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import os + +from data_processing.test_support.launch.transform_test import ( + AbstractTransformLauncherTest, +) +from data_processing.utils import ParamsUtils +from data_processing_spark.runtime.spark import SparkTransformLauncher +from signature_calc_transform_spark import ( + SignatureCalculationSparkTransformConfiguration, +) + + +class TestSparkSignatureCalcTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
+ """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data")) + config = { + "minhash_num_permutations": 112, + "minhash_num_bands": 14, + "minhash_num_segments": 2, + } + launcher = SparkTransformLauncher(SignatureCalculationSparkTransformConfiguration()) + fixtures = [ + (launcher, config, os.path.join(basedir, "input"), os.path.join(basedir, "expected", "signature_calc")) + ] + return fixtures diff --git a/transforms/universal/fdedup/transform.config b/transforms/universal/fdedup/transform.config index 774716e15..ffaeb9f45 100644 --- a/transforms/universal/fdedup/transform.config +++ b/transforms/universal/fdedup/transform.config @@ -14,5 +14,6 @@ TRANSFORM_NAME=fdedup # # If you change the versions numbers, be sure to run "make set-versions" to # update version numbers across the transform (e.g., pyproject.toml). -FDEDUP_RAY_VERSION=$(DPK_VERSION) - +FDEDUP_PYTHON_VERSION=$(DPK_VERSION) +FDEDUP_RAY_VERSION=$(FDEDUP_PYTHON_VERSION) +FDEDUP_SPARK_VERSION=$(FDEDUP_PYTHON_VERSION) diff --git a/transforms/universal/fdedup/utils/Makefile.local b/transforms/universal/fdedup/utils/Makefile.local new file mode 100644 index 000000000..d9dae01d7 --- /dev/null +++ b/transforms/universal/fdedup/utils/Makefile.local @@ -0,0 +1,18 @@ +PYTHON=python +PIP=pip + +venv: requirements.txt + $(PYTHON) -m venv venv + if [ -e venv/Scripts/activate ]; then \ + echo "For Windows please try the following AS Administrator - no guarantees"; \ + echo " venv\\Scripts\\activate"; \ + echo " pip install --upgrade pip"; \ + echo " pip install -r requirements.txt"; \ + echo " pip install pytest"; \ + else \ + . venv/bin/activate; \ + $(PIP) install --upgrade pip; \ + $(PIP) install -r requirements.txt; \ + fi +set-versions: + @: \ No newline at end of file diff --git a/transforms/universal/fdedup/utils/calc_r_and_b.ipynb b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb new file mode 100644 index 000000000..8398f9efa --- /dev/null +++ b/transforms/universal/fdedup/utils/calc_r_and_b.ipynb @@ -0,0 +1,74 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cf5dba9a-d530-4a0a-ae71-2d741f7e705f", + "metadata": {}, + "source": [ + "This notebook allows calculating the values for `b` (the number of bands) and `r` (the number of minhashes in a band) used in the fuzzy dedup algorithm. The default values are `b=14` and `r=8`, as defined in the [FineWeb datasets paper](https://arxiv.org/pdf/2406.17557). The x-axis of the graph represents the Jaccard similarity between a pair of documents, while the y-axis represents the probability that they become duplication candidates. Please refer to http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf for more details on this methodology." 
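As a quick numerical illustration of the curve this notebook plots (my own arithmetic, not taken from the patch), the defaults r=8 and b=14 give roughly a 56% chance that a document pair with Jaccard similarity 0.7 becomes a duplication candidate, rising to roughly 92% at similarity 0.8:

```python
# Sanity check of f(s) = 1 - (1 - s**r)**b with the defaults r=8, b=14
# (illustrative only; the notebook's code cell computes and plots the full curve).
r, b = 8, 14
for s in (0.7, 0.8):
    p = 1 - (1 - s**r) ** b
    print(f"s={s}: candidate probability ~ {p:.2f}")  # ~0.56 and ~0.92
```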
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "800bc113-8b5e-4cec-8717-98fa05753bd0", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Define the parameterized function\n", + "def f(s, r, b):\n", + " return 1 - (1 - s**r)**b\n", + "\n", + "# Set the parameters r and b\n", + "r = 8\n", + "b = 14\n", + "\n", + "# Generate values for s in a range, e.g., from 0 to 1\n", + "s_values = np.linspace(0, 1, 500) # 500 points between 0 and 1\n", + "f_values = f(s_values, r, b)\n", + "\n", + "# Plot the function\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(s_values, f_values, label=fr\"$f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\", color='blue')\n", + "plt.xlabel(\"s\")\n", + "plt.ylabel(\"f(s)\")\n", + "plt.title(f\"Plot of the function $f(s) = 1 - (1 - s^{{{r}}})^{{{b}}}$\")\n", + "plt.legend()\n", + "plt.grid(True)\n", + "plt.show()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98016b04-b6a0-465d-b65b-6d402978c9f0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/fdedup/utils/requirements.txt b/transforms/universal/fdedup/utils/requirements.txt new file mode 100644 index 000000000..ce2acfefb --- /dev/null +++ b/transforms/universal/fdedup/utils/requirements.txt @@ -0,0 +1,3 @@ +jupyter +numpy +matplotlib diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index 64f148799..fcf0f6419 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt index 9f1feff29..d97ef5cfd 100644 --- a/transforms/universal/filter/python/requirements.txt +++ b/transforms/universal/filter/python/requirements.txt @@ -1,3 +1,3 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index a794a1a0b..6cb90c2bb 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-filter-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 
7b60dba46..176ff1de3 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[spark]==0.2.2.dev2", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/python/README.md b/transforms/universal/hap/python/README.md index 29d54d999..2cc8504d2 100644 --- a/transforms/universal/hap/python/README.md +++ b/transforms/universal/hap/python/README.md @@ -1,10 +1,14 @@ # Hate, Abuse, and Profanity (HAP) Annotation Please see the set of [transform project conventions](https://github.com/ian-cho/data-prep-kit/blob/dev/transforms/README.md) for details on general project conventions, transform configuration, testing and IDE set up. -## Prerequisite +## Contributor +- Yang Zhao (yangzhao@ibm.com) + +## Description +### Prerequisite This repository needs [NLTK](https://www.nltk.org/) and please refer to `requirements.txt`. -## Summary +### Overview The hap transform maps a non-empty input table to an output table with an added `hap_score` column. Each row in the table represents a document, and the hap transform performs the following three steps to calculate the hap score for each document: * Sentence spliting: we use NLTK to split the document into sentence pieces. @@ -12,18 +16,7 @@ The hap transform maps a non-empty input table to an output table with an added * Aggregation: the document hap score is determined by selecting the maximum hap score among its sentences. -## Configuration and command line Options -The set of dictionary keys holding [HAPTransformConfiguration](src/hap_transform.py) -configuration for values are as follows: - -* --model_name_or_path - specify the HAP model, which should be compatible with HuggingFace's AutoModelForSequenceClassification. Defaults to IBM's open-source toxicity classifier `ibm-granite/granite-guardian-hap-38m`. -* --batch_size - modify it based on the infrastructure capacity. Defaults to `128`. -* --max_length - the maximum length for the tokenizer. Defaults to `512`. -* --doc_text_column - the column name containing the document text in the input .parquet file. Defaults to `contents`. -* --annotation_column - the column name containing hap (toxicity) score in the output .parquet file. Defaults to `hap_score`. - - -## input format +### input format The input is in .parquet format and contains the following columns: | doc_id | contents | @@ -31,7 +24,8 @@ The input is in .parquet format and contains the following columns: | 1 | GSC is very much a little Swiss Army knife for... | | 2 | Here are only a few examples. And no, I'm not ... | -## output format + +### output format The output is in .parquet format and includes an additional column, in addition to those in the input: | doc_id | contents | hap_score | @@ -39,7 +33,21 @@ The output is in .parquet format and includes an additional column, in addition | 1 | GSC is very much a little Swiss Army knife for... | 0.002463 | | 2 | Here are only a few examples. And no, I'm not ... 
| 0.989713 | -## How to run +## Configuration +The set of dictionary keys holding [HAPTransformConfiguration](src/hap_transform.py) +configuration for values are as follows: + + +* --model_name_or_path - specify the HAP model, which should be compatible with HuggingFace's AutoModelForSequenceClassification. Defaults to IBM's open-source toxicity classifier `ibm-granite/granite-guardian-hap-38m`. +* --batch_size - modify it based on the infrastructure capacity. Defaults to `128`. +* --max_length - the maximum length for the tokenizer. Defaults to `512`. +* --doc_text_column - the column name containing the document text in the input .parquet file. Defaults to `contents`. +* --annotation_column - the column name containing hap (toxicity) score in the output .parquet file. Defaults to `hap_score`. + + + + +## Usage Place your input Parquet file in the `test-data/input/` directory. A sample file, `test1.parquet`, is available in this directory. Once done, run the script. ```python @@ -48,6 +56,20 @@ python hap_local_python.py You will obtain the output file `test1.parquet` in the output directory. +### Code example +[notebook](./hap_python.ipynb) + +### Transforming data using the transform image +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +Currently we have: +- [hap test](transforms/universal/hap/python/test/test_hap.py) + + ## Throughput The table below shows the throughput (tokens per second) of the HAP transform module, which primarily includes sentence splitting, HAP annotation, and HAP score aggregation. We herein compare two models: @@ -62,6 +84,7 @@ We processed 6,000 documents (12 MB in Parquet file size) using the HAP transfor | granite-guardian-hap-125m | 1.14 k | - +### Credits +The HAP transform is jointly developed by IBM Research - Tokyo and Yorktown. diff --git a/transforms/universal/hap/python/hap_python.ipynb b/transforms/universal/hap/python/hap_python.ipynb new file mode 100644 index 000000000..62486fb0d --- /dev/null +++ b/transforms/universal/hap/python/hap_python.ipynb @@ -0,0 +1,217 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cefa9cf6-e043-4b75-b416-a0b26c8cb3ad", + "metadata": {}, + "source": [ + "**** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + " make venv \n", + " source venv/bin/activate \n", + " pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4a84e965-feeb-424d-9263-9f127e53a1aa", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "%pip install data-prep-toolkit\n", + "%pip install data-prep-toolkit-transforms==0.2.2.dev3" + ] + }, + { + "cell_type": "markdown", + "id": "1d695832-16bc-48d3-a9c3-6ce650ae4a5c", + "metadata": {}, + "source": [ + "**** Configure the transform parameters. The set of dictionary keys holding DocQualityTransform configuration for values are as follows:\n", + " - model_name_or_path - specify the HAP model, which should be compatible with HuggingFace's AutoModelForSequenceClassification. 
Defaults to IBM's open-source toxicity classifier ibm-granite/granite-guardian-hap-38m.\n", + " - annotation_column - the column name containing hap (toxicity) score in the output .parquet file. Defaults to hap_score.\n", + " - doc_text_column - the column name containing the document text in the input .parquet file. Defaults to contents.\n", + " - batch_size - modify it based on the infrastructure capacity. Defaults to 128.\n", + " - max_length - the maximum length for the tokenizer. Defaults to 512." + ] + }, + { + "cell_type": "markdown", + "id": "3f9dbf94-2db4-492d-bbcb-53ac3948c256", + "metadata": {}, + "source": [ + "***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "38aebf49-9460-4951-bb04-7045dec28690", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package punkt_tab to /Users/ian/nltk_data...\n", + "[nltk_data] Package punkt_tab is already up-to-date!\n" + ] + } + ], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from hap_transform_python import HAPPythonTransformConfiguration" + ] + }, + { + "cell_type": "markdown", + "id": "f443108f-40e4-40e5-a052-e8a7f4fbccdf", + "metadata": {}, + "source": [ + "***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "6a8ec5e4-1f52-4c61-9c9e-4618f9034b80", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "__file__ = os.getcwd()\n", + "input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), \"../test-data/input\"))\n", + "output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), \"../output\"))\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "code_location = {\"github\": \"github\", \"commit_hash\": \"12345\", \"path\": \"path\"}\n", + "\n", + "params = {\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " \"runtime_code_location\": ParamsUtils.convert_to_ast(code_location),\n", + "}\n", + "\n", + "\n", + "hap_params = {\n", + " \"model_name_or_path\": 'ibm-granite/granite-guardian-hap-38m',\n", + " \"annotation_column\": \"hap_score\",\n", + " \"doc_text_column\": \"contents\",\n", + " \"inference_engine\": \"CPU\",\n", + " \"max_length\": 512,\n", + " \"batch_size\": 128,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "d70abda8-3d66-4328-99ce-4075646a7756", + "metadata": {}, + "source": [ + "***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "94e908e2-1891-4dc7-9f85-85bbf8d44c5e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "11:29:11 INFO - hap params are {'model_name_or_path': 'ibm-granite/granite-guardian-hap-38m', 'annotation_column': 'hap_score', 'doc_text_column': 'contents', 'inference_engine': 'CPU', 'max_length': 512, 'batch_size': 128} \n", + "11:29:11 INFO - pipeline id pipeline_id\n", + "11:29:11 INFO - code location {'github': 'github', 'commit_hash': '12345', 'path': 'path'}\n", + "11:29:11 INFO - data factory data_ is using local data access: input_folder - 
/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/test-data/input output_folder - /Users/ian/Desktop/data-prep-kit/transforms/universal/hap/output\n", + "11:29:11 INFO - data factory data_ max_files -1, n_sample -1\n", + "11:29:11 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "11:29:11 INFO - orchestrator hap started at 2024-12-03 11:29:11\n", + "11:29:11 ERROR - No input files to process - exiting\n", + "11:29:11 INFO - Completed execution in 0.0 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params | hap_params)\n", + "launcher = PythonTransformLauncher(runtime_config=HAPPythonTransformConfiguration())\n", + "launcher.launch()" + ] + }, + { + "cell_type": "markdown", + "id": "0bd4ad5c-a1d9-4ea2-abb7-e43571095392", + "metadata": {}, + "source": [ + "**** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f21d5d9b-562d-4530-8cea-2de5b63eb1dc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['../output/metadata.json', '../output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the outputs will be located in the following folders\n", + "import glob\n", + "glob.glob(\"../output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cd3367a-205f-4d33-83fb-106e32173bc0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 389788363..bf7c85577 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 505dd9ceb..fdf9a425e 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index abbb1a30c..38e78938b 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt b/transforms/universal/hap/ray/requirements.txt index 0ed65f625..adf675cac 100644 --- 
a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,6 +1,6 @@ -data-prep-toolkit[ray]==0.2.2.dev2 -dpk-hap-transform-python==0.2.2.dev2 +data-prep-toolkit[ray]>=0.2.3.dev0 +dpk-hap-transform-python==0.2.3.dev0 nltk==3.9.1 transformers==4.38.2 -torch==2.4.1 +torch>=2.2.2,<=2.4.1 pandas==2.2.2 diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index 998161e31..16f07053a 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev2", + "data-prep-toolkit>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 5d475fe12..e848ec793 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index f867fb070..5fe682eef 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-noop-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 95775e3a6..39d9788f8 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt index 89801e4ad..2b32cd843 100644 --- a/transforms/universal/profiler/python/requirements.txt +++ b/transforms/universal/profiler/python/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 mmh3==4.1.0 xxhash==3.4.1 diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index 6060653fa..9e1c49adf 100644 --- 
a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit[ray]==0.2.2.dev2", - "dpk_profiler_transform_python==0.2.2.dev2", + "data-prep-toolkit[ray]>=0.2.3.dev0", + "dpk_profiler_transform_python==0.2.3.dev0", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 455684b4f..08e770278 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-profiler-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 082f37f0c..6fdad69d0 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt index 368287e5d..08447f212 100644 --- a/transforms/universal/resize/python/requirements.txt +++ b/transforms/universal/resize/python/requirements.txt @@ -1 +1 @@ -data-prep-toolkit==0.2.2.dev2 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 \ No newline at end of file diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 1490303bb..1b056fc8f 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 538c12d20..dc5bd98e3 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris 
Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev2", - "data-prep-toolkit[spark]==0.2.2.dev2", + "dpk-resize-transform-python==0.2.3.dev0", + "data-prep-toolkit[spark]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index bc352f0fd..dbb8e84ba 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt index 5e00dbaa1..1fca1f418 100644 --- a/transforms/universal/tokenization/python/requirements.txt +++ b/transforms/universal/tokenization/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev2 +data-prep-toolkit>=0.2.3.dev0 transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index 095cb63e0..6df6b746c 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2.dev2" +version = "0.2.3.dev0" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.2.dev2", - "data-prep-toolkit[ray]==0.2.2.dev2", + "dpk-tokenization-transform-python==0.2.3.dev0", + "data-prep-toolkit[ray]>=0.2.3.dev0", ] [build-system] diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index 1841403a7..1a8ecb408 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -21,16 +21,24 @@ For configuring the crawl, users need to specify the following parameters: The transform can be installed directly from pypi and has a dependency on the data-prep-toolkit and the data-prep-connector +Set up the local environment to run Jupyter notebook: +``` +python -m venv venv +source venv/bin/activate +pip install jupyterlab +``` +Install pre-requisites: + ``` pip install data-prep-connector pip install data-prep-toolkit>=0.2.2.dev2 -pip install data-prep-toolkit-transform[web2parquet]>=0.2.2.dev3 +pip install 'data-prep-toolkit-transforms[web2parquet]>=0.2.2.dev3' ``` If working from a fork in the git repo, from the root folder of the git repo, do the following: ``` -cd transform/universal/web2parquet +cd transforms/universal/web2parquet make venv source venv/bin/activate pip install -r requirements.txt @@ -49,4 +57,4 @@ Web2Parquet(urls= ['https://thealliance.ai/'], depth=2, downloads=10, folder='downloads').transform() -```` \ No newline at end of file +```` diff --git a/transforms/universal/web2parquet/requirements.txt b/transforms/universal/web2parquet/requirements.txt index 5c989591d..1af3f12a4 100644 --- a/transforms/universal/web2parquet/requirements.txt +++ 
b/transforms/universal/web2parquet/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit>=0.2.2.dev2 -data_prep_connector>=0.2.3.dev0 \ No newline at end of file +data-prep-toolkit>=0.2.3.dev0 +data_prep_connector>=0.2.3 \ No newline at end of file diff --git a/transforms/universal/web2parquet/web2parquet.ipynb b/transforms/universal/web2parquet/web2parquet.ipynb index 2bd55f0bc..ea802d734 100644 --- a/transforms/universal/web2parquet/web2parquet.ipynb +++ b/transforms/universal/web2parquet/web2parquet.ipynb @@ -5,12 +5,12 @@ "id": "afd55886-5f5b-4794-838e-ef8179fb0394", "metadata": {}, "source": [ - "##### **** These pip install need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release\n", + "##### **** These pip install need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "##### \n", "\n", - "##### **** example: \n", "```\n", - "python -m venv && source venv/bin/activate\n", - "pip install -r requirements.txt\n", + "make venv \n", + "source venv/bin/activate \n", "pip install jupyterlab\n", "```" ]