From ddedccccd0ec60ea8ff63ab0b9e2354fbb7d1168 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 8 Aug 2024 18:59:34 -0400 Subject: [PATCH 01/29] pytest for most but faild for pdf2parquet and resize Signed-off-by: Maroun Touma --- transforms/packaging/python/pyproject.toml | 42 ++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 transforms/packaging/python/pyproject.toml diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml new file mode 100644 index 000000000..7c8422bc7 --- /dev/null +++ b/transforms/packaging/python/pyproject.toml @@ -0,0 +1,42 @@ +[project] +name = "data_prep_toolkit_transforms" +version = "0.2.1.dev0" +requires-python = ">=3.10,<3.12" +keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +description = "Data Preparation Toolkit Transforms" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] + +dependencies = [ + "dpk_text_encoder_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/text_encoder/python", + "dpk_doc_chunk_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_chunk/python", + "dpk_lang_id_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/lang_id/python", + "dpk_pdf2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/pdf2parquet/python", + "dpk_doc_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_quality/python", + "dpk_ededup_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/ededup/python", + "dpk_filter_transform_python @ 
git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/filter/python", + "dpk_resize_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/resize/python", + "dpk_tokenization_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/tokenization/python", + "dpk_code_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code_quality/python", + "dpk_code2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code2parquet/python", +# "dpk_header_cleanser_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/header_cleanser/python", +# "dpk_malware_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/malware/python", + "dpk_proglang_select_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/proglang_select/python", +] + +[project_urls] +Repository = "https://github.com/IBM/data-prep-kit" +Issues = "https://github.com/IBM/data-prep-kit/issues" +Documentation = "https://ibm.github.io/data-prep-kit/" + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages] +find = {namespaces = false} # Disable implicit namespaces + + From 134c2c202a3a43e1f1fe8781761daf96b0482c0d Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 9 Aug 2024 08:39:17 -0400 Subject: [PATCH 02/29] header_cleanser available for non Darwin installation Signed-off-by: Maroun Touma --- transforms/packaging/python/pyproject.toml | 29 +++++++------ transforms/packaging/ray/pyproject.toml | 50 ++++++++++++++++++++++ 2 files changed, 66 insertions(+), 13 deletions(-) create mode 100644 transforms/packaging/ray/pyproject.toml diff --git a/transforms/packaging/python/pyproject.toml 
b/transforms/packaging/python/pyproject.toml index 7c8422bc7..07fc7bacd 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -11,20 +11,23 @@ authors = [ ] dependencies = [ - "dpk_text_encoder_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/text_encoder/python", - "dpk_doc_chunk_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_chunk/python", - "dpk_lang_id_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/lang_id/python", - "dpk_pdf2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/pdf2parquet/python", - "dpk_doc_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_quality/python", - "dpk_ededup_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/ededup/python", - "dpk_filter_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/filter/python", - "dpk_resize_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/resize/python", - "dpk_tokenization_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/tokenization/python", - "dpk_code_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code_quality/python", - "dpk_code2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code2parquet/python", -# "dpk_header_cleanser_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/header_cleanser/python", -# "dpk_malware_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/malware/python", +## Code + 
"dpk_code_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code_quality/python", + "dpk_code2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code2parquet/python", + "dpk_header_cleanser_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/header_cleanser/python ; platform_system != 'Darwin'", + "dpk_malware_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/malware/python", "dpk_proglang_select_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/proglang_select/python", +## Language + "dpk_doc_chunk_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_chunk/python", + "dpk_doc_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_quality/python", + "dpk_lang_id_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/lang_id/python", + "dpk_pdf2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/pdf2parquet/python", + "dpk_text_encoder_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/text_encoder/python", +## Universal + "dpk_ededup_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/ededup/python", + "dpk_filter_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/filter/python", + "dpk_resize_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/resize/python", + "dpk_tokenization_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/tokenization/python", ] [project_urls] diff --git a/transforms/packaging/ray/pyproject.toml 
b/transforms/packaging/ray/pyproject.toml new file mode 100644 index 000000000..c9104d99e --- /dev/null +++ b/transforms/packaging/ray/pyproject.toml @@ -0,0 +1,50 @@ +[project] +name = "data_prep_toolkit_transforms_ray" +version = "0.2.1.dev0" +requires-python = ">=3.10,<3.12" +keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +description = "Data Preparation Toolkit Transforms" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] + + +dependencies = [ +## Code + "dpk_code_quality_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code_quality/ray", + "dpk_code2parquet_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code2parquet/ray", + "dpk_header_cleanser_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/header_cleanser/ray ; platform_system != 'Darwin'", + "dpk_malware_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/malware/ray", + "dpk_proglang_select_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/proglang_select/ray", + "dpk_repo_level_order_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/repo_level_ordering/ray", +## Language + "dpk_doc_chunk_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_chunk/ray", + "dpk_doc_quality_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_quality/ray", + "dpk_lang_id_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/lang_id/ray", + "dpk_pdf2parquet_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/pdf2parquet/ray", + 
"dpk_text_encoder_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/text_encoder/ray", +## Universal + "dpk_docid_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/doc_id/ray", + "dpk_ededup_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/ededup/ray", + "dpk_fdedup_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/fdedup/ray", + "dpk_filter_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/filter/ray", + "dpk_resize_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/resize/ray", + "dpk_profiler_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/profiler/ray", + "dpk_tokenization_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/tokenization/ray", +] + +[project_urls] +Repository = "https://github.com/IBM/data-prep-kit" +Issues = "https://github.com/IBM/data-prep-kit/issues" +Documentation = "https://ibm.github.io/data-prep-kit/" + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.packages] +find = {namespaces = false} # Disable implicit namespaces + + From e227b9df8d993734bfc68fbf5f54e22149a7e937 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 9 Aug 2024 23:56:21 -0400 Subject: [PATCH 03/29] Added make file with unit tests for all transforms Signed-off-by: Maroun Touma --- transforms/packaging/python/Makefile | 65 ++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 transforms/packaging/python/Makefile diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile new file mode 100644 index 000000000..c66d0b159 --- /dev/null +++ 
b/transforms/packaging/python/Makefile @@ -0,0 +1,65 @@ +# Define the root of the local git clone for the common rules to be able +# know where they are running from. +REPOROOT=../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. + +# $(REPOROOT)/.make.versions file contains the versions + +#TRANSFORM_NAME=doc_quality + +include $(REPOROOT)/transforms/.make.transforms + +venv:: .transforms.python-venv + + +TRANSFORMS_NAMES = code/code_quality \ + code/code2parquet \ + code/header_cleanser \ + code/code_quality \ + code/code2parquet \ + code/header_cleanser \ + code/proglang_select \ + language/doc_chunk \ + language/doc_quality \ + language/lang_id \ + language/pdf2parquet \ + language/text_encoder \ + universal/ededup \ + universal/filter \ + universal/resize \ + universal/tokenization + + +test:: test-src + +clean:: .transforms.clean + +image:: .transforms.python-image + +test-src:: venv + source venv/bin/activate; \ + for T in $(TRANSFORMS_NAMES); do \ + @echo running unit test on: $$T ; \ + $(PYTEST) $(REPOROOT)/transforms/$$T/python/test; \ + done; + + +clean:: .transforms.clean + +image:: .transforms.python-image + +setup:: .transforms.setup + +build:: build-dist + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + +run-local-sample: .transforms.run-local-sample + +run-local-python-sample: .transforms.run-local-python-sample + + From d155a245702017c9cdab39efca0cce6424df0eef Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 12 Aug 2024 17:46:04 -0400 Subject: [PATCH 04/29] Build and deploy pypi package using makefile Signed-off-by: Maroun Touma --- transforms/packaging/.gitignore | 5 ++ transforms/packaging/python/Makefile | 42 +++++++---- transforms/packaging/python/README.md | 33 +++++++++ transforms/packaging/python/pyproject.toml | 66 +++++++++++------ transforms/packaging/ray/Makefile | 85 
++++++++++++++++++++++ transforms/packaging/ray/README.md | 37 ++++++++++ transforms/packaging/ray/pyproject.toml | 64 ++++++++-------- 7 files changed, 263 insertions(+), 69 deletions(-) create mode 100644 transforms/packaging/.gitignore create mode 100644 transforms/packaging/python/README.md create mode 100644 transforms/packaging/ray/Makefile create mode 100644 transforms/packaging/ray/README.md diff --git a/transforms/packaging/.gitignore b/transforms/packaging/.gitignore new file mode 100644 index 000000000..863607847 --- /dev/null +++ b/transforms/packaging/.gitignore @@ -0,0 +1,5 @@ +**/src +**/dist +**/*.egg-info +**/build + diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index c66d0b159..b1c3ad383 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -11,15 +11,10 @@ REPOROOT=../../.. include $(REPOROOT)/transforms/.make.transforms -venv:: .transforms.python-venv - - TRANSFORMS_NAMES = code/code_quality \ code/code2parquet \ code/header_cleanser \ code/code_quality \ - code/code2parquet \ - code/header_cleanser \ code/proglang_select \ language/doc_chunk \ language/doc_quality \ @@ -31,35 +26,52 @@ TRANSFORMS_NAMES = code/code_quality \ universal/resize \ universal/tokenization +venv: + $(MAKE) .defaults.create-venv + source venv/bin/activate; \ + $(PYTHON) -m pip install . 
+ -test:: test-src +test:: setup venv test-src clean:: .transforms.clean + -rm -fr src image:: .transforms.python-image -test-src:: venv +test-src:: source venv/bin/activate; \ for T in $(TRANSFORMS_NAMES); do \ - @echo running unit test on: $$T ; \ + echo running unit test on: $$T ; \ $(PYTEST) $(REPOROOT)/transforms/$$T/python/test; \ done; +test-with-pypi: + $(MAKE) .defaults.create-venv + source venv/bin/activate; \ + $(PYTHON) -m pip install data_prep_toolkit_transforms==0.2.1.dev0 + $(MAKE) test-src -clean:: .transforms.clean -image:: .transforms.python-image +setup: .transforms.setup + $(MAKE) src + +src: + for T in $(TRANSFORMS_NAMES); do \ + echo copy src from $$T ; \ + cp -R $(REPOROOT)/transforms/$$T/python/src/ src/ ; \ + rm -fr *.egg-info ; \ + rm -fr dist ; \ + rm -fr build ; \ + done; -setup:: .transforms.setup build:: build-dist +publish:: publish-dist + build-dist:: .defaults.build-dist publish-dist:: .defaults.publish-dist -run-local-sample: .transforms.run-local-sample - -run-local-python-sample: .transforms.run-local-python-sample - diff --git a/transforms/packaging/python/README.md b/transforms/packaging/python/README.md new file mode 100644 index 000000000..e200a3568 --- /dev/null +++ b/transforms/packaging/python/README.md @@ -0,0 +1,33 @@ +# DPK Python Transforms + +## installation + +The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: + +`python -m pip install data-prep-toolkit-transforms` + +installing the python transforms will also install `data-prep-toolkit` + +## List of Transforms in current package + +* code + * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md) + * header_cleanser (Not available on MacOS) + * code_quality + * proglang_select +* language + * doc_chunk + * *doc_quality + * lang_id + * pdf2parquet + * text_encoder +* universal + * 
ededup + * filter + * resize + * tokenization + + + + + diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml index 07fc7bacd..e51097b9a 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -11,35 +11,53 @@ authors = [ ] dependencies = [ -## Code - "dpk_code_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code_quality/python", - "dpk_code2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code2parquet/python", - "dpk_header_cleanser_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/header_cleanser/python ; platform_system != 'Darwin'", - "dpk_malware_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/malware/python", - "dpk_proglang_select_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/proglang_select/python", -## Language - "dpk_doc_chunk_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_chunk/python", - "dpk_doc_quality_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_quality/python", - "dpk_lang_id_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/lang_id/python", - "dpk_pdf2parquet_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/pdf2parquet/python", - "dpk_text_encoder_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/text_encoder/python", -## Universal - "dpk_ededup_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/ededup/python", - "dpk_filter_transform_python @ 
git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/filter/python", - "dpk_resize_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/resize/python", - "dpk_tokenization_transform_python @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/tokenization/python", + "data-prep-toolkit==0.2.1.dev0", + "argparse", + "boto3==1.34.69", + "bs4==0.0.2", + "clamd==1.0.2", + "docling[ocr]==1.1.2", + "duckdb==0.10.1", + "fasttext==0.9.2", + "filetype >=1.2.0, <2.0.0", + "huggingface-hub >= 0.21.4, <1.0.0", + "langcodes==3.3.0", + "mmh3==4.1.0", + "numpy==1.26.4", + "pandas", + "parameterized", + "pyarrow==16.1.0", + "python-dateutil>=2.8.2", + "pytz>=2020.1", + "quackling==0.1.0", + "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'", + "sentence-transformers==3.0.1", + "transformers==4.38.2", + "tzdata>=2022.7", + "xxhash==3.4.1", ] -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" - [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" -[tool.setuptools.packages] -find = {namespaces = false} # Disable implicit namespaces + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] + + + + diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile new file mode 100644 index 000000000..f05164da1 --- /dev/null +++ b/transforms/packaging/ray/Makefile @@ -0,0 +1,85 @@ +# Define the root of the local git clone for the common rules to be able +# 
know where they are running from. +REPOROOT=../../.. +# Include a library of common .transform.* targets which most +# transforms should be able to reuse. However, feel free +# to override/redefine the rules below. + +# $(REPOROOT)/.make.versions file contains the versions + +include $(REPOROOT)/transforms/.make.transforms + + +## Ray Transforms: `find . -name src | grep ray/src` +TRANSFORMS_NAMES = code/proglang_select \ + code/header_cleanser \ + code/code_quality \ + code/repo_level_ordering \ + code/code2parquet \ + language/doc_quality \ + language/doc_chunk \ + language/lang_id \ + language/text_encoder \ + language/pdf2parquet \ + universal/fdedup \ + universal/tokenization \ + universal/ededup \ + universal/profiler \ + universal/doc_id \ + universal/filter \ + universal/resize + + +venv: + $(MAKE) .defaults.create-venv + source venv/bin/activate; \ + $(PYTHON) -m pip install . + + +test:: setup venv test-src + +clean:: .transforms.clean + -rm -fr src + -rm -fr *.egg-info + -rm -fr dist + -rm -fr build + +image:: .transforms.python-image + +test-src:: + source venv/bin/activate; \ + for T in $(TRANSFORMS_NAMES); do \ + echo running unit test on: $$T/ray/test ; \ + $(PYTEST) $(REPOROOT)/transforms/$$T/ray/test; \ + done; + +test-with-pypi: + $(MAKE) clean + $(MAKE) .defaults.create-venv + source venv/bin/activate; \ + $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==0.2.1.dev0 + $(MAKE) test-src + + +setup: .transforms.setup + $(MAKE) src + +src: + for T in $(TRANSFORMS_NAMES); do \ + echo copy src from $$T/ray/src ; \ + cp -R $(REPOROOT)/transforms/$$T/ray/src/ src/ ; \ + done; + -rm -fr *.egg-info + -rm -fr dist + -rm -fr build + + +build:: build-dist + +publish:: publish-dist + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + + diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md new file mode 100644 index 000000000..9de17d903 --- /dev/null +++ b/transforms/packaging/ray/README.md @@ 
-0,0 +1,37 @@ +# DPK Ray Transforms + +## installation + +The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: + +`python -m pip install data-prep-toolkit-transforms-ray` + +installing the Ray transforms will also install `data_prep_toolkit_transforms` and `data-prep-toolkit-ray` + +## List of Ray Transforms availabe in current package + +* code + * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) + * proglang_select + * header_cleanser (Not available on MacOS) + * code_quality + * repo_level_ordering +* language + * doc_quality + * doc_chunk + * lang_id + * text_encoder + * pdf2parquet +* universal + * fdedup + * tokenization + * ededup + * profiler + * doc_id + * filter + * resize + + + + + diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml index c9104d99e..047e20692 100644 --- a/transforms/packaging/ray/pyproject.toml +++ b/transforms/packaging/ray/pyproject.toml @@ -3,48 +3,52 @@ name = "data_prep_toolkit_transforms_ray" version = "0.2.1.dev0" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -description = "Data Preparation Toolkit Transforms" +description = "Data Preparation Toolkit Transforms using Ray" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Maroun Touma", email = "touma@us.ibm.com" }, ] - dependencies = [ -## Code - "dpk_code_quality_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code_quality/ray", - "dpk_code2parquet_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/code2parquet/ray", - "dpk_header_cleanser_transform_ray @ 
git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/header_cleanser/ray ; platform_system != 'Darwin'", - "dpk_malware_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/malware/ray", - "dpk_proglang_select_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/proglang_select/ray", - "dpk_repo_level_order_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/code/repo_level_ordering/ray", -## Language - "dpk_doc_chunk_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_chunk/ray", - "dpk_doc_quality_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/doc_quality/ray", - "dpk_lang_id_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/lang_id/ray", - "dpk_pdf2parquet_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/pdf2parquet/ray", - "dpk_text_encoder_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/language/text_encoder/ray", -## Universal - "dpk_docid_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/doc_id/ray", - "dpk_ededup_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/ededup/ray", - "dpk_fdedup_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/fdedup/ray", - "dpk_filter_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/filter/ray", - "dpk_resize_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/resize/ray", - "dpk_profiler_transform_ray @ git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/profiler/ray", - "dpk_tokenization_transform_ray @ 
git+https://github.com/IBM/data-prep-kit.git#subdirectory=transforms/universal/tokenization/ray", + "data_prep_toolkit_transforms==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev0", + "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'", + "parameterized", + "pandas", + "networkx==3.3", + "colorlog==6.8.2", + "func-timeout==4.3.5", + "pandas==2.2.2", + "emerge-viz==2.0.0", + "tqdm==4.66.3", + "mmh3==4.1.0", + "xxhash==3.4.1", + "tqdm==4.66.3", + "scipy==1.12.0", ] -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" - [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" -[tool.setuptools.packages] -find = {namespaces = false} # Disable implicit namespaces + +[options] +package_dir = ["src"] + +[options.packages.find] +where = ["src/"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] + + + + From fe0d3807ae02d413e7692005ec1f7e3efd9249d9 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 13 Aug 2024 00:29:11 -0400 Subject: [PATCH 05/29] remove pandas and keep pandas=2.2.2 Signed-off-by: Maroun Touma --- transforms/packaging/ray/pyproject.toml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml index 047e20692..856f95ac9 100644 --- a/transforms/packaging/ray/pyproject.toml +++ b/transforms/packaging/ray/pyproject.toml @@ -15,17 +15,16 @@ dependencies = [ "data-prep-toolkit-ray==0.2.1.dev0", "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'", "parameterized", - "pandas", - "networkx==3.3", - "colorlog==6.8.2", 
- "func-timeout==4.3.5", - "pandas==2.2.2", - "emerge-viz==2.0.0", "tqdm==4.66.3", "mmh3==4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", "scipy==1.12.0", + "networkx==3.3", + "colorlog==6.8.2", + "func-timeout==4.3.5", + "pandas==2.2.2", + "emerge-viz==2.0.0", ] [build-system] From d96477af0668acc731807a14239f088ab3cacb44 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 21 Aug 2024 19:32:33 -0400 Subject: [PATCH 06/29] added PII and HTML2Parquet Signed-off-by: Maroun Touma --- transforms/packaging/.make.packaging | 48 +++++++++++++++++ transforms/packaging/python/Makefile | 57 ++++---------------- transforms/packaging/python/pyproject.toml | 4 +- transforms/packaging/ray/Makefile | 62 ++++++---------------- transforms/packaging/ray/pyproject.toml | 4 +- 5 files changed, 78 insertions(+), 97 deletions(-) create mode 100644 transforms/packaging/.make.packaging diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging new file mode 100644 index 000000000..cb44cdbcf --- /dev/null +++ b/transforms/packaging/.make.packaging @@ -0,0 +1,48 @@ + +venv: + $(MAKE) .defaults.create-venv + +test:: setup test-src + @# Help: Setup environment, load wheel from pyproject.toml and run unit tests for all transforms + +clean:: .transforms.clean + -rm -fr src + +image:: .transforms.python-image + +test-src:: + source venv/bin/activate; \ + for T in $(TRANSFORMS_NAMES); do \ + echo running unit test on: $$T ; \ + $(PYTEST) $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \ + done; + @# Help: Run all unit tests from the same venv environment (should follow make venv) + + +setup: .transforms.setup venv + $(MAKE) src + source venv/bin/activate; \ + $(PYTHON) -m pip install . 
+ @# Help: Do any default transform setup before running make src + + +src: + for T in $(TRANSFORMS_NAMES); do \ + echo copy src from $$T ; \ + cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/ src/ ; \ + rm -fr *.egg-info ; \ + rm -fr dist ; \ + rm -fr build ; \ + done; + @# Help: Setup src folder and remove old distribution + + +build:: build-dist + +publish:: publish-dist + +build-dist:: .defaults.build-dist + +publish-dist:: .defaults.publish-dist + + diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index b1c3ad383..10c736c37 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -1,15 +1,17 @@ -# Define the root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../.. # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. +# to override/redefine the rules below. # $(REPOROOT)/.make.versions file contains the versions -#TRANSFORM_NAME=doc_quality - include $(REPOROOT)/transforms/.make.transforms +include ../.make.packaging + +PACKAGING_RUN_TIME=python +DPK_TRNASFORM_REV=0.2.1.dev1 TRANSFORMS_NAMES = code/code_quality \ code/code2parquet \ @@ -21,57 +23,20 @@ TRANSFORMS_NAMES = code/code_quality \ language/lang_id \ language/pdf2parquet \ language/text_encoder \ + language/pii_redactor \ universal/ededup \ universal/filter \ universal/resize \ - universal/tokenization - -venv: - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install . 
- - -test:: setup venv test-src - -clean:: .transforms.clean - -rm -fr src - -image:: .transforms.python-image - -test-src:: - source venv/bin/activate; \ - for T in $(TRANSFORMS_NAMES); do \ - echo running unit test on: $$T ; \ - $(PYTEST) $(REPOROOT)/transforms/$$T/python/test; \ - done; + universal/tokenization \ + universal/html2parquet test-with-pypi: $(MAKE) .defaults.create-venv source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms==0.2.1.dev0 + $(PYTHON) -m pip install data_prep_toolkit_transforms==$(DPK_TRNASFORM_REV) $(MAKE) test-src + @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) -setup: .transforms.setup - $(MAKE) src - -src: - for T in $(TRANSFORMS_NAMES); do \ - echo copy src from $$T ; \ - cp -R $(REPOROOT)/transforms/$$T/python/src/ src/ ; \ - rm -fr *.egg-info ; \ - rm -fr dist ; \ - rm -fr build ; \ - done; - - -build:: build-dist - -publish:: publish-dist - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml index e51097b9a..cee11f584 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.1.dev0" +version = "0.2.1.dev1" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms" @@ -11,7 +11,7 @@ authors = [ ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev1", "argparse", "boto3==1.34.69", "bs4==0.0.2", diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile index f05164da1..0f5aafc5e 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -1,14 +1,17 @@ -# Define the 
root of the local git clone for the common rules to be able +# Define the root of the local git clone for the common rules to be able # know where they are running from. REPOROOT=../../.. # Include a library of common .transform.* targets which most # transforms should be able to reuse. However, feel free -# to override/redefine the rules below. +# to override/redefine the rules below. # $(REPOROOT)/.make.versions file contains the versions include $(REPOROOT)/transforms/.make.transforms +include ../.make.packaging +PACKAGING_RUN_TIME=ray +DPK_TRNASFORM_REV=0.2.1.dev0 ## Ray Transforms: `find . -name src | grep ray/src` TRANSFORMS_NAMES = code/proglang_select \ @@ -21,6 +24,7 @@ TRANSFORMS_NAMES = code/proglang_select \ language/lang_id \ language/text_encoder \ language/pdf2parquet \ + language/pii_redactor \ universal/fdedup \ universal/tokenization \ universal/ededup \ @@ -29,57 +33,21 @@ TRANSFORMS_NAMES = code/proglang_select \ universal/filter \ universal/resize - -venv: +test-with-local-python: + $(MAKE) clean $(MAKE) .defaults.create-venv source venv/bin/activate; \ - $(PYTHON) -m pip install . - - -test:: setup venv test-src - -clean:: .transforms.clean - -rm -fr src - -rm -fr *.egg-info - -rm -fr dist - -rm -fr build - -image:: .transforms.python-image - -test-src:: - source venv/bin/activate; \ - for T in $(TRANSFORMS_NAMES); do \ - echo running unit test on: $$T/ray/test ; \ - $(PYTEST) $(REPOROOT)/transforms/$$T/ray/test; \ - done; + cd ../python; \ + $(PYTHON) -m pip install . ; \ + cd ../ray; \ + $(PYTHON) -m pip install . 
; \ + $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRNASFORM_REV) + $(MAKE) test-src test-with-pypi: $(MAKE) clean $(MAKE) .defaults.create-venv source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==0.2.1.dev0 + $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRNASFORM_REV) $(MAKE) test-src - -setup: .transforms.setup - $(MAKE) src - -src: - for T in $(TRANSFORMS_NAMES); do \ - echo copy src from $$T/ray/src ; \ - cp -R $(REPOROOT)/transforms/$$T/ray/src/ src/ ; \ - done; - -rm -fr *.egg-info - -rm -fr dist - -rm -fr build - - -build:: build-dist - -publish:: publish-dist - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - - diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml index 856f95ac9..1ce30880b 100644 --- a/transforms/packaging/ray/pyproject.toml +++ b/transforms/packaging/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev1" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -11,7 +11,7 @@ authors = [ ] dependencies = [ - "data_prep_toolkit_transforms==0.2.1.dev0", + #"data_prep_toolkit_transforms==0.2.1.dev1", "data-prep-toolkit-ray==0.2.1.dev0", "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'", "parameterized", From 35c7e60add792f02bca0e1cc1bfb24310d8b8671 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 28 Aug 2024 10:39:00 -0400 Subject: [PATCH 07/29] Update with latest available transforms Signed-off-by: Maroun Touma --- .make.versions | 6 +++-- data-processing-lib/python/pyproject.toml | 2 +- data-processing-lib/ray/pyproject.toml | 4 ++-- transforms/packaging/python/requirements.txt | 24 ++++++++++++++++++++ transforms/packaging/ray/requirements.txt 
| 15 ++++++++++++ 5 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 transforms/packaging/python/requirements.txt create mode 100644 transforms/packaging/ray/requirements.txt diff --git a/.make.versions b/.make.versions index 3ea2ae2d1..973344f56 100644 --- a/.make.versions +++ b/.make.versions @@ -19,7 +19,7 @@ DPK_MINOR_VERSION=2 DPK_MICRO_VERSION=1 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. -DPK_VERSION_SUFFIX=.dev0 +DPK_VERSION_SUFFIX=.dev2 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -101,6 +101,8 @@ HEADER_CLEANSER_RAY_VERSION=$(DPK_VERSION) PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) +DPK_TRNASFORM_REV=$(DPK_VERSION) + ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. 
@@ -114,4 +116,4 @@ ifeq ($(KFPv2), 1) WORKFLOW_SUPPORT_LIB=kfp_v2_workflow_support else WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support -endif \ No newline at end of file +endif diff --git a/data-processing-lib/python/pyproject.toml b/data-processing-lib/python/pyproject.toml index d8e98aa8d..ebeb2f524 100644 --- a/data-processing-lib/python/pyproject.toml +++ b/data-processing-lib/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.1.dev0" +version = "0.2.1.dev2" requires-python = ">=3.10" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Library" diff --git a/data-processing-lib/ray/pyproject.toml b/data-processing-lib/ray/pyproject.toml index 8fad2e9de..602877d68 100644 --- a/data-processing-lib/ray/pyproject.toml +++ b/data-processing-lib/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev2" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10" description = "Data Preparation Toolkit Library for Ray" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev2", "ray[default]==2.24.0", # These two are to fix security issues identified by quay.io "fastapi>=0.110.2", diff --git a/transforms/packaging/python/requirements.txt b/transforms/packaging/python/requirements.txt new file mode 100644 index 000000000..1fc9bab2c --- /dev/null +++ b/transforms/packaging/python/requirements.txt @@ -0,0 +1,24 @@ + data-prep-toolkit==0.2.1.dev2 + argparse + boto3>=1.34.69 + bs4==0.0.2 + clamd==1.0.2 + docling[ocr]==1.1.2 + duckdb==0.10.1 + fasttext==0.9.2 + filetype >=1.2.0, <2.0.0 + huggingface-hub >= 0.21.4, <1.0.0 + langcodes==3.3.0 + mmh3==4.1.0 + numpy==1.26.4 + 
pandas + parameterized + pyarrow==16.1.0 + python-dateutil>=2.8.2 + pytz>=2020.1 + quackling==0.1.0 + scancode-toolkit==32.1.0 ; platform_system != 'Darwin' + sentence-transformers==3.0.1 + transformers==4.38.2 + tzdata>=2022.7 + xxhash==3.4.1 diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt new file mode 100644 index 000000000..026552f5f --- /dev/null +++ b/transforms/packaging/ray/requirements.txt @@ -0,0 +1,15 @@ +data-prep-toolkit-ray==0.2.1.dev2 +data_prep_toolkit_transforms==0.2.1.dev2 +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' +parameterized +tqdm==4.66.3 +mmh3==4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +scipy==1.12.0 +networkx==3.3 +colorlog==6.8.2 +func-timeout==4.3.5 +pandas==2.2.2 +emerge-viz==2.0.0 + From d66df271211401fd7646adc555b63c552843f6cf Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 28 Aug 2024 10:47:42 -0400 Subject: [PATCH 08/29] restructure things to be able to test and build independently Signed-off-by: Maroun Touma --- transforms/packaging/.make.packaging | 13 ++++----- transforms/packaging/python/Makefile | 15 +++++++--- transforms/packaging/python/pyproject.toml | 34 ++++------------------ transforms/packaging/ray/Makefile | 20 +++++-------- transforms/packaging/ray/pyproject.toml | 25 ++++------------ 5 files changed, 36 insertions(+), 71 deletions(-) diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index cb44cdbcf..c17db7572 100644 --- a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -2,29 +2,28 @@ venv: $(MAKE) .defaults.create-venv -test:: setup test-src - @# Help: Setup environment, load wheel from pyproject.toml and run unit tests for all transforms +test:: test-src clean:: .transforms.clean -rm -fr src image:: .transforms.python-image -test-src:: +run-ut:: source venv/bin/activate; \ for T in $(TRANSFORMS_NAMES); do \ echo running unit test on: $$T ; \ $(PYTEST) 
$(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \ done; - @# Help: Run all unit tests from the same venv environment (should follow make venv) + @# Help: Setup environment and run unit tests for all transforms setup: .transforms.setup venv $(MAKE) src source venv/bin/activate; \ $(PYTHON) -m pip install . - @# Help: Do any default transform setup before running make src - + @# Help: Do any default transform setup before running make src and setting up a test environment + src: for T in $(TRANSFORMS_NAMES); do \ @@ -41,7 +40,7 @@ build:: build-dist publish:: publish-dist -build-dist:: .defaults.build-dist +build-dist:: src .defaults.build-dist publish-dist:: .defaults.publish-dist diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index 10c736c37..f9a541e44 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -11,7 +11,6 @@ include $(REPOROOT)/transforms/.make.transforms include ../.make.packaging PACKAGING_RUN_TIME=python -DPK_TRNASFORM_REV=0.2.1.dev1 TRANSFORMS_NAMES = code/code_quality \ code/code2parquet \ @@ -23,18 +22,26 @@ TRANSFORMS_NAMES = code/code_quality \ language/lang_id \ language/pdf2parquet \ language/text_encoder \ - language/pii_redactor \ universal/ededup \ universal/filter \ universal/resize \ universal/tokenization \ - universal/html2parquet + universal/doc_id \ + universal/resize + + +test-src:: .transforms.setup venv + source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . + source venv/bin/activate && cd ../python && $(MAKE) src && $(PYTHON) -m pip install . 
+ $(MAKE) run-ut + @# Help: Do any default transform setup before running make src and setting up a test environment test-with-pypi: + $(MAKE) clean $(MAKE) .defaults.create-venv source venv/bin/activate; \ $(PYTHON) -m pip install data_prep_toolkit_transforms==$(DPK_TRNASFORM_REV) - $(MAKE) test-src + $(MAKE) run-ut @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml index cee11f584..966710b3e 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.1.dev1" +version = "0.2.1.dev2" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms" @@ -9,41 +9,17 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Maroun Touma", email = "touma@us.ibm.com" }, ] - -dependencies = [ - "data-prep-toolkit==0.2.1.dev1", - "argparse", - "boto3==1.34.69", - "bs4==0.0.2", - "clamd==1.0.2", - "docling[ocr]==1.1.2", - "duckdb==0.10.1", - "fasttext==0.9.2", - "filetype >=1.2.0, <2.0.0", - "huggingface-hub >= 0.21.4, <1.0.0", - "langcodes==3.3.0", - "mmh3==4.1.0", - "numpy==1.26.4", - "pandas", - "parameterized", - "pyarrow==16.1.0", - "python-dateutil>=2.8.2", - "pytz>=2020.1", - "quackling==0.1.0", - "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'", - "sentence-transformers==3.0.1", - "transformers==4.38.2", - "tzdata>=2022.7", - "xxhash==3.4.1", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} [options] -package_dir = ["src"] +package_dir 
= ["src", "test"] [options.packages.find] where = ["src/"] diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile index 0f5aafc5e..339c7cd9c 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -11,7 +11,6 @@ include $(REPOROOT)/transforms/.make.transforms include ../.make.packaging PACKAGING_RUN_TIME=ray -DPK_TRNASFORM_REV=0.2.1.dev0 ## Ray Transforms: `find . -name src | grep ray/src` TRANSFORMS_NAMES = code/proglang_select \ @@ -24,7 +23,6 @@ TRANSFORMS_NAMES = code/proglang_select \ language/lang_id \ language/text_encoder \ language/pdf2parquet \ - language/pii_redactor \ universal/fdedup \ universal/tokenization \ universal/ededup \ @@ -33,16 +31,14 @@ TRANSFORMS_NAMES = code/proglang_select \ universal/filter \ universal/resize -test-with-local-python: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - cd ../python; \ - $(PYTHON) -m pip install . ; \ - cd ../ray; \ - $(PYTHON) -m pip install . ; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRNASFORM_REV) - $(MAKE) test-src +test-src:: .transforms.setup venv + source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . + source venv/bin/activate && cd ../../../data-processing-lib/ray && $(PYTHON) -m pip install . + source venv/bin/activate && cd ../python && $(MAKE) src && $(PYTHON) -m pip install . + source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . 
+ $(MAKE) run-ut + @# Help: Do any default transform setup before running make src and setting up a test environment + test-with-pypi: $(MAKE) clean diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml index 1ce30880b..9beefd32a 100644 --- a/transforms/packaging/ray/pyproject.toml +++ b/transforms/packaging/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms_ray" -version = "0.2.1.dev1" +version = "0.2.1.dev2" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -9,31 +9,18 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Maroun Touma", email = "touma@us.ibm.com" }, ] - -dependencies = [ - #"data_prep_toolkit_transforms==0.2.1.dev1", - "data-prep-toolkit-ray==0.2.1.dev0", - "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'", - "parameterized", - "tqdm==4.66.3", - "mmh3==4.1.0", - "xxhash==3.4.1", - "tqdm==4.66.3", - "scipy==1.12.0", - "networkx==3.3", - "colorlog==6.8.2", - "func-timeout==4.3.5", - "pandas==2.2.2", - "emerge-viz==2.0.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [options] -package_dir = ["src"] +package_dir = ["src","test"] [options.packages.find] where = ["src/"] From a4f7e0af2d6858a61408333e6822fafa586e501c Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 30 Aug 2024 15:08:37 -0400 Subject: [PATCH 09/29] publish dev2 for python Signed-off-by: Maroun Touma --- transforms/packaging/python/Makefile | 2 +- transforms/packaging/python/requirements.txt | 48 ++++++++++---------- transforms/packaging/ray/Makefile | 4 +- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git 
a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index f9a541e44..020b9b661 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -31,7 +31,7 @@ TRANSFORMS_NAMES = code/code_quality \ test-src:: .transforms.setup venv - source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . +# source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . source venv/bin/activate && cd ../python && $(MAKE) src && $(PYTHON) -m pip install . $(MAKE) run-ut @# Help: Do any default transform setup before running make src and setting up a test environment diff --git a/transforms/packaging/python/requirements.txt b/transforms/packaging/python/requirements.txt index 1fc9bab2c..326c6c3f1 100644 --- a/transforms/packaging/python/requirements.txt +++ b/transforms/packaging/python/requirements.txt @@ -1,24 +1,24 @@ - data-prep-toolkit==0.2.1.dev2 - argparse - boto3>=1.34.69 - bs4==0.0.2 - clamd==1.0.2 - docling[ocr]==1.1.2 - duckdb==0.10.1 - fasttext==0.9.2 - filetype >=1.2.0, <2.0.0 - huggingface-hub >= 0.21.4, <1.0.0 - langcodes==3.3.0 - mmh3==4.1.0 - numpy==1.26.4 - pandas - parameterized - pyarrow==16.1.0 - python-dateutil>=2.8.2 - pytz>=2020.1 - quackling==0.1.0 - scancode-toolkit==32.1.0 ; platform_system != 'Darwin' - sentence-transformers==3.0.1 - transformers==4.38.2 - tzdata>=2022.7 - xxhash==3.4.1 +data-prep-toolkit==0.2.1.dev2 +argparse +boto3>=1.34.69 +bs4==0.0.2 +clamd==1.0.2 +docling[ocr]==1.1.2 +duckdb==0.10.1 +fasttext==0.9.2 +filetype >=1.2.0, <2.0.0 +huggingface-hub >= 0.21.4, <1.0.0 +langcodes==3.3.0 +mmh3==4.1.0 +numpy==1.26.4 +pandas +parameterized +pyarrow==16.1.0 +python-dateutil>=2.8.2 +pytz>=2020.1 +quackling==0.1.0 +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' +sentence-transformers==3.0.1 +transformers==4.38.2 +tzdata>=2022.7 +xxhash==3.4.1 diff --git a/transforms/packaging/ray/Makefile 
b/transforms/packaging/ray/Makefile index 339c7cd9c..276b54fd0 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -32,8 +32,8 @@ TRANSFORMS_NAMES = code/proglang_select \ universal/resize test-src:: .transforms.setup venv - source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . - source venv/bin/activate && cd ../../../data-processing-lib/ray && $(PYTHON) -m pip install . +# source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . +# source venv/bin/activate && cd ../../../data-processing-lib/ray && $(PYTHON) -m pip install . source venv/bin/activate && cd ../python && $(MAKE) src && $(PYTHON) -m pip install . source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . $(MAKE) run-ut From 0af03cc2f9e06b9f1ccc50f7e92a423430ddddf4 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Fri, 30 Aug 2024 19:52:24 -0400 Subject: [PATCH 10/29] finish testing and publish dev2 python and ray packages: Signed-off-by: Maroun Touma --- transforms/packaging/python/Makefile | 14 +++++++++----- transforms/packaging/python/README.md | 2 ++ transforms/packaging/ray/Makefile | 10 ++++++++++ transforms/packaging/ray/README.md | 1 + 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index 020b9b661..4f6845532 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -12,10 +12,15 @@ include ../.make.packaging PACKAGING_RUN_TIME=python + +#Excluded List +# ./code/malware +# ./language/pii_redactor +# ./universal/html2parquet + TRANSFORMS_NAMES = code/code_quality \ - code/code2parquet \ - code/header_cleanser \ - code/code_quality \ + code/code2parquet \ + code/header_cleanser \ code/proglang_select \ language/doc_chunk \ language/doc_quality \ @@ -26,8 +31,7 @@ TRANSFORMS_NAMES = code/code_quality \ universal/filter \ 
universal/resize \ universal/tokenization \ - universal/doc_id \ - universal/resize + universal/doc_id test-src:: .transforms.setup venv diff --git a/transforms/packaging/python/README.md b/transforms/packaging/python/README.md index e200a3568..08e8d04de 100644 --- a/transforms/packaging/python/README.md +++ b/transforms/packaging/python/README.md @@ -26,6 +26,8 @@ installing the python transforms will also install `data-prep-toolkit` * filter * resize * tokenization + * doc_id + diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile index 276b54fd0..ce32186f8 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -12,6 +12,11 @@ include ../.make.packaging PACKAGING_RUN_TIME=ray +# Excluded from build +# ./code/malware/ray +# ./language/pii_redactor/ray + + ## Ray Transforms: `find . -name src | grep ray/src` TRANSFORMS_NAMES = code/proglang_select \ code/header_cleanser \ @@ -39,6 +44,11 @@ test-src:: .transforms.setup venv $(MAKE) run-ut @# Help: Do any default transform setup before running make src and setting up a test environment +test-with-python-pypi: + $(MAKE) clean + $(MAKE) .defaults.create-venv + source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . 
+ $(MAKE) test-src test-with-pypi: $(MAKE) clean diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md index 9de17d903..497ed42d0 100644 --- a/transforms/packaging/ray/README.md +++ b/transforms/packaging/ray/README.md @@ -34,4 +34,5 @@ installing the Ray transforms will also install `data_prep_toolkit_transforms` a + From 703ebe023af731a6d90f3e98499b7860e356d8a6 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 9 Sep 2024 12:59:40 -0500 Subject: [PATCH 11/29] try different dependencies in attempt to resolve conflicts Signed-off-by: Maroun Touma --- data-processing-lib/python/pyproject.toml | 2 +- data-processing-lib/ray/pyproject.toml | 4 +-- examples/notebooks/rag/requirements.txt | 6 ++-- transforms/packaging/.make.packaging | 3 +- transforms/packaging/python/Makefile | 4 +++ transforms/packaging/python/pyproject.toml | 6 ++-- .../python/requirements.transforms.python.txt | 32 +++++++++++++++++++ transforms/packaging/python/requirements.txt | 24 -------------- transforms/packaging/ray/Makefile | 7 +++- transforms/packaging/ray/pyproject.toml | 6 ++-- .../ray/requirements.transforms.ray.txt | 25 +++++++++++++++ transforms/packaging/ray/requirements.txt | 15 --------- 12 files changed, 81 insertions(+), 53 deletions(-) create mode 100644 transforms/packaging/python/requirements.transforms.python.txt delete mode 100644 transforms/packaging/python/requirements.txt create mode 100644 transforms/packaging/ray/requirements.transforms.ray.txt delete mode 100644 transforms/packaging/ray/requirements.txt diff --git a/data-processing-lib/python/pyproject.toml b/data-processing-lib/python/pyproject.toml index ebeb2f524..9ff6c2d7f 100644 --- a/data-processing-lib/python/pyproject.toml +++ b/data-processing-lib/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.1.dev2" +version = "0.2.1.dev3" requires-python = ">=3.10" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", 
"ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Library" diff --git a/data-processing-lib/ray/pyproject.toml b/data-processing-lib/ray/pyproject.toml index 602877d68..3f347cdf4 100644 --- a/data-processing-lib/ray/pyproject.toml +++ b/data-processing-lib/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_ray" -version = "0.2.1.dev2" +version = "0.2.1.dev3" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10" description = "Data Preparation Toolkit Library for Ray" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev2", + "data-prep-toolkit>=0.2.1.dev3", "ray[default]==2.24.0", # These two are to fix security issues identified by quay.io "fastapi>=0.110.2", diff --git a/examples/notebooks/rag/requirements.txt b/examples/notebooks/rag/requirements.txt index 3c1a464d0..4578b1ea8 100644 --- a/examples/notebooks/rag/requirements.txt +++ b/examples/notebooks/rag/requirements.txt @@ -1,6 +1,6 @@ ## Data prep kit -data-prep-toolkit-transforms==0.2.1.dev1 -data-prep-toolkit-transforms-ray==0.2.1.dev1 +#data-prep-toolkit-transforms==0.2.1.dev1 +#data-prep-toolkit-transforms-ray==0.2.1.dev1 @@ -53,4 +53,4 @@ ipython ipywidgets IProgress chardet==5.2.0 -charset-normalizer==3.3.2 \ No newline at end of file +charset-normalizer==3.3.2 diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging index c17db7572..0ecc05484 100644 --- a/transforms/packaging/.make.packaging +++ b/transforms/packaging/.make.packaging @@ -26,9 +26,10 @@ setup: .transforms.setup venv src: + mkdir src for T in $(TRANSFORMS_NAMES); do \ echo copy src from $$T ; \ - cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/ src/ ; \ + cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ rm -fr *.egg-info ; \ rm -fr dist ; \ rm -fr build ; \ 
diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index 4f6845532..60fd0f766 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -33,6 +33,10 @@ TRANSFORMS_NAMES = code/code_quality \ universal/tokenization \ universal/doc_id +# language/doc_chunk has conflict dependencies with pdf2parquet that need to be resolved +# doc_chunk depends on docling>=1.8.2,<2.0.0 +# pdf2parquet depends on docling==1.7.0 + test-src:: .transforms.setup venv # source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml index 966710b3e..f4e09f29b 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms" -version = "0.2.1.dev2" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms" @@ -16,10 +16,10 @@ requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} +dependencies = {file = ["requirements.transforms.python.txt"]} [options] -package_dir = ["src", "test"] +package_dir = ["src"] [options.packages.find] where = ["src/"] diff --git a/transforms/packaging/python/requirements.transforms.python.txt b/transforms/packaging/python/requirements.transforms.python.txt new file mode 100644 index 000000000..a907e60a4 --- /dev/null +++ b/transforms/packaging/python/requirements.transforms.python.txt @@ -0,0 +1,32 @@ +data-prep-toolkit>=0.2.1.dev3 +bs4==0.0.2 +#docling 1.9.0 depends on docling-parse<2.0.0 and >=1.1.3 +#pdf2parquet depends on docling-parse==1.0.0 +#docling 
1.8.5 depends on docling-parse<2.0.0 and >=1.1.3 +docling-parse>=1.0.0, +# language/doc_chunk has conflict dependencies with pdf2parquet that need to be resolved +# doc_chunk depends on docling>=1.8.2,<2.0.0 +# pdf2parquet depends on docling==1.7.0 +#docling==1.7.0, +docling>=1.8.2,<2.0.0, +llama-index-core>=0.11.1,<0.12.0, +docling-core>=1.1.2,<2.0.0, +quackling==0.1.1, +# quackling will pull +# docling>=1.8.2,<2.0.0 +# llama-index-core<0.12.0,>=0.11.1 +# docling-core<2.0.0,>=1.1.2 +filetype >=1.2.0, <2.0.0 +duckdb==0.10.1 +fasttext==0.9.2 +huggingface-hub >= 0.21.4, <1.0.0 +langcodes==3.3.0 +mmh3==4.1.0 +numpy==1.26.4 +pandas +parameterized +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' +sentence-transformers==3.0.1 +transformers==4.38.2 +xxhash==3.4.1 + diff --git a/transforms/packaging/python/requirements.txt b/transforms/packaging/python/requirements.txt deleted file mode 100644 index 326c6c3f1..000000000 --- a/transforms/packaging/python/requirements.txt +++ /dev/null @@ -1,24 +0,0 @@ -data-prep-toolkit==0.2.1.dev2 -argparse -boto3>=1.34.69 -bs4==0.0.2 -clamd==1.0.2 -docling[ocr]==1.1.2 -duckdb==0.10.1 -fasttext==0.9.2 -filetype >=1.2.0, <2.0.0 -huggingface-hub >= 0.21.4, <1.0.0 -langcodes==3.3.0 -mmh3==4.1.0 -numpy==1.26.4 -pandas -parameterized -pyarrow==16.1.0 -python-dateutil>=2.8.2 -pytz>=2020.1 -quackling==0.1.0 -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -sentence-transformers==3.0.1 -transformers==4.38.2 -tzdata>=2022.7 -xxhash==3.4.1 diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile index ce32186f8..370948be7 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -23,8 +23,8 @@ TRANSFORMS_NAMES = code/proglang_select \ code/code_quality \ code/repo_level_ordering \ code/code2parquet \ - language/doc_quality \ language/doc_chunk \ + language/doc_quality \ language/lang_id \ language/text_encoder \ language/pdf2parquet \ @@ -36,6 +36,11 @@ TRANSFORMS_NAMES = 
code/proglang_select \ universal/filter \ universal/resize +# doc chunk has conflict dependencies with pdf2parquet that need to be resolved +# doc_chunk depends on docling>=1.8.2,<2.0.0 +# pdf2parquet depends on docling==1.7.0 + + test-src:: .transforms.setup venv # source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . # source venv/bin/activate && cd ../../../data-processing-lib/ray && $(PYTHON) -m pip install . diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml index 9beefd32a..a7f229647 100644 --- a/transforms/packaging/ray/pyproject.toml +++ b/transforms/packaging/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_transforms_ray" -version = "0.2.1.dev2" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -16,11 +16,11 @@ requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} +dependencies = {file = ["requirements.transforms.ray.txt"]} [options] -package_dir = ["src","test"] +package_dir = ["src"] [options.packages.find] where = ["src/"] diff --git a/transforms/packaging/ray/requirements.transforms.ray.txt b/transforms/packaging/ray/requirements.transforms.ray.txt new file mode 100644 index 000000000..cd1e0142f --- /dev/null +++ b/transforms/packaging/ray/requirements.transforms.ray.txt @@ -0,0 +1,25 @@ +data-prep-toolkit-ray>=0.2.1.dev3 +data_prep_toolkit_transforms>=0.2.1.dev3 +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' +parameterized +tqdm==4.66.3 +mmh3==4.1.0 +xxhash==3.4.1 +tqdm==4.66.3 +#The conflict is caused by: +# ray fdedup depends on scipy==1.12.0 +# docling 1.7.0 depends on scipy<2.0.0 and >=1.14.1 +scipy>=1.12.0 
+networkx==3.3 +colorlog==6.8.2 +func-timeout==4.3.5 +pandas==2.2.2 +emerge-viz==2.0.0 + +#Note: +# when installing data-processing-library-ray, get the following +# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. +# deepsearch-toolkit 1.0.0 requires platformdirs<4.0.0,>=3.5.1, but you have platformdirs 4.3.2 which is incompatible. + + + diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt deleted file mode 100644 index 026552f5f..000000000 --- a/transforms/packaging/ray/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -data-prep-toolkit-ray==0.2.1.dev2 -data_prep_toolkit_transforms==0.2.1.dev2 -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -parameterized -tqdm==4.66.3 -mmh3==4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy==1.12.0 -networkx==3.3 -colorlog==6.8.2 -func-timeout==4.3.5 -pandas==2.2.2 -emerge-viz==2.0.0 - From d54708a69d41a4e0b8197710cf10921a736dc910 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 10 Sep 2024 23:20:46 -0500 Subject: [PATCH 12/29] added pii redactor Signed-off-by: Maroun Touma --- transforms/packaging/python/Makefile | 6 +++++- .../python/requirements.transforms.python.txt | 14 +++++++++----- transforms/packaging/ray/Makefile | 1 + 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index 60fd0f766..7a2cf0180 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -15,8 +15,11 @@ PACKAGING_RUN_TIME=python #Excluded List # ./code/malware -# ./language/pii_redactor # ./universal/html2parquet +# ./universal/profiler # Missing implementation +# ./universal/fdedup # Missing implementation +# code/repo_level_ordering # Missing implementation + TRANSFORMS_NAMES = code/code_quality \ code/code2parquet \ @@ -26,6 +29,7 @@ TRANSFORMS_NAMES = 
code/code_quality \ language/doc_quality \ language/lang_id \ language/pdf2parquet \ + language/pii_redactor \ language/text_encoder \ universal/ededup \ universal/filter \ diff --git a/transforms/packaging/python/requirements.transforms.python.txt b/transforms/packaging/python/requirements.transforms.python.txt index a907e60a4..31acfb21f 100644 --- a/transforms/packaging/python/requirements.transforms.python.txt +++ b/transforms/packaging/python/requirements.transforms.python.txt @@ -3,15 +3,13 @@ bs4==0.0.2 #docling 1.9.0 depends on docling-parse<2.0.0 and >=1.1.3 #pdf2parquet depends on docling-parse==1.0.0 #docling 1.8.5 depends on docling-parse<2.0.0 and >=1.1.3 -docling-parse>=1.0.0, +#docling-parse>=1.0.0, # language/doc_chunk has conflict dependencies with pdf2parquet that need to be resolved # doc_chunk depends on docling>=1.8.2,<2.0.0 # pdf2parquet depends on docling==1.7.0 #docling==1.7.0, -docling>=1.8.2,<2.0.0, -llama-index-core>=0.11.1,<0.12.0, -docling-core>=1.1.2,<2.0.0, -quackling==0.1.1, +docling==1.8.5, +quackling==0.4.0, # quackling will pull # docling>=1.8.2,<2.0.0 # llama-index-core<0.12.0,>=0.11.1 @@ -29,4 +27,10 @@ scancode-toolkit==32.1.0 ; platform_system != 'Darwin' sentence-transformers==3.0.1 transformers==4.38.2 xxhash==3.4.1 +# PII-redactor +presidio-analyzer>=2.2.355 +presidio-anonymizer>=2.2.355 +flair>=0.14.0 +pandas>=2.2.2 + diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile index 370948be7..c26879781 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -27,6 +27,7 @@ TRANSFORMS_NAMES = code/proglang_select \ language/doc_quality \ language/lang_id \ language/text_encoder \ + language/pii_redactor \ language/pdf2parquet \ universal/fdedup \ universal/tokenization \ From 109ea2943e05f572a1d2d96d91d340c8dfa11611 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 10 Sep 2024 23:30:32 -0500 Subject: [PATCH 13/29] updated with latest release for pdf2parquet 
Signed-off-by: Maroun Touma --- .../packaging/python/requirements.transforms.python.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/transforms/packaging/python/requirements.transforms.python.txt b/transforms/packaging/python/requirements.transforms.python.txt index 31acfb21f..c3a7ae52e 100644 --- a/transforms/packaging/python/requirements.transforms.python.txt +++ b/transforms/packaging/python/requirements.transforms.python.txt @@ -8,13 +8,16 @@ bs4==0.0.2 # doc_chunk depends on docling>=1.8.2,<2.0.0 # pdf2parquet depends on docling==1.7.0 #docling==1.7.0, -docling==1.8.5, +#pdf2parquet +docling-core==1.2.0, +docling==1.11.0, +filetype >=1.2.0, <2.0.0, +#DockChunking quackling==0.4.0, # quackling will pull # docling>=1.8.2,<2.0.0 # llama-index-core<0.12.0,>=0.11.1 # docling-core<2.0.0,>=1.1.2 -filetype >=1.2.0, <2.0.0 duckdb==0.10.1 fasttext==0.9.2 huggingface-hub >= 0.21.4, <1.0.0 From a874358830a92c71d51e1f1d99693be2fa824cec Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Wed, 11 Sep 2024 21:58:18 +0200 Subject: [PATCH 14/29] fixes for dev3 release Signed-off-by: Maroun Touma --- .make.versions | 4 +-- transforms/packaging/Makefile | 48 +++++++++++++++++++++++++++ transforms/packaging/python/Makefile | 12 +++---- transforms/packaging/python/README.md | 28 ++++++++-------- transforms/packaging/ray/Makefile | 16 +++++---- transforms/packaging/ray/README.md | 33 +++++++++--------- 6 files changed, 98 insertions(+), 43 deletions(-) create mode 100644 transforms/packaging/Makefile diff --git a/.make.versions b/.make.versions index de063e8ff..54e6d8ca1 100644 --- a/.make.versions +++ b/.make.versions @@ -19,7 +19,7 @@ DPK_MINOR_VERSION=2 DPK_MICRO_VERSION=1 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. 
-DPK_VERSION_SUFFIX=.dev2 +DPK_VERSION_SUFFIX=.dev3 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) @@ -103,7 +103,7 @@ PII_REDACTOR_PYTHON_VERSION=$(DPK_VERSION) HTML2PARQUET_PYTHON_VERSION=$(DPK_VERSION) -DPK_TRNASFORM_REV=$(DPK_VERSION) +DPK_TRANSFORMS_VERSION=$(DPK_VERSION) ################## ################## ################## ################## ################## ################## # Begin versions that the repo depends on. diff --git a/transforms/packaging/Makefile b/transforms/packaging/Makefile new file mode 100644 index 000000000..584d40455 --- /dev/null +++ b/transforms/packaging/Makefile @@ -0,0 +1,48 @@ +REPOROOT=../../ +# Use make help, to see the available rules +include ../../.make.defaults + +setup:: + +clean:: + # Clean up workflows common virtual environment. + rm -rf venv || true + rm -rf *.back || true + @# Help: Recursively make $@ all subdirs + $(MAKE) RULE=$@ .recurse + +setup:: + +build:: + +venv:: + +image:: + +publish:: + +test-image:: + +test:: + +test-src:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +kind-load-image:: + +docker-load-image:: + +docker-save-image:: + +workflow-venv:: + +workflow-test:: + +workflow-build:: + +workflow-upload:: + +set-versions:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index 7a2cf0180..926f84030 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -37,14 +37,14 @@ TRANSFORMS_NAMES = code/code_quality \ universal/tokenization \ universal/doc_id -# language/doc_chunk has conflict dependencies with pdf2parquet that need to be resolved -# doc_chunk depends on docling>=1.8.2,<2.0.0 -# pdf2parquet depends on docling==1.7.0 +# distribution versions is the same as image version. 
+set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions test-src:: .transforms.setup venv -# source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . - source venv/bin/activate && cd ../python && $(MAKE) src && $(PYTHON) -m pip install . + source venv/bin/activate && $(PYTHON) -m pip install ../../../data-processing-lib/python + source venv/bin/activate && $(MAKE) src && $(PYTHON) -m pip install . $(MAKE) run-ut @# Help: Do any default transform setup before running make src and setting up a test environment @@ -52,7 +52,7 @@ test-with-pypi: $(MAKE) clean $(MAKE) .defaults.create-venv source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms==$(DPK_TRNASFORM_REV) + $(PYTHON) -m pip install data_prep_toolkit_transforms==$(DPK_TRANSFORMS_VERSION) $(MAKE) run-ut @# Help: Load wheel from pypi and run all unit tests: final step in verification after deploying to pypi) diff --git a/transforms/packaging/python/README.md b/transforms/packaging/python/README.md index 08e8d04de..a2b1f3c78 100644 --- a/transforms/packaging/python/README.md +++ b/transforms/packaging/python/README.md @@ -12,21 +12,23 @@ installing the python transforms will also install `data-prep-toolkit` * code * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md) - * header_cleanser (Not available on MacOS) - * code_quality - * proglang_select + * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/header_cleanser/python/README.md) + * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/python/README.md) + * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/python/README.md) * language - * doc_chunk - * *doc_quality - * lang_id - * pdf2parquet - * text_encoder + * 
[doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/python/README.md) + * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/python/README.md) + * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/python/README.md) + * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/python/README.md) + * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/python/README.md) + * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/python/README.md) * universal - * ededup - * filter - * resize - * tokenization - * doc_id + * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/python/README.md) + * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/python/README.md) + * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/python/README.md) + * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/tokenization/doc_chunk/python/README.md) + * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/python/README.md) + diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile index c26879781..345bb9421 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -14,7 +14,9 @@ PACKAGING_RUN_TIME=ray # Excluded from build # ./code/malware/ray -# ./language/pii_redactor/ray + +set-versions: + $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions ## Ray Transforms: `find . -name src | grep ray/src` @@ -43,10 +45,12 @@ TRANSFORMS_NAMES = code/proglang_select \ test-src:: .transforms.setup venv -# source venv/bin/activate && cd ../../../data-processing-lib/python && $(PYTHON) -m pip install . 
-# source venv/bin/activate && cd ../../../data-processing-lib/ray && $(PYTHON) -m pip install . - source venv/bin/activate && cd ../python && $(MAKE) src && $(PYTHON) -m pip install . - source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . + $(MAKE) src + $(MAKE) -C ../python src + source venv/bin/activate && $(PYTHON) -m pip install ../../../data-processing-lib/python + source venv/bin/activate && $(PYTHON) -m pip install ../python + source venv/bin/activate && $(PYTHON) -m pip install ../../../data-processing-lib/ray + source venv/bin/activate && $(PYTHON) -m pip install . $(MAKE) run-ut @# Help: Do any default transform setup before running make src and setting up a test environment @@ -60,6 +64,6 @@ test-with-pypi: $(MAKE) clean $(MAKE) .defaults.create-venv source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRNASFORM_REV) + $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRANSFORMS_VERSION) $(MAKE) test-src diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md index 497ed42d0..f38ece632 100644 --- a/transforms/packaging/ray/README.md +++ b/transforms/packaging/ray/README.md @@ -12,24 +12,25 @@ installing the Ray transforms will also install `data_prep_toolkit_transforms` a * code * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) - * proglang_select - * header_cleanser (Not available on MacOS) - * code_quality - * repo_level_ordering + * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/ray/README.md) + * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) + * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/ray/README.md) + * 
[repo_level_ordering](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/repo_level_ordering/ray/README.md) * language - * doc_quality - * doc_chunk - * lang_id - * text_encoder - * pdf2parquet + * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/ray/README.md) + * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/ray/README.md) + * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/ray/README.md) + * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/ray/README.md) + * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/ray/README.md) + * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/ray/README.md) * universal - * fdedup - * tokenization - * ededup - * profiler - * doc_id - * filter - * resize + * [fdedup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/fdedup/ray/README.md) + * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/tokenization/ray/README.md) + * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/ray/README.md) + * [profiler](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/profiler/ray/README.md) + * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/ray/README.md) + * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/ray/README.md) + * [resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/ray/README.md) From 440975d31105a6f9e8c1b9952cc8d12438bee353 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Thu, 12 Sep 2024 20:28:42 +0200 Subject: [PATCH 15/29] simplify test-src by using exiting targets Signed-off-by: Maroun Touma --- transforms/packaging/python/Makefile | 6 +++--- transforms/packaging/ray/Makefile | 7 ++----- 2 files changed, 5 insertions(+), 8 deletions(-) 
diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile index 926f84030..1271a20c3 100644 --- a/transforms/packaging/python/Makefile +++ b/transforms/packaging/python/Makefile @@ -42,9 +42,9 @@ TRANSFORMS_NAMES = code/code_quality \ set-versions: $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions -test-src:: .transforms.setup venv - source venv/bin/activate && $(PYTHON) -m pip install ../../../data-processing-lib/python - source venv/bin/activate && $(MAKE) src && $(PYTHON) -m pip install . +test-src:: + $(MAKE) src + $(MAKE) .transforms.python-venv $(MAKE) run-ut @# Help: Do any default transform setup before running make src and setting up a test environment diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile index 345bb9421..0a1d6d911 100644 --- a/transforms/packaging/ray/Makefile +++ b/transforms/packaging/ray/Makefile @@ -44,13 +44,10 @@ TRANSFORMS_NAMES = code/proglang_select \ # pdf2parquet depends on docling==1.7.0 -test-src:: .transforms.setup venv +test-src:: $(MAKE) src $(MAKE) -C ../python src - source venv/bin/activate && $(PYTHON) -m pip install ../../../data-processing-lib/python - source venv/bin/activate && $(PYTHON) -m pip install ../python - source venv/bin/activate && $(PYTHON) -m pip install ../../../data-processing-lib/ray - source venv/bin/activate && $(PYTHON) -m pip install . 
+ make .transforms.ray-venv $(MAKE) run-ut @# Help: Do any default transform setup before running make src and setting up a test environment From db609636d279f09cd571b8fff83f89fba74d4f60 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 11:40:22 +0200 Subject: [PATCH 16/29] renamed requirement files Signed-off-by: Maroun Touma --- transforms/packaging/python/pyproject.toml | 2 +- .../{requirements.transforms.python.txt => requirements.txt} | 0 transforms/packaging/ray/pyproject.toml | 2 +- .../ray/{requirements.transforms.ray.txt => requirements.txt} | 0 4 files changed, 2 insertions(+), 2 deletions(-) rename transforms/packaging/python/{requirements.transforms.python.txt => requirements.txt} (100%) rename transforms/packaging/ray/{requirements.transforms.ray.txt => requirements.txt} (100%) diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml index f4e09f29b..5ddb40aae 100644 --- a/transforms/packaging/python/pyproject.toml +++ b/transforms/packaging/python/pyproject.toml @@ -16,7 +16,7 @@ requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" [tool.setuptools.dynamic] -dependencies = {file = ["requirements.transforms.python.txt"]} +dependencies = {file = ["requirements.txt"]} [options] package_dir = ["src"] diff --git a/transforms/packaging/python/requirements.transforms.python.txt b/transforms/packaging/python/requirements.txt similarity index 100% rename from transforms/packaging/python/requirements.transforms.python.txt rename to transforms/packaging/python/requirements.txt diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml index a7f229647..9c1509472 100644 --- a/transforms/packaging/ray/pyproject.toml +++ b/transforms/packaging/ray/pyproject.toml @@ -16,7 +16,7 @@ requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" 
[tool.setuptools.dynamic] -dependencies = {file = ["requirements.transforms.ray.txt"]} +dependencies = {file = ["requirements.txt"]} [options] diff --git a/transforms/packaging/ray/requirements.transforms.ray.txt b/transforms/packaging/ray/requirements.txt similarity index 100% rename from transforms/packaging/ray/requirements.transforms.ray.txt rename to transforms/packaging/ray/requirements.txt From 9bb36c5571baa0f35efb26b56dc619574079e5e6 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 11:51:09 +0200 Subject: [PATCH 17/29] use - in transform library name Signed-off-by: Maroun Touma --- transforms/packaging/ray/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt index cd1e0142f..3178a2e56 100644 --- a/transforms/packaging/ray/requirements.txt +++ b/transforms/packaging/ray/requirements.txt @@ -1,5 +1,5 @@ data-prep-toolkit-ray>=0.2.1.dev3 -data_prep_toolkit_transforms>=0.2.1.dev3 +data-prep-toolkit-transforms>=0.2.1.dev3 scancode-toolkit==32.1.0 ; platform_system != 'Darwin' parameterized tqdm==4.66.3 From ee63628e884797d97daf53f6acaaa5a9b73299f3 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 14:50:33 +0200 Subject: [PATCH 18/29] update requirements.txt files as appropriate when setting versions Signed-off-by: Maroun Touma --- .make.defaults | 12 ++++++++++++ transforms/packaging/Makefile | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/.make.defaults b/.make.defaults index 510e3fc05..230f3a1c9 100644 --- a/.make.defaults +++ b/.make.defaults @@ -587,6 +587,18 @@ MINIO_ADMIN_PWD= localminiosecretkey > tt.toml; \ mv tt.toml pyproject.toml; \ fi + @if [ -e requirements.txt ]; then \ + cat requirements.txt | sed \ + -e s/"data-prep-toolkit-ray\([=><~][=]\).*"/"data-prep-toolkit-ray\1$(DPK_LIB_VERSION)"/ \ + -e 
s/"data-prep-toolkit-transforms\([=><~][=]\).*"/"data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)"/ \ + -e s/"data-prep-toolkit-spark\([=><~][=]\).*"/"data-prep-toolkit-spark\1$(DPK_LIB_VERSION)"/ \ + -e s/"data-prep-toolkit-kfp\([=><~][=]\).*"/"data-prep-toolkit-kfp\1$(DPK_LIB_KFP_VERSION)"/ \ + -e s/"data-prep-toolkit\([=><~][=]\).*"/"data-prep-toolkit\1$(DPK_LIB_VERSION)"/ \ + -e s/"ray\[default\]\([=><~][=]\).*"/"ray\[default\]\1$(RAY)"/ \ + -e s/"data-prep-toolkit-kfp-shared\(..\).*"/"data-prep-toolkit-kfp-shared\1$(DPK_LIB_KFP_VERSION)"/ \ + > tt.txt; \ + mv tt.txt requirements.txt; \ + fi # Build the distribution, usually in preparation for publishing using ith the .defaults.publish-dist target .PHONY: .defaults.build-dist diff --git a/transforms/packaging/Makefile b/transforms/packaging/Makefile index 584d40455..020ae2a73 100644 --- a/transforms/packaging/Makefile +++ b/transforms/packaging/Makefile @@ -11,6 +11,10 @@ clean:: @# Help: Recursively make $@ all subdirs $(MAKE) RULE=$@ .recurse +src:: + @# Help: Recursively setup $@ in all subdirs + $(MAKE) RULE=$@ .recurse + setup:: build:: From 346b82e550947b99220e3790602df26e2f520dc4 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 15:19:11 +0200 Subject: [PATCH 19/29] Added readme file to test, build and publish package to pypi Signed-off-by: Maroun Touma --- transforms/packaging/README.md | 52 ++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 transforms/packaging/README.md diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md new file mode 100644 index 000000000..1a1ffa5fd --- /dev/null +++ b/transforms/packaging/README.md @@ -0,0 +1,52 @@ +# Transforms Pacakges for both Python and Ray + +Most available Transforms can be published to pypi as a single package. A detailed list of available Python transforms is available at this (link)[python/README.md]. 
Similarly the following (link)[ray/README.md] provide a derailed list and installation instructions for Ray transforms + + + +## Clone folder and update version number + +git clone https://github.com/IBM/data-prep-kit.git release +cd release + +in `.make.versions`, Set the values for DPK_MAJOR_VERSION, DPK_MINOR_VERSION and DPK_MICRO_VERSION to specify the DPK library to use and as appropriate, set the value for `DPK_TRANSFORMS_VERSION` that will be used to tag the latest version released to pypi + + +## Creating src folder + +Given that the transforms do not currently have their own name spaces, the first step is to copy all the transforms to the same src folder prior to running unit tests of the individual transforms and/or building the distribution: + + +```` +cd release/transforms/packaging +make clean +make src +```` + +## Build and Test + +This procedure will run all the UT for each individual transforms using a single package configuration: + +```` +cd release/transforms/packaging +make clean +make src +make test-src +```` + +## Build and Deploy + +This procedure will buid and publish two wheels to pypi.org: one for the python transforms and one for the ray transforms. 
+ +```` +cd release/transforms/packaging +make clean +make src +make set-version +make build-dist +make publish-dist +```` + + + + From 071836ef710bc9c3bbb0a8bb1bb38d4203381a85 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 16:16:34 +0200 Subject: [PATCH 20/29] fix typos and removed double quotes Signed-off-by: Maroun Touma --- .make.defaults | 14 +++++++------- transforms/packaging/README.md | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.make.defaults b/.make.defaults index 230f3a1c9..b8af116b2 100644 --- a/.make.defaults +++ b/.make.defaults @@ -589,13 +589,13 @@ MINIO_ADMIN_PWD= localminiosecretkey fi @if [ -e requirements.txt ]; then \ cat requirements.txt | sed \ - -e s/"data-prep-toolkit-ray\([=><~][=]\).*"/"data-prep-toolkit-ray\1$(DPK_LIB_VERSION)"/ \ - -e s/"data-prep-toolkit-transforms\([=><~][=]\).*"/"data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)"/ \ - -e s/"data-prep-toolkit-spark\([=><~][=]\).*"/"data-prep-toolkit-spark\1$(DPK_LIB_VERSION)"/ \ - -e s/"data-prep-toolkit-kfp\([=><~][=]\).*"/"data-prep-toolkit-kfp\1$(DPK_LIB_KFP_VERSION)"/ \ - -e s/"data-prep-toolkit\([=><~][=]\).*"/"data-prep-toolkit\1$(DPK_LIB_VERSION)"/ \ - -e s/"ray\[default\]\([=><~][=]\).*"/"ray\[default\]\1$(RAY)"/ \ - -e s/"data-prep-toolkit-kfp-shared\(..\).*"/"data-prep-toolkit-kfp-shared\1$(DPK_LIB_KFP_VERSION)"/ \ + -e 's/data-prep-toolkit-ray\([=><~][=]\).*/data-prep-toolkit-ray\1$(DPK_LIB_VERSION)/' \ + -e 's/data-prep-toolkit-transforms\([=><~][=]\).*/data-prep-toolkit-transforms\1$(DPK_TRANSFORMS_VERSION)/' \ + -e 's/data-prep-toolkit-spark\([=><~][=]\).*/data-prep-toolkit-spark\1$(DPK_LIB_VERSION)/' \ + -e 's/data-prep-toolkit-kfp\([=><~][=]\).*/data-prep-toolkit-kfp\1$(DPK_LIB_KFP_VERSION)/' \ + -e 's/data-prep-toolkit\([=><~][=]\).*/data-prep-toolkit\1$(DPK_LIB_VERSION)/' \ + -e 's/ray\[default\]\([=><~][=]\).*/ray\[default\]\1$(RAY)/' \ + -e 
's/data-prep-toolkit-kfp-shared\(..\).*/data-prep-toolkit-kfp-shared\1$(DPK_LIB_KFP_VERSION)/' \ > tt.txt; \ mv tt.txt requirements.txt; \ fi diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md index 1a1ffa5fd..f976a4c56 100644 --- a/transforms/packaging/README.md +++ b/transforms/packaging/README.md @@ -1,6 +1,6 @@ # Transforms Pacakges for both Python and Ray -Most available Transforms can be published to pypi as a single package. A detailed list of available Python transforms is available at this (link)[python/README.md]. Similarly the following (link)[ray/README.md] provide a derailed list and installation instructions for Ray transforms +Most available Transforms can be published to pypi as a single package. A detailed list of available Python transforms is available at this [link](python/README.md). Similarly the following [link](ray/README.md) provide a derailed list and installation instructions for Ray transforms From 07b827f31a25923fcad1152509cb0f53e7aa0c2b Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 16:28:05 +0200 Subject: [PATCH 21/29] Apply version update to all transforms Signed-off-by: Maroun Touma --- data-processing-lib/spark/pyproject.toml | 4 ++-- kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml | 4 ++-- kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml | 6 +++--- kfp/kfp_support_lib/shared_workflow_support/pyproject.toml | 4 ++-- transforms/code/code2parquet/python/pyproject.toml | 4 ++-- transforms/code/code2parquet/ray/pyproject.toml | 6 +++--- transforms/code/code_quality/python/pyproject.toml | 4 ++-- transforms/code/code_quality/ray/pyproject.toml | 6 +++--- transforms/code/header_cleanser/python/pyproject.toml | 4 ++-- transforms/code/header_cleanser/ray/pyproject.toml | 6 +++--- transforms/code/malware/python/pyproject.toml | 4 ++-- transforms/code/malware/ray/pyproject.toml | 6 +++--- transforms/code/proglang_select/python/pyproject.toml | 4 ++-- 
transforms/code/proglang_select/ray/pyproject.toml | 6 +++--- transforms/code/repo_level_ordering/ray/pyproject.toml | 4 ++-- transforms/language/doc_chunk/python/pyproject.toml | 4 ++-- transforms/language/doc_chunk/ray/pyproject.toml | 6 +++--- transforms/language/doc_quality/python/pyproject.toml | 4 ++-- transforms/language/doc_quality/ray/pyproject.toml | 6 +++--- transforms/language/lang_id/python/pyproject.toml | 4 ++-- transforms/language/lang_id/ray/pyproject.toml | 6 +++--- transforms/language/pdf2parquet/python/pyproject.toml | 4 ++-- transforms/language/pdf2parquet/ray/pyproject.toml | 6 +++--- transforms/language/pii_redactor/python/pyproject.toml | 4 ++-- transforms/language/pii_redactor/ray/pyproject.toml | 6 +++--- transforms/language/text_encoder/python/pyproject.toml | 4 ++-- transforms/language/text_encoder/ray/pyproject.toml | 6 +++--- transforms/universal/doc_id/python/pyproject.toml | 4 ++-- transforms/universal/doc_id/ray/pyproject.toml | 6 +++--- transforms/universal/doc_id/spark/pyproject.toml | 4 ++-- transforms/universal/ededup/python/pyproject.toml | 4 ++-- transforms/universal/ededup/ray/pyproject.toml | 6 +++--- transforms/universal/fdedup/ray/pyproject.toml | 4 ++-- transforms/universal/filter/python/pyproject.toml | 4 ++-- transforms/universal/filter/ray/pyproject.toml | 6 +++--- transforms/universal/filter/spark/pyproject.toml | 6 +++--- transforms/universal/html2parquet/python/pyproject.toml | 4 ++-- transforms/universal/noop/python/pyproject.toml | 4 ++-- transforms/universal/noop/ray/pyproject.toml | 6 +++--- transforms/universal/noop/spark/pyproject.toml | 6 +++--- transforms/universal/profiler/ray/pyproject.toml | 4 ++-- transforms/universal/resize/python/pyproject.toml | 4 ++-- transforms/universal/resize/ray/pyproject.toml | 6 +++--- transforms/universal/tokenization/python/pyproject.toml | 4 ++-- transforms/universal/tokenization/ray/pyproject.toml | 6 +++--- 45 files changed, 110 insertions(+), 110 deletions(-) diff --git 
a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml index 30cb8f032..b6e9edddb 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/data-processing-lib/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10" description = "Data Preparation Toolkit Library for Spark" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "pyspark>=3.5.2", "psutil>=6.0.0" ] diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index 6e8a7e458..eaea5fb0d 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.1.dev0", + "data-prep-toolkit-kfp-shared==0.2.1.dev3", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 632e414ca..c5ca32f1a 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -12,9 +12,9 @@ authors = [ { name = "Revital Eres", email = "eres@il.ibm.com" }, ] dependencies = [ - "kfp==2.7.0", + "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.1.dev0", + "data-prep-toolkit-kfp-shared==0.2.1.dev3", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index c8d2648df..b4f509433 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10,<3.12" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index b8c97541d..79f0988be 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "parameterized", "pandas", ] diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index f610754d0..c7f1a1563 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 
"dpk_code2parquet_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", - "dpk-code2parquet-transform-python==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", + "dpk-code2parquet-transform-python==0.2.1.dev3", "parameterized", "pandas", ] diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 60bdf9e91..88c8f9031 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "bs4==0.0.2", "transformers==4.38.2", ] diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index 574c06d5a..6925f45c0 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-code-quality-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-code-quality-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git 
a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index 38f4f6fb1..2799974b4 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index 4105907fe..d40aa9373 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-header-cleanser-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index ce4a6a088..9e5e122ca 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Malware Python Transform" license = 
{text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index d19eb2336..60d9a3089 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-malware-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index e21924116..7fcef9bfc 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index 323c16c1e..703bf5279 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = 
"0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-proglang-select-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index d1d973902..6f54a65ed 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml index 1dbf38560..4deb09d47 100644 --- a/transforms/language/doc_chunk/python/pyproject.toml +++ b/transforms/language/doc_chunk/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "chunk documents Python Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "docling-core==1.3.0", "llama-index-core>=0.11.0,<0.12.0", ] diff --git 
a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 1ba60d642..19288e2db 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "chunk documents Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "dpk-doc-chunk-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-doc-chunk-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index e6d9a2ada..e63a6d5e5 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index 6ed293e09..6bc9cc6c6 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email 
= "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0" + "dpk-doc_quality-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3" ] [build-system] diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index bc472c766..f2dd72919 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "fasttext==0.9.2", "langcodes==3.3.0", "huggingface-hub >= 0.21.4, <1.0.0", diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index 2244d27c5..4833913a4 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-lang_id-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/language/pdf2parquet/python/pyproject.toml b/transforms/language/pdf2parquet/python/pyproject.toml index 91f7a14b5..24f2294b5 100644 --- a/transforms/language/pdf2parquet/python/pyproject.toml +++ 
b/transforms/language/pdf2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pdf2parquet_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PDF2PARQUET Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "docling-core==1.2.0", "docling-ibm-models==1.1.7", "deepsearch-glm==0.21.0", diff --git a/transforms/language/pdf2parquet/ray/pyproject.toml b/transforms/language/pdf2parquet/ray/pyproject.toml index 9d81f8ada..950e5ce3d 100644 --- a/transforms/language/pdf2parquet/ray/pyproject.toml +++ b/transforms/language/pdf2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pdf2parquet_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PDF2PARQUET Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] dependencies = [ - "dpk-pdf2parquet-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-pdf2parquet-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index b63e6d676..a61987a45 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PII redactor Transform for Python" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Sowmya.L.R", email = "lrsowmya@gmail.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", 
"presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index 349b24075..a1b01be94 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk_pii_redactor_transform_python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index 374c36d12..1ed8725ab 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "sentence-transformers==3.0.1", ] diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index 3c53415f8..aa8af8b44 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = 
"0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-text_encoder-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index c5af73cce..8e4358b28 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "ededup Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0" + "data-prep-toolkit==0.2.1.dev3" ] [build-system] diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index 022a63db6..e5cb79d95 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0" + "dpk_doc_id_transform_python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3" ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 7efd8cfac..13d7bc2c3 100644 --- 
a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-spark==0.2.1.dev0", + "data-prep-toolkit-spark==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 2b751b18c..e380bf58e 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "ededup Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "mmh3==4.1.0", "xxhash==3.4.1", ] diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 1503d8c5c..f9442858c 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", - "dpk_ededup_transform_python==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", + "dpk_ededup_transform_python==dummy", "tqdm==4.66.3", ] diff --git 
a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 25e4fe5f9..70f92a23f 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", "mmh3==4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index cbaa4ad20..995247f4f 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Filter Transform for Python" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "duckdb==0.10.1", ] diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index 155e8ef05..fc0035475 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + 
"dpk-filter-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 90974056b..2495106df 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_filter_transform_python==0.2.1.dev0", - "data-prep-toolkit-spark==0.2.1.dev0", + "dpk_filter_transform_python==dummy", + "data-prep-toolkit-spark==0.2.1.dev3", ] [project.optional-dependencies] diff --git a/transforms/universal/html2parquet/python/pyproject.toml b/transforms/universal/html2parquet/python/pyproject.toml index 5d183b49d..f49c498d6 100644 --- a/transforms/universal/html2parquet/python/pyproject.toml +++ b/transforms/universal/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Syed Zawad", email = "szawad@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "trafilatura==1.12.0" ] diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index 889cc6cfd..5714e70de 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "NOOP Python Transform" 
license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index 86eebe633..9f1353b4e 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-noop-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index d22dadfa8..965770d92 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.1.dev0", - "data-prep-toolkit-spark==0.2.1.dev0", + "dpk-noop-transform-python==0.2.1.dev3", + "data-prep-toolkit-spark==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index 81439a390..1473b88b4 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = 
"dpk_profiler_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.1.dev0", + "data-prep-toolkit-ray==0.2.1.dev3", "mmh3==4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 2346a0a14..b1cc13314 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "resize Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 56fb6f077..86834c1b1 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-resize-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index f77067612..1dc0ca104 100644 --- 
a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "data-prep-toolkit==0.2.1.dev0", + "data-prep-toolkit==0.2.1.dev3", "transformers==4.38.2", ] diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index b77a46d6b..fd259a9b6 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.1.dev0" +version = "0.2.1.dev3" requires-python = ">=3.10" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-tokenization-transform-python==0.2.1.dev3", + "data-prep-toolkit-ray==0.2.1.dev3", ] [build-system] From 0369842d3d2822d477d8d3f0da063190b3bf8391 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 16:58:29 +0200 Subject: [PATCH 22/29] generate workflow for packaging folder Signed-off-by: Maroun Touma --- .github/workflows/test-packaging-python.yml | 60 +++++++++++++++++++++ .github/workflows/test-packaging-ray.yml | 60 +++++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 .github/workflows/test-packaging-python.yml create mode 100644 .github/workflows/test-packaging-ray.yml 
diff --git a/.github/workflows/test-packaging-python.yml b/.github/workflows/test-packaging-python.yml new file mode 100644 index 000000000..9c7710a1d --- /dev/null +++ b/.github/workflows/test-packaging-python.yml @@ -0,0 +1,60 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/packaging/python + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "transforms/packaging/python/**" + - "data-processing-lib/**" + - "!transforms/packaging/python/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "transforms/packaging/python/**" + - "data-processing-lib/**" + - "!transforms/packaging/python/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +jobs: + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/packaging/python + run: | + if [ -e "transforms/packaging/python/Makefile" ]; then + make -C transforms/packaging/python DOCKER=docker test-src + else + echo "transforms/packaging/python/Makefile not found - source 
testing disabled for this transform." + fi diff --git a/.github/workflows/test-packaging-ray.yml b/.github/workflows/test-packaging-ray.yml new file mode 100644 index 000000000..dcd82dca5 --- /dev/null +++ b/.github/workflows/test-packaging-ray.yml @@ -0,0 +1,60 @@ +# +# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files +# +name: Test - transforms/packaging/ray + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "transforms/packaging/ray/**" + - "data-processing-lib/**" + - "!transforms/packaging/ray/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "transforms/packaging/ray/**" + - "data-processing-lib/**" + - "!transforms/packaging/ray/**/kfp_ray/**" # This is/will be tested in separate workflow + - "!data-processing-lib/**/test/**" + - "!data-processing-lib/**/test-data/**" + - "!**.md" + - "!**/doc/**" + - "!**/images/**" + - "!**.gitignore" + +jobs: + test-src: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Free up space in github runner + # Free space as indicated here : https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + run: | + df -h + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + sudo rm -rf /usr/share/dotnet /opt/ghc /usr/local/lib/android /usr/local/share/powershell /usr/share/swift /usr/local/.ghcup + sudo docker rmi $(docker image ls -aq) >/dev/null 2>&1 || true + df -h + - name: Test transform source in transforms/packaging/ray + run: | + if [ -e "transforms/packaging/ray/Makefile" ]; then + make -C transforms/packaging/ray DOCKER=docker test-src + else + echo "transforms/packaging/ray/Makefile not found - 
source testing disabled for this transform." + fi From aa297e031a99d0a8c5309fcea90466f85d1f26ee Mon Sep 17 00:00:00 2001 From: David Wood Date: Mon, 23 Sep 2024 12:25:52 -0400 Subject: [PATCH 23/29] fix ededup dummy version Signed-off-by: David Wood --- transforms/universal/ededup/ray/Makefile | 2 +- transforms/universal/ededup/ray/pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transforms/universal/ededup/ray/Makefile b/transforms/universal/ededup/ray/Makefile index 2d81bbbe2..f828e107e 100644 --- a/transforms/universal/ededup/ray/Makefile +++ b/transforms/universal/ededup/ray/Makefile @@ -33,7 +33,7 @@ setup:: .transforms.setup # TRANSFORM_PYTHON_VERSION has no effect since requirements do not specify a python transform implementation set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=dummy TOML_VERSION=$(EDEDUP_RAY_VERSION) .transforms.set-versions + $(MAKE) TRANSFORM_PYTHON_VERSION=$(EDEDUP_PYTHON_VERSION) TOML_VERSION=$(EDEDUP_RAY_VERSION) .transforms.set-versions build-dist:: .defaults.build-dist diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index f9442858c..2fdf82392 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -11,7 +11,7 @@ authors = [ ] dependencies = [ "data-prep-toolkit-ray==0.2.1.dev3", - "dpk_ededup_transform_python==dummy", + "dpk_ededup_transform_python==0.2.1.dev3", "tqdm==4.66.3", ] From 32578d51570bdd1713c454889f494779d52a9dec Mon Sep 17 00:00:00 2001 From: David Wood Date: Mon, 23 Sep 2024 12:27:32 -0400 Subject: [PATCH 24/29] fix filter/spark dummy version Signed-off-by: David Wood --- transforms/universal/filter/spark/pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index 2495106df..4d31c2ef2 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ 
b/transforms/universal/filter/spark/pyproject.toml @@ -9,7 +9,6 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_filter_transform_python==dummy", "data-prep-toolkit-spark==0.2.1.dev3", ] From 3b52ecfb686bc051c7976f67dac593c9ec9e3589 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 20:17:27 +0200 Subject: [PATCH 25/29] updated requirements based on latest release for docling Signed-off-by: Maroun Touma --- .make.defaults | 3 ++- transforms/packaging/python/requirements.txt | 22 +++++++------------- transforms/packaging/ray/requirements.txt | 4 ---- 3 files changed, 9 insertions(+), 20 deletions(-) diff --git a/.make.defaults b/.make.defaults index b8af116b2..8d7f454da 100644 --- a/.make.defaults +++ b/.make.defaults @@ -480,7 +480,8 @@ endif if [ -e requirements.txt ]; then \ echo Installing requirements from requirements.txt; \ pip install $(PIP_INSTALL_EXTRA_ARGS) $$extra_url -r requirements.txt; \ - elif [ -e pyproject.toml ]; then \ + fi; \ + if [ -e pyproject.toml ]; then \ echo Installing from pyproject.toml; \ pip install $(PIP_INSTALL_EXTRA_ARGS) $$extra_url -e .; \ fi diff --git a/transforms/packaging/python/requirements.txt b/transforms/packaging/python/requirements.txt index c3a7ae52e..6dec1e2de 100644 --- a/transforms/packaging/python/requirements.txt +++ b/transforms/packaging/python/requirements.txt @@ -1,23 +1,15 @@ data-prep-toolkit>=0.2.1.dev3 bs4==0.0.2 -#docling 1.9.0 depends on docling-parse<2.0.0 and >=1.1.3 -#pdf2parquet depends on docling-parse==1.0.0 -#docling 1.8.5 depends on docling-parse<2.0.0 and >=1.1.3 -#docling-parse>=1.0.0, -# language/doc_chunk has conflict dependencies with pdf2parquet that need to be resolved -# doc_chunk depends on docling>=1.8.2,<2.0.0 -# pdf2parquet depends on docling==1.7.0 -#docling==1.7.0, #pdf2parquet -docling-core==1.2.0, +# conflict with chunking.... 
+#docling-core==1.2.0, +docling-ibm-models==1.1.7, +deepsearch-glm==0.21.0, docling==1.11.0, filetype >=1.2.0, <2.0.0, -#DockChunking -quackling==0.4.0, -# quackling will pull -# docling>=1.8.2,<2.0.0 -# llama-index-core<0.12.0,>=0.11.1 -# docling-core<2.0.0,>=1.1.2 +#Doc chunking +docling-core==1.3.0, +llama-index-core>=0.11.0,<0.12.0, duckdb==0.10.1 fasttext==0.9.2 huggingface-hub >= 0.21.4, <1.0.0 diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt index 3178a2e56..2e75ae185 100644 --- a/transforms/packaging/ray/requirements.txt +++ b/transforms/packaging/ray/requirements.txt @@ -16,10 +16,6 @@ func-timeout==4.3.5 pandas==2.2.2 emerge-viz==2.0.0 -#Note: -# when installing data-processing-library-ray, get the following -# ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. -# deepsearch-toolkit 1.0.0 requires platformdirs<4.0.0,>=3.5.1, but you have platformdirs 4.3.2 which is incompatible. From 8d69b71bba4a3fe0ee8eb2a5913e695f0db43293 Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 21:54:27 +0200 Subject: [PATCH 26/29] fix missing steps Signed-off-by: Maroun Touma --- transforms/packaging/README.md | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md index f976a4c56..18b22a6f5 100644 --- a/transforms/packaging/README.md +++ b/transforms/packaging/README.md @@ -5,12 +5,13 @@ Most available Transforms can be published to pypi as a single package. 
A detail ## Clone folder and update version number - -git clone https://github.com/IBM/data-prep-kit.git release -cd release - +```` +git clone https://github.com/IBM/data-prep-kit.git package-release +cd package-release +```` in `.make.versions`, Set the values for DPK_MAJOR_VERSION, DPK_MINOR_VERSION and DPK_MICRO_VERSION to specify the DPK library to use and as appropriate, set the value for `DPK_TRANSFORMS_VERSION` that will be used to tag the latest version released to pypi +`make set-versions` ## Creating src folder @@ -18,7 +19,7 @@ Given that the transforms do not currently have their own name spaces, the first ```` -cd release/transforms/packaging +cd package-release/transforms/packaging make clean make src ```` @@ -28,7 +29,7 @@ make src This procedure will run all the UT for each individual transforms using a single package configuration: ```` -cd release/transforms/packaging +cd package-release/transforms/packaging make clean make src make test-src @@ -36,17 +37,19 @@ make test-src ## Build and Deploy -This procedure will buid and publish two wheels to pypi.org: one for the python transforms and one for the ray transforms. +This procedure will buid two wheels: one for the python transforms and one for the ray transforms. 
```` -cd release/transforms/packaging +cd package-release/transforms/packaging make clean make src -make set-version make build-dist -make publish-dist ```` +To publish the wheels to pypi.org, run: + +`make publish-dist` + From d27a1c2c2664f209f113ba6840af6ec627bc9b0c Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Mon, 23 Sep 2024 22:01:57 +0200 Subject: [PATCH 27/29] -sUpdate makefile to build and publish wheels --- transforms/packaging/Makefile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/transforms/packaging/Makefile b/transforms/packaging/Makefile index 020ae2a73..aa75d525e 100644 --- a/transforms/packaging/Makefile +++ b/transforms/packaging/Makefile @@ -19,6 +19,14 @@ setup:: build:: +build-dist:: + @# Help: Recursively build distributions in all subdirs + $(MAKE) RULE=$@ .recurse + +publish-dist:: + @# Help: Recursively publish distributions in all subdirs + $(MAKE) RULE=$@ .recurse + venv:: image:: From e155d2c93a966195b2de186e712b468ed341156e Mon Sep 17 00:00:00 2001 From: Maroun Touma Date: Tue, 24 Sep 2024 13:42:03 +0200 Subject: [PATCH 28/29] update based on reviewers comments Signed-off-by: Maroun Touma --- .github/workflows/test-packaging-python.yml | 3 --- .github/workflows/test-packaging-ray.yml | 4 +--- transforms/packaging/README.md | 6 +++--- transforms/packaging/python/README.md | 2 ++ transforms/packaging/ray/README.md | 2 ++ 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test-packaging-python.yml b/.github/workflows/test-packaging-python.yml index 9c7710a1d..71423541b 100644 --- a/.github/workflows/test-packaging-python.yml +++ b/.github/workflows/test-packaging-python.yml @@ -1,6 +1,3 @@ -# -# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files -# name: Test - transforms/packaging/python on: diff --git a/.github/workflows/test-packaging-ray.yml b/.github/workflows/test-packaging-ray.yml index dcd82dca5..54ec8ab8f 100644 --- 
a/.github/workflows/test-packaging-ray.yml +++ b/.github/workflows/test-packaging-ray.yml @@ -1,6 +1,4 @@ -# -# DO NOT EDIT THIS FILE: it is generated from test-transform.template, Edit there and run make to change these files -# + name: Test - transforms/packaging/ray on: diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md index 18b22a6f5..e0d23ad52 100644 --- a/transforms/packaging/README.md +++ b/transforms/packaging/README.md @@ -19,7 +19,7 @@ Given that the transforms do not currently have their own name spaces, the first ```` -cd package-release/transforms/packaging +cd transforms/packaging make clean make src ```` @@ -29,7 +29,7 @@ make src This procedure will run all the UT for each individual transforms using a single package configuration: ```` -cd package-release/transforms/packaging +cd transforms/packaging make clean make src make test-src @@ -40,7 +40,7 @@ make test-src This procedure will buid two wheels: one for the python transforms and one for the ray transforms. ```` -cd package-release/transforms/packaging +cd transforms/packaging make clean make src make build-dist diff --git a/transforms/packaging/python/README.md b/transforms/packaging/python/README.md index a2b1f3c78..45260ce56 100644 --- a/transforms/packaging/python/README.md +++ b/transforms/packaging/python/README.md @@ -10,6 +10,8 @@ installing the python transforms will also install `data-prep-toolkit` ## List of Transforms in current package +Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on best effort but may not always be up to date. 
Users are encouraged to raise an issue in git when they discover missing components. + * code * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md) * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/header_cleanser/python/README.md) diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md index f38ece632..b7d4cf2eb 100644 --- a/transforms/packaging/ray/README.md +++ b/transforms/packaging/ray/README.md @@ -10,6 +10,8 @@ installing the Ray transforms will also install `data_prep_toolkit_transforms` a ## List of Ray Transforms availabe in current package +Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on best effort but may not always be up to date. Users are encouraged to raise an issue in git when they discover missing components. + * code * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/ray/README.md) From 33b885320634d4aed9d26bbe2c9c9f21e575b056 Mon Sep 17 00:00:00 2001 From: David Wood Date: Tue, 24 Sep 2024 12:15:58 -0400 Subject: [PATCH 29/29] fix packaging test workflow paths Signed-off-by: David Wood --- .github/workflows/test-packaging-python.yml | 8 -------- .github/workflows/test-packaging-ray.yml | 9 --------- 2 files changed, 17 deletions(-) diff --git a/.github/workflows/test-packaging-python.yml b/.github/workflows/test-packaging-python.yml index 71423541b..4ee491c8e 100644 --- a/.github/workflows/test-packaging-python.yml +++ b/.github/workflows/test-packaging-python.yml @@ -10,10 +10,6 @@ on: - "*" paths: - "transforms/packaging/python/**" - - "data-processing-lib/**" - - "!transforms/packaging/python/**/kfp_ray/**" # This is/will be tested in separate workflow - - 
"!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - "!**.md" - "!**/doc/**" - "!**/images/**" @@ -24,10 +20,6 @@ on: - "releases/**" paths: - "transforms/packaging/python/**" - - "data-processing-lib/**" - - "!transforms/packaging/python/**/kfp_ray/**" # This is/will be tested in separate workflow - - "!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - "!**.md" - "!**/doc/**" - "!**/images/**" diff --git a/.github/workflows/test-packaging-ray.yml b/.github/workflows/test-packaging-ray.yml index 54ec8ab8f..4b812540c 100644 --- a/.github/workflows/test-packaging-ray.yml +++ b/.github/workflows/test-packaging-ray.yml @@ -1,4 +1,3 @@ - name: Test - transforms/packaging/ray on: @@ -11,10 +10,6 @@ on: - "*" paths: - "transforms/packaging/ray/**" - - "data-processing-lib/**" - - "!transforms/packaging/ray/**/kfp_ray/**" # This is/will be tested in separate workflow - - "!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - "!**.md" - "!**/doc/**" - "!**/images/**" @@ -25,10 +20,6 @@ on: - "releases/**" paths: - "transforms/packaging/ray/**" - - "data-processing-lib/**" - - "!transforms/packaging/ray/**/kfp_ray/**" # This is/will be tested in separate workflow - - "!data-processing-lib/**/test/**" - - "!data-processing-lib/**/test-data/**" - "!**.md" - "!**/doc/**" - "!**/images/**"