diff --git a/.make.defaults b/.make.defaults index e1bd5275a..a7cc023ba 100644 --- a/.make.defaults +++ b/.make.defaults @@ -305,7 +305,10 @@ endif if [ ! -z "$(EXTRA_INDEX_URL)" ]; then \ extra_url='--extra-index-url $(EXTRA_INDEX_URL)'; \ fi; \ - pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR); + if [ -e $(PYTHON_PROJECT_DIR)/requirements.txt ]; then \ + pip install -r $(PYTHON_PROJECT_DIR)/requirements.txt; \ + fi; \ + pip install $(PIP_INSTALL_EXTRA_ARGS) $${extra_url} -e $(PYTHON_PROJECT_DIR) @echo Done installing source from $(PYTHON_PROJECT_DIR) into venv # Install local requirements last as it generally includes our lib source @@ -348,6 +351,11 @@ endif .defaults.ray-lib-src-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv .defaults.install-local-requirements-venv @# Help: Create the venv and install Ray library source, local dependencies and adjacent python source if present. +# Install local requirements last as it generally includes our lib source +.PHONY: .defaults.kfp-venv +.defaults.kfp-venv:: .defaults.create-venv .defaults.install-ray-lib-src-venv + @# Help: Create the venv and install Ray library source, local dependencies and adjacent python source if present. 
+ # Install all source from the repo for a ray runtime transform into an existing venv # And if there is an adjacent python dir (as for transforms), then also install that source .PHONY: .defaults.install-ray-lib-src-venv @@ -633,7 +641,7 @@ endif rm -rf dist || true rm -rf src/*egg-info || true ${PIP} install --upgrade build - ${PYTHON} -m build + ${PYTHON} -m build $(BUILD_WHEEL_EXTRA_ARG) # Publish the distribution in the dist directory, usually created with .defaults.build-dist target .PHONY: .defaults.publish-dist diff --git a/.make.versions b/.make.versions index 4346291cc..4a28c2eb3 100644 --- a/.make.versions +++ b/.make.versions @@ -19,7 +19,7 @@ DPK_MINOR_VERSION=2 DPK_MICRO_VERSION=2 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches. # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. -DPK_VERSION_SUFFIX=.dev0 +DPK_VERSION_SUFFIX=.dev1 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX) diff --git a/README.md b/README.md index 255437482..f2b46e2b7 100644 --- a/README.md +++ b/README.md @@ -76,10 +76,11 @@ conda install gcc_linux-64 conda install gxx_linux-64 ``` -Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms. +Next, install the data prep toolkit library. This library installs both the python and ray versions of the transforms. For better management of dependencies, it is recommended to install the same tagged version of both the library and the transform. 
```bash -pip3 install data-prep-toolkit-transforms-ray +pip3 install data-prep-toolkit[ray]==0.2.2 +pip3 install data-prep-toolkit-transforms[ray,all]==0.2.2 pip3 install jupyterlab ipykernel ipywidgets ## install custom kernel diff --git a/data-processing-lib/Makefile b/data-processing-lib/Makefile index a70a05ff8..fe3932195 100644 --- a/data-processing-lib/Makefile +++ b/data-processing-lib/Makefile @@ -22,6 +22,7 @@ REPOROOT=.. # Get some common rules for the whole repo include $(REPOROOT)/.make.defaults +include $(REPOROOT)/.make.versions ########## ########## ########## ########## ########## ########## ########## ########## # Global rules that are generally to be implemented in the sub-directories and can @@ -53,5 +54,12 @@ publish:: set-versions: @# Help: Recursively $@ in all subdirs + $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml @$(MAKE) RULE=$@ .recurse + +build-pkg-dist:: + $(MAKE) .defaults.build-dist BUILD_WHEEL_EXTRA_ARG=-w + +publish-dist :: .defaults.publish-dist + diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml new file mode 100644 index 000000000..db42ecf0d --- /dev/null +++ b/data-processing-lib/pyproject.toml @@ -0,0 +1,43 @@ +[project] +name = "data_prep_toolkit" +version = "0.2.2.dev1" +keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +requires-python = ">=3.10,<3.13" +description = "Data Preparation Toolkit Library for Ray and Python" +license = {text = "Apache-2.0"} +readme = {file = "README.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] + +dynamic = ["dependencies", "optional-dependencies"] + +[project_urls] +Repository = "https://github.com/IBM/data-prep-kit" +Issues = "https://github.com/IBM/data-prep-kit/issues" +Documentation = "https://ibm.github.io/data-prep-kit/" +"Transform project" = 
"https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop" + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools.dynamic.dependencies] +file = ["requirements.txt"] + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = ["requirements-dev.txt"]} +ray = { file = ["requirements-ray.txt"]} +spark = { file = ["requirements-spark.txt"]} + +[tool.setuptools.packages.find] +where = ["python/src", "ray/src", "spark/src"] + + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/data-processing-lib/python/pyproject.toml b/data-processing-lib/python/pyproject.toml index 595a1805a..f00d45a0a 100644 --- a/data-processing-lib/python/pyproject.toml +++ b/data-processing-lib/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Library" diff --git a/data-processing-lib/ray/pyproject.toml b/data-processing-lib/ray/pyproject.toml index a7f476560..1e8c335cc 100644 --- a/data-processing-lib/ray/pyproject.toml +++ b/data-processing-lib/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Ray" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = 
[ - "data-prep-toolkit>=0.2.2.dev0", + "data-prep-toolkit>=0.2.2.dev1", "ray[default]==2.36.1", # These two are to fix security issues identified by quay.io "fastapi>=0.110.2", diff --git a/data-processing-lib/requirements-dev.txt b/data-processing-lib/requirements-dev.txt new file mode 100644 index 000000000..326d62c8e --- /dev/null +++ b/data-processing-lib/requirements-dev.txt @@ -0,0 +1,9 @@ +twine +pytest>=7.3.2 +pytest-dotenv>=0.5.2 +pytest-env>=1.0.0 +pre-commit>=3.3.2 +pytest-cov>=4.1.0 +pytest-mock>=3.10.0 +moto==5.0.5 +markupsafe==2.0.1 diff --git a/data-processing-lib/requirements-ray.txt b/data-processing-lib/requirements-ray.txt new file mode 100644 index 000000000..33205cd9d --- /dev/null +++ b/data-processing-lib/requirements-ray.txt @@ -0,0 +1,3 @@ +ray[default]==2.36.1 +fastapi>=0.110.2 +pillow>=10.3.0 diff --git a/data-processing-lib/requirements-spark.txt b/data-processing-lib/requirements-spark.txt new file mode 100644 index 000000000..f38f033da --- /dev/null +++ b/data-processing-lib/requirements-spark.txt @@ -0,0 +1,2 @@ +pyspark>=3.5.2 +psutil>=6.0.0 diff --git a/data-processing-lib/requirements.txt b/data-processing-lib/requirements.txt new file mode 100644 index 000000000..7b363f2b5 --- /dev/null +++ b/data-processing-lib/requirements.txt @@ -0,0 +1,6 @@ + numpy < 1.29.0 + pyarrow==16.1.0 + boto3==1.34.69 + argparse + mmh3 + psutil diff --git a/data-processing-lib/spark/pyproject.toml b/data-processing-lib/spark/pyproject.toml index bc1cb4f67..15aedcfbd 100644 --- a/data-processing-lib/spark/pyproject.toml +++ b/data-processing-lib/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_spark" -version = "0.2.2.dev0" +version = "0.2.2.dev1" keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] requires-python = ">=3.10,<3.13" description = "Data Preparation Toolkit Library for Spark" @@ -11,7 +11,7 @@ authors = [ { name = "Boris Lublinsky", email = 
"blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev0", + "data-prep-toolkit==0.2.2.dev1", "pyspark>=3.5.2", "psutil>=6.0.0", "PyYAML>=6.0.2" diff --git a/doc/quick-start/quick-start.md b/doc/quick-start/quick-start.md index b7167df77..47d5a1f1b 100644 --- a/doc/quick-start/quick-start.md +++ b/doc/quick-start/quick-start.md @@ -59,7 +59,7 @@ or **Deploy the latest releases of the data prep toolkit library, all python transforms and all ray transforms** ```shell -pip3 install data-prep-toolkit-transforms-ray +pip3 install data-prep-toolkit-transforms[ray] ``` ## Running transforms diff --git a/examples/notebooks/Run_your_first_transform_colab.ipynb b/examples/notebooks/Run_your_first_transform_colab.ipynb index 8b99b8f36..0caed5bda 100644 --- a/examples/notebooks/Run_your_first_transform_colab.ipynb +++ b/examples/notebooks/Run_your_first_transform_colab.ipynb @@ -35,7 +35,7 @@ }, "outputs": [], "source": [ - "! pip install --default-timeout=100 data-prep-toolkit-transforms-ray\n" + "! pip install --default-timeout=100 data-prep-toolkit-transforms[ray]\n" ] }, { diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml index c0331b38b..d62342c5f 100644 --- a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v1" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -13,7 +13,7 @@ authors = [ ] dependencies = [ "kfp==1.8.22", - "data-prep-toolkit-kfp-shared==0.2.2.dev0", + "data-prep-toolkit-kfp-shared==0.2.2.dev1", ] [build-system] diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml index 220c56ad8..3dab7eac9 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_v2" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "kfp==2.8.0", "kfp-kubernetes==1.2.0", - "data-prep-toolkit-kfp-shared==0.2.2.dev0", + "data-prep-toolkit-kfp-shared==0.2.2.dev1", ] [build-system] diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml index 726d7fd5d..b2dc963d7 100644 --- a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml +++ b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "data_prep_toolkit_kfp_shared" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Data Preparation Kit Library. 
KFP support" license = {text = "Apache-2.0"} @@ -14,7 +14,7 @@ authors = [ dependencies = [ "requests", "kubernetes", - "data-prep-toolkit-ray==0.2.2.dev0", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/.make.transforms b/transforms/.make.transforms index cd6aa84ac..5034b3122 100644 --- a/transforms/.make.transforms +++ b/transforms/.make.transforms @@ -342,4 +342,11 @@ minio-stop: > tt.toml; \ mv tt.toml pyproject.toml; \ fi + if [ -e requirements.txt ]; then \ + cat requirements.txt | sed \ + -e 's/\(dpk[_-].*transform[_-]python[=<>~][=]\).*/\1$(TRANSFORM_PYTHON_VERSION)/' \ + > tt.txt; \ + mv tt.txt requirements.txt; \ + fi + diff --git a/transforms/.make.workflows b/transforms/.make.workflows index cd4c2532a..b151e87e6 100644 --- a/transforms/.make.workflows +++ b/transforms/.make.workflows @@ -52,7 +52,7 @@ endif ${WORKFLOW_VENV_ACTIVATE}: ${REPOROOT}/.make.versions ${REPOROOT}/kfp/kfp_ray_components/requirements.txt ${DPK_RAY_LIB_DIR} ${KFP_LIB_SRC_FILES} ${KFP_LIB_CONFIG_FILE} ${KFP_SHARED_LIB_SRC_FILES} rm -rf ${REPOROOT}/transforms/venv - $(MAKE) -C ${REPOROOT}/transforms .defaults.ray-lib-src-venv + $(MAKE) -C ${REPOROOT}/transforms .defaults.kfp-venv . 
${WORKFLOW_VENV_ACTIVATE}; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/shared_workflow_support; \ pip install -e $(REPOROOT)/kfp/kfp_support_lib/$(WORKFLOW_SUPPORT_LIB); \ diff --git a/transforms/Makefile b/transforms/Makefile index 2f8fa27b1..63e635898 100644 --- a/transforms/Makefile +++ b/transforms/Makefile @@ -1,6 +1,8 @@ REPOROOT=../ # Use make help, to see the available rules include ../.make.defaults +include ./transform.config + setup:: @# Help: Recursively make $@ all subdirs @@ -78,4 +80,54 @@ workflow-upload:: set-versions:: @# Help: Recursively make $@ in all subdirs + make set-pkg-version @$(MAKE) RULE=$@ .recurse + +set-pkg-version: + @# Help: Set tag for this package and its dependencies + cat pyproject.toml | sed -e \ + 's/^version[ ]*=.*/version = "'${TRANSFORMS_PKG_VERSION}'"/' \ + > tt + mv tt pyproject.toml + echo $(DPK_VERSION) + cat requirements.txt | sed -e \ + 's/data-prep-toolkit\([=><~][=]\).*/data-prep-toolkit\1$(DPK_VERSION)/' \ + > tt + mv tt requirements.txt + cat requirements-ray.txt | sed -e \ + 's/data-prep-toolkit\[ray\]\([=><~][=]\).*/data-prep-toolkit\[ray\]\1$(DPK_VERSION)/' \ + > tt + mv tt requirements-ray.txt + + +build-pkg-dist: + @# Help: Build package wheel + ## Most transforms today don't have a package name.... Need to fix that + ## In the meantime, we will copy everything to a single folder + -rm -fr src + mkdir src + # Copy all the src folders recursively (not clear if they have subfolders) + for x in $(shell find . | grep '[ray| python]/src$$') ; do \ + echo $$x ; \ + if [ -d "$$x" ]; then \ + cp -r $$x/* src ; \ + fi \ + done + # Only needs to build the whl + $(MAKE) BUILD_WHEEL_EXTRA_ARG=-w .defaults.build-dist + -rm -fr src + +test-pkg-dist: + @# Help: Setup environment and run unit tests for all transforms. 
+ -rm -fr venv + python -m venv venv + source venv/bin/activate && $(PYTHON) -m pip install '$(REPOROOT)/data-processing-lib/dist/data_prep_toolkit-$(DPK_VERSION)-py3-none-any.whl[dev,ray]' + source venv/bin/activate && $(PYTHON) -m pip install 'dist/data_prep_toolkit_transforms-$(DPK_TRANSFORMS_VERSION)-py3-none-any.whl[all]' + for T in $(shell find . | grep '[ray| python]/test$$') ; do \ + echo "running unit test on: $$T" ; \ + source venv/bin/activate && $(PYTEST) $$T; \ + done; + @# Help: Setup environment and run unit tests for all transforms + +publish-dist :: .defaults.publish-dist + diff --git a/transforms/packaging/python/README.md b/transforms/README-list.md similarity index 94% rename from transforms/packaging/python/README.md rename to transforms/README-list.md index 20eb0dff0..99885ad34 100644 --- a/transforms/packaging/python/README.md +++ b/transforms/README-list.md @@ -5,9 +5,14 @@ The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: `python -m pip install data-prep-toolkit-transforms` +or +`python -m pip install data-prep-toolkit-transforms[ray]` + installing the python transforms will also install `data-prep-toolkit` +installing the ray transforms will also install `data-prep-toolkit[ray]` + ## List of Transforms in current package Note: This list includes the transforms that were part of the release starting with data-prep-toolkit-transforms:0.2.1. This list may not always reflect up to date information. Users are encourage to raise an issue in git when they discover missing components or packages that are listed below but not in the current release they get from pypi. 
diff --git a/transforms/code/code2parquet/python/Dockerfile b/transforms/code/code2parquet/python/Dockerfile index b36b6a6c4..f94301a9c 100644 --- a/transforms/code/code2parquet/python/Dockerfile +++ b/transforms/code/code2parquet/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml index 34a668bf0..0c115efc3 100644 --- a/transforms/code/code2parquet/python/pyproject.toml +++ b/transforms/code/code2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "code2parquet Python Transform" license = {text = "Apache-2.0"} @@ -9,16 +9,15 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "parameterized", - "pandas", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt new file mode 100644 index 000000000..45f677e77 --- /dev/null +++ b/transforms/code/code2parquet/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev1 +parameterized +pandas diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml index 
3f8808037..120d080dc 100644 --- a/transforms/code/code2parquet/ray/pyproject.toml +++ b/transforms/code/code2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code2parquet_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "code2parquet Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.2.dev0", - "dpk-code2parquet-transform-python==0.2.2.dev0", + "data-prep-toolkit-ray==0.2.2.dev1", + "dpk-code2parquet-transform-python==0.2.2.dev1", "parameterized", "pandas", ] diff --git a/transforms/code/code_quality/python/Dockerfile b/transforms/code/code_quality/python/Dockerfile index 76cf1de30..b25a57ca1 100644 --- a/transforms/code/code_quality/python/Dockerfile +++ b/transforms/code/code_quality/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
#COPY requirements.txt requirements.txt diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml index 58e2affa7..b217060f5 100644 --- a/transforms/code/code_quality/python/pyproject.toml +++ b/transforms/code/code_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Code Quality Python Transform" license = {text = "Apache-2.0"} @@ -8,16 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "bs4==0.0.2", - "transformers==4.38.2", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt new file mode 100644 index 000000000..4ee249788 --- /dev/null +++ b/transforms/code/code_quality/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev1 +bs4==0.0.2 +transformers==4.38.2 diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml index 78ded1ce0..457678a6e 100644 --- a/transforms/code/code_quality/ray/pyproject.toml +++ b/transforms/code/code_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_code_quality_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Code Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - 
"dpk-code-quality-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-code-quality-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/code/header_cleanser/python/Dockerfile b/transforms/code/header_cleanser/python/Dockerfile index c2e215904..84831bcd2 100644 --- a/transforms/code/header_cleanser/python/Dockerfile +++ b/transforms/code/header_cleanser/python/Dockerfile @@ -27,6 +27,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy source data diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml index c4326b4a0..79dee12a1 100644 --- a/transforms/code/header_cleanser/python/pyproject.toml +++ b/transforms/code/header_cleanser/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "License and Copyright Removal Transform for Python" license = {text = "Apache-2.0"} @@ -8,15 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "scancode-toolkit==32.1.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt new file mode 100644 index 000000000..4502a5fdb --- /dev/null +++ 
b/transforms/code/header_cleanser/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev1 +scancode-toolkit==32.1.0 ; platform_system != 'Darwin' + diff --git a/transforms/code/header_cleanser/ray/pyproject.toml b/transforms/code/header_cleanser/ray/pyproject.toml index 7509027a1..f99feaba7 100644 --- a/transforms/code/header_cleanser/ray/pyproject.toml +++ b/transforms/code/header_cleanser/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_header_cleanser_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "License and copyright removal Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Yash kalathiya", email = "yashkalathiya164@gmail.com" }, ] dependencies = [ - "dpk-header-cleanser-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-header-cleanser-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", "scancode-toolkit==32.1.0", ] diff --git a/transforms/code/license_select/python/Dockerfile b/transforms/code/license_select/python/Dockerfile index 6831306c3..2fa9f9426 100644 --- a/transforms/code/license_select/python/Dockerfile +++ b/transforms/code/license_select/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy source data diff --git a/transforms/code/license_select/python/pyproject.toml b/transforms/code/license_select/python/pyproject.toml index 1058b0440..740c5ccbb 100644 --- a/transforms/code/license_select/python/pyproject.toml +++ b/transforms/code/license_select/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_license_select_transform_python" -version = "0.2.1.dev0" -requires-python = ">=3.10" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" description = "License Select Python Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -9,14 +9,16 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.1.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/license_select/python/requirements.txt b/transforms/code/license_select/python/requirements.txt new file mode 100644 index 000000000..82723b6ef --- /dev/null +++ b/transforms/code/license_select/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev1 \ No newline at end of file diff --git a/transforms/code/license_select/ray/pyproject.toml b/transforms/code/license_select/ray/pyproject.toml index 89b4b9ea5..0d0634ef3 100644 --- a/transforms/code/license_select/ray/pyproject.toml +++ b/transforms/code/license_select/ray/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_license_select_transform_ray" -version = "0.2.1.dev0" -requires-python = ">=3.10" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" description = "License Select Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = 
"text/markdown"} @@ -10,8 +10,8 @@ authors = [ { name = "Mark Lewis", email = "mark_lewis@uk.ibm.com" }, ] dependencies = [ - "dpk-license-select-transform-python==0.2.1.dev0", - "data-prep-toolkit-ray==0.2.1.dev0", + "dpk-license-select-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/code/malware/python/pyproject.toml b/transforms/code/malware/python/pyproject.toml index 256a10b79..53af8fa4e 100644 --- a/transforms/code/malware/python/pyproject.toml +++ b/transforms/code/malware/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Malware Python Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev0", + "data-prep-toolkit==0.2.2.dev1", "clamd==1.0.2", ] diff --git a/transforms/code/malware/ray/pyproject.toml b/transforms/code/malware/ray/pyproject.toml index cf454e856..6abc2af60 100644 --- a/transforms/code/malware/ray/pyproject.toml +++ b/transforms/code/malware/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_malware_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Malware Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Takuya Goto", email = "tkyg@jp.ibm.com" }, ] dependencies = [ - "dpk-malware-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-malware-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/code/proglang_select/python/Dockerfile b/transforms/code/proglang_select/python/Dockerfile index a94d9d960..3186862f0 100644 --- a/transforms/code/proglang_select/python/Dockerfile +++ b/transforms/code/proglang_select/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd 
data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy the main() entry point to the image diff --git a/transforms/code/proglang_select/python/pyproject.toml b/transforms/code/proglang_select/python/pyproject.toml index 25aa5fdcf..b120e5064 100644 --- a/transforms/code/proglang_select/python/pyproject.toml +++ b/transforms/code/proglang_select/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Python Transform" license = {text = "Apache-2.0"} @@ -8,14 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/code/proglang_select/python/requirements.txt b/transforms/code/proglang_select/python/requirements.txt new file mode 100644 index 000000000..82723b6ef --- /dev/null +++ b/transforms/code/proglang_select/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev1 \ No newline at end of file diff --git a/transforms/code/proglang_select/ray/pyproject.toml b/transforms/code/proglang_select/ray/pyproject.toml index 1730ab04f..a74372f49 100644 --- a/transforms/code/proglang_select/ray/pyproject.toml +++ b/transforms/code/proglang_select/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_proglang_select_transform_ray" -version = "0.2.2.dev0" +version = 
"0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Programming Language Selection Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Shivdeep Singh", email = "shivdeep.singh@ibm.com" }, ] dependencies = [ - "dpk-proglang-select-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-proglang-select-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/code/repo_level_ordering/Makefile b/transforms/code/repo_level_ordering/Makefile index 04b1cc451..cfcd22d8a 100644 --- a/transforms/code/repo_level_ordering/Makefile +++ b/transforms/code/repo_level_ordering/Makefile @@ -1,6 +1,7 @@ REPOROOT=../../.. # Use make help, to see the available rules include $(REPOROOT)/.make.defaults +include transform.config setup:: @# Help: Recursively make $@ all subdirs @@ -47,25 +48,41 @@ load-image:: .PHONY: workflow-venv workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ + @is_blacklisted=$$(cd $(REPOROOT); bash scripts/check-workflows.sh -show-kfp-black-list | grep $(TRANSFORM_NAME)); \ + if [ -z "$$is_blacklisted" ]; \ + then \ + echo $(MAKE) -C kfp_ray $@ ; \ + else \ + echo "Skipping KFP workflow: Transform is blacklisted " ; \ fi .PHONY: workflow-test workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ + @is_blacklisted=$$(cd $(REPOROOT); bash scripts/check-workflows.sh -show-kfp-black-list | grep $(TRANSFORM_NAME)); \ + if [ -z "$$is_blacklisted" ]; \ + then \ + echo $(MAKE) -C kfp_ray $@ ; \ + else \ + echo "Skipping KFP workflow: Transform is blacklisted " ; \ fi .PHONY: workflow-upload workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ + @is_blacklisted=$$(cd $(REPOROOT); bash scripts/check-workflows.sh -show-kfp-black-list | grep $(TRANSFORM_NAME)); \ + if [ -z "$$is_blacklisted" ]; \ + then \ + echo $(MAKE) -C kfp_ray $@ ; \ + else \ + echo "Skipping KFP 
workflow: Transform is blacklisted " ; \ fi .PHONY: workflow-build workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ + @is_blacklisted=$$(cd $(REPOROOT); bash scripts/check-workflows.sh -show-kfp-black-list | grep $(TRANSFORM_NAME)); \ + if [ -z "$$is_blacklisted" ]; \ + then \ + [ ! -f kfp_ray/Makefile ] || $(MAKE) -C kfp_ray $@ ; \ + else \ + echo "Skipping KFP workflow: Transform is blacklisted " ; \ fi diff --git a/transforms/code/repo_level_ordering/ray/pyproject.toml b/transforms/code/repo_level_ordering/ray/pyproject.toml index e87d133e0..f66d2c9d1 100644 --- a/transforms/code/repo_level_ordering/ray/pyproject.toml +++ b/transforms/code/repo_level_ordering/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_repo_level_order_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "repo_level_order Ray Transform" license = {text = "Apache-2.0"} @@ -11,7 +11,7 @@ authors = [ { name = "Shanmukha Guttula", email = "shagutt1@in.ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.2.dev0", + "data-prep-toolkit-ray==0.2.2.dev1", "networkx==3.3", "colorlog==6.8.2", "func-timeout==4.3.5", diff --git a/transforms/language/doc_chunk/python/Dockerfile b/transforms/language/doc_chunk/python/Dockerfile index 8efb3845b..d399a77ed 100644 --- a/transforms/language/doc_chunk/python/Dockerfile +++ b/transforms/language/doc_chunk/python/Dockerfile @@ -21,6 +21,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .
# copy transform main() entry point to the image diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml index 7705779b0..eeff859f0 100644 --- a/transforms/language/doc_chunk/python/pyproject.toml +++ b/transforms/language/doc_chunk/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "chunk documents Python Transform" license = {text = "Apache-2.0"} @@ -10,16 +10,15 @@ authors = [ { name = "Panos Vagenas", email = "pva@zurich.ibm.com" }, { name = "Christoph Auer", email = "cau@zurich.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "docling-core==1.3.0", - "llama-index-core>=0.11.0,<0.12.0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt new file mode 100644 index 000000000..d532510ba --- /dev/null +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev1 +docling-core==1.3.0 +llama-index-core>=0.11.0,<0.12.0 diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml index 6bab175b8..ac3f5218e 100644 --- a/transforms/language/doc_chunk/ray/pyproject.toml +++ b/transforms/language/doc_chunk/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_chunk_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "chunk documents Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Christoph Auer", email = 
"cau@zurich.ibm.com" }, ] dependencies = [ - "dpk-doc-chunk-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-doc-chunk-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/language/doc_quality/python/Dockerfile b/transforms/language/doc_quality/python/Dockerfile index 78b769dd7..10dca4999 100644 --- a/transforms/language/doc_quality/python/Dockerfile +++ b/transforms/language/doc_quality/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy transform main() entry point to the image diff --git a/transforms/language/doc_quality/python/pyproject.toml b/transforms/language/doc_quality/python/pyproject.toml index 8ebec8fe3..c4c9b2805 100644 --- a/transforms/language/doc_quality/python/pyproject.toml +++ b/transforms/language/doc_quality/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Document Quality Python Transform" license = {text = "Apache-2.0"} @@ -8,14 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ diff --git a/transforms/language/doc_quality/python/requirements.txt b/transforms/language/doc_quality/python/requirements.txt new file mode 100644 index 000000000..25bf48702 --- /dev/null +++ 
b/transforms/language/doc_quality/python/requirements.txt @@ -0,0 +1,2 @@ + +data-prep-toolkit==0.2.2.dev1 diff --git a/transforms/language/doc_quality/ray/pyproject.toml b/transforms/language/doc_quality/ray/pyproject.toml index 0588c1997..a4aba9a3a 100644 --- a/transforms/language/doc_quality/ray/pyproject.toml +++ b/transforms/language/doc_quality/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_quality_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Document Quality Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-doc_quality-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0" + "dpk-doc_quality-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1" ] [build-system] diff --git a/transforms/language/html2parquet/python/pyproject.toml b/transforms/language/html2parquet/python/pyproject.toml index 9cb33f5c3..0f78a62dd 100644 --- a/transforms/language/html2parquet/python/pyproject.toml +++ b/transforms/language/html2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_html2parquet_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/html2parquet/python/requirements.txt b/transforms/language/html2parquet/python/requirements.txt index 69d487445..8b507cedd 100644 --- a/transforms/language/html2parquet/python/requirements.txt +++ b/transforms/language/html2parquet/python/requirements.txt @@ -1,2 +1,2 @@ -data-prep-toolkit==0.2.2.dev0 +data-prep-toolkit==0.2.2.dev1 trafilatura==1.12.0 diff --git a/transforms/language/html2parquet/ray/pyproject.toml b/transforms/language/html2parquet/ray/pyproject.toml index d1153a91a..5d2af9043 100644 --- 
a/transforms/language/html2parquet/ray/pyproject.toml +++ b/transforms/language/html2parquet/ray/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_html2parquet_transform_ray" -version = "0.2.2.dev0" -requires-python = ">=3.10" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" description = "HTML2PARQUET Python Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} diff --git a/transforms/language/html2parquet/ray/requirements.txt b/transforms/language/html2parquet/ray/requirements.txt index dc796d602..d4c7abc1b 100644 --- a/transforms/language/html2parquet/ray/requirements.txt +++ b/transforms/language/html2parquet/ray/requirements.txt @@ -1,2 +1,3 @@ -dpk-html2parquet-transform-python==0.2.2.dev0 -data-prep-toolkit-ray==0.2.2.dev0 +dpk-html2parquet-transform-python==0.2.2.dev1 +data-prep-toolkit-ray==0.2.2.dev1 +trafilatura==1.12.0 \ No newline at end of file diff --git a/transforms/language/lang_id/python/Dockerfile b/transforms/language/lang_id/python/Dockerfile index 131748480..f1bcc1bdd 100644 --- a/transforms/language/lang_id/python/Dockerfile +++ b/transforms/language/lang_id/python/Dockerfile @@ -25,6 +25,7 @@ USER dpk COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# clean up apt diff --git a/transforms/language/lang_id/python/pyproject.toml b/transforms/language/lang_id/python/pyproject.toml index 54c874a36..35406abc3 100644 --- a/transforms/language/lang_id/python/pyproject.toml +++ b/transforms/language/lang_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Language Identification Python Transform" license = {text = "Apache-2.0"} @@ -8,18 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Daiki Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "fasttext==0.9.2", - "langcodes==3.3.0", - "huggingface-hub >= 0.21.4, <1.0.0", - "numpy==1.26.4", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/lang_id/python/requirements.txt b/transforms/language/lang_id/python/requirements.txt new file mode 100644 index 000000000..d195ebfbb --- /dev/null +++ b/transforms/language/lang_id/python/requirements.txt @@ -0,0 +1,5 @@ +data-prep-toolkit==0.2.2.dev1 +fasttext==0.9.2 +langcodes==3.3.0 +huggingface-hub >= 0.21.4, <1.0.0 +numpy==1.26.4 diff --git a/transforms/language/lang_id/ray/pyproject.toml b/transforms/language/lang_id/ray/pyproject.toml index ac4558675..60ff39947 100644 --- a/transforms/language/lang_id/ray/pyproject.toml +++ b/transforms/language/lang_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_lang_id_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Language Identification Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Daiki 
Tsuzuku", email = "dtsuzuku@jp.ibm.com" } ] dependencies = [ - "dpk-lang_id-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-lang_id-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/language/pdf2parquet/python/pyproject.toml b/transforms/language/pdf2parquet/python/pyproject.toml index 804f2ff7c..c069300b5 100644 --- a/transforms/language/pdf2parquet/python/pyproject.toml +++ b/transforms/language/pdf2parquet/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pdf2parquet_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "PDF2PARQUET Python Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/pdf2parquet/python/requirements.txt b/transforms/language/pdf2parquet/python/requirements.txt index e275fdc2a..d959b9e38 100644 --- a/transforms/language/pdf2parquet/python/requirements.txt +++ b/transforms/language/pdf2parquet/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev0 +data-prep-toolkit==0.2.2.dev1 docling-core==1.3.0 docling-ibm-models==1.1.7 deepsearch-glm==0.21.0 diff --git a/transforms/language/pdf2parquet/ray/pyproject.toml b/transforms/language/pdf2parquet/ray/pyproject.toml index 39939adae..5699dcc8c 100644 --- a/transforms/language/pdf2parquet/ray/pyproject.toml +++ b/transforms/language/pdf2parquet/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pdf2parquet_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "PDF2PARQUET Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/language/pdf2parquet/ray/requirements.txt b/transforms/language/pdf2parquet/ray/requirements.txt index 1db94ff1c..1577d024f 100644 --- a/transforms/language/pdf2parquet/ray/requirements.txt +++ b/transforms/language/pdf2parquet/ray/requirements.txt @@ -1,5 +1,5 @@ 
-dpk-pdf2parquet-transform-python==0.2.2.dev0 -data-prep-toolkit-ray==0.2.2.dev0 +dpk-pdf2parquet-transform-python==0.2.2.dev1 +data-prep-toolkit-ray==0.2.2.dev1 docling-core==1.3.0 docling-ibm-models==1.1.7 deepsearch-glm==0.21.0 diff --git a/transforms/language/pii_redactor/python/Dockerfile b/transforms/language/pii_redactor/python/Dockerfile index 64b92e1b6..437bf8220 100644 --- a/transforms/language/pii_redactor/python/Dockerfile +++ b/transforms/language/pii_redactor/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy transform main() entry point to the image diff --git a/transforms/language/pii_redactor/python/pyproject.toml b/transforms/language/pii_redactor/python/pyproject.toml index 55d4e8970..3f9ddaaad 100644 --- a/transforms/language/pii_redactor/python/pyproject.toml +++ b/transforms/language/pii_redactor/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "PII redactor Transform for Python" license = {text = "Apache-2.0"} @@ -8,18 +8,15 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Sowmya.L.R", email = "lrsowmya@gmail.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "presidio-analyzer>=2.2.355", - "presidio-anonymizer>=2.2.355", - "flair>=0.14.0", - "pandas>=2.2.2", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/pii_redactor/python/requirements.txt 
b/transforms/language/pii_redactor/python/requirements.txt new file mode 100644 index 000000000..6969b83b9 --- /dev/null +++ b/transforms/language/pii_redactor/python/requirements.txt @@ -0,0 +1,5 @@ +data-prep-toolkit==0.2.2.dev1 +presidio-analyzer>=2.2.355 +presidio-anonymizer>=2.2.355 +flair>=0.14.0 +pandas>=2.2.2 diff --git a/transforms/language/pii_redactor/ray/pyproject.toml b/transforms/language/pii_redactor/ray/pyproject.toml index 4283df428..1ef96511a 100644 --- a/transforms/language/pii_redactor/ray/pyproject.toml +++ b/transforms/language/pii_redactor/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_pii_redactor_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "PII Redactor Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk_pii_redactor_transform_python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk_pii_redactor_transform_python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", "presidio-analyzer>=2.2.355", "presidio-anonymizer>=2.2.355", "flair>=0.14.0", diff --git a/transforms/language/text_encoder/Makefile b/transforms/language/text_encoder/Makefile index bca6f7e85..29357c272 100644 --- a/transforms/language/text_encoder/Makefile +++ b/transforms/language/text_encoder/Makefile @@ -55,25 +55,25 @@ docker-save-image:: .PHONY: workflow-venv workflow-venv: - if [ -e kfp_ray ]; then \ + if [ -e kfp_ray ] && [ -f kfp_ray/Makefile ]; then \ $(MAKE) -C kfp_ray workflow-venv; \ fi .PHONY: workflow-test workflow-test: - if [ -e kfp_ray ]; then \ + if [ -e kfp_ray ] && [ -f kfp_ray/Makefile ]; then \ $(MAKE) -C kfp_ray workflow-test; \ fi .PHONY: workflow-upload workflow-upload: - if [ -e kfp_ray ]; then \ + if [ -e kfp_ray ] && [ -f kfp_ray/Makefile ]; then \ $(MAKE) -C kfp_ray workflow-upload; \ fi .PHONY: workflow-build workflow-build: - if [ -e kfp_ray ]; then \ + 
if [ -e kfp_ray ] && [ -f kfp_ray/Makefile ]; then \ $(MAKE) -C kfp_ray workflow-build; \ fi diff --git a/transforms/language/text_encoder/python/Dockerfile b/transforms/language/text_encoder/python/Dockerfile index 676968fee..86023a440 100644 --- a/transforms/language/text_encoder/python/Dockerfile +++ b/transforms/language/text_encoder/python/Dockerfile @@ -19,7 +19,8 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e . # copy transform main() entry point to the image diff --git a/transforms/language/text_encoder/python/pyproject.toml b/transforms/language/text_encoder/python/pyproject.toml index e9f84fefd..65cb16b5b 100644 --- a/transforms/language/text_encoder/python/pyproject.toml +++ b/transforms/language/text_encoder/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Text Encoder Python Transform" license = {text = "Apache-2.0"} @@ -10,15 +10,15 @@ authors = [ { name = "Panos Vagenas", email = "pva@zurich.ibm.com" }, { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "sentence-transformers==3.0.1", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/language/text_encoder/python/requirements.txt b/transforms/language/text_encoder/python/requirements.txt new file mode 100644 index 000000000..aab7681dc --- /dev/null +++ 
b/transforms/language/text_encoder/python/requirements.txt @@ -0,0 +1,2 @@ +data-prep-toolkit==0.2.2.dev1 +sentence-transformers==3.0.1 diff --git a/transforms/language/text_encoder/ray/pyproject.toml b/transforms/language/text_encoder/ray/pyproject.toml index 2735856aa..95e29638f 100644 --- a/transforms/language/text_encoder/ray/pyproject.toml +++ b/transforms/language/text_encoder/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_text_encoder_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Text Encoder Ray Transform" license = {text = "Apache-2.0"} @@ -11,8 +11,8 @@ authors = [ { name = "Peter Staar", email = "taa@zurich.ibm.com" }, ] dependencies = [ - "dpk-text_encoder-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-text_encoder-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/packaging/.gitignore b/transforms/packaging/.gitignore deleted file mode 100644 index 863607847..000000000 --- a/transforms/packaging/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -**/src -**/dist -**/*.egg-info -**/build - diff --git a/transforms/packaging/.make.packaging b/transforms/packaging/.make.packaging deleted file mode 100644 index 29506aaf1..000000000 --- a/transforms/packaging/.make.packaging +++ /dev/null @@ -1,88 +0,0 @@ -ifndef T_SET -T_SET=all -endif - -# Defines the version of the wheel for the package transforms -# If you change this value, you will need to run "make set-versions" to -# apply the new version number to the toml files. 
-DPK_TRANSFORMS_VERSION=$(DPK_VERSION) - - -venv: - $(MAKE) .defaults.create-venv - -test:: test-src - -clean:: .transforms.clean - -rm -fr src - -image:: .transforms.python-image - -run-ut:: - source venv/bin/activate; \ - if [ -e requirements.test.txt ]; then \ - $(PYTHON) -m pip install -r requirements.test.txt ; \ - fi; \ - for T in $(TRANSFORMS_NAMES); do \ - echo running unit test on: $$T ; \ - $(PYTEST) $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/test; \ - done; - @# Help: Setup environment and run unit tests for all transforms - - -setup: .transforms.setup venv - $(MAKE) src - source venv/bin/activate; \ - $(PYTHON) -m pip install . - @# Help: Do any default transform setup before running make src and setting up a test environment - - -requirements: - if [ -e requirements.$(T_SET).txt ]; then \ - cp requirements.$(T_SET).txt requirements.txt ; \ - fi - -pkg-name: - if [ $(TRANSFORM_PKG) ]; then \ - cat pyproject.toml | sed -e \ - 's/^name[ ]*=.*/name = "'${TRANSFORM_PKG}'"/' \ - > tt.toml; \ - mv tt.toml pyproject.toml; \ - fi - -is-patch: - if [ $(IS_PATCH) ]; then \ - cat pyproject.toml | sed -e \ - 's/^version[ ]*=[ ]*"\(.*\).dev.*/version = "\1"/' \ - > tt.toml; \ - mv tt.toml pyproject.toml; \ - fi - -##################################################### -# to build a patched release, use make IS_PATCH=1 src -##################################################### -src: - mkdir src - make requirements - make pkg-name - make is-patch - for T in $(shell echo $(TRANSFORMS_NAMES)); do \ - echo copy src from $$T ; \ - cp -R $(REPOROOT)/transforms/$$T/$(PACKAGING_RUN_TIME)/src/* src ; \ - rm -fr *.egg-info ; \ - rm -fr dist ; \ - rm -fr build ; \ - done; - @# Help: Setup src folder and remove old distribution. to setup for a patched release use: make IS_PATCH=1 $@ - - -build:: build-dist - -publish:: publish-dist - -build-dist:: src .defaults.build-dist - @# Help: build the distribution for publishing to pypi. 
to build a patch release (no .devN) use: make IS_PATCH=1 $@ - -publish-dist:: .defaults.publish-dist - - diff --git a/transforms/packaging/Makefile b/transforms/packaging/Makefile deleted file mode 100644 index aa75d525e..000000000 --- a/transforms/packaging/Makefile +++ /dev/null @@ -1,60 +0,0 @@ -REPOROOT=../../ -# Use make help, to see the available rules -include ../../.make.defaults - -setup:: - -clean:: - # Clean up workflows common virtual environment. - rm -rf venv || true - rm -rf *.back || true - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse - -src:: - @# Help: Recursively setup $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -setup:: - -build:: - -build-dist:: - @# Help: Recursively build distributions in all subdirs - $(MAKE) RULE=$@ .recurse - -publish-dist:: - @# Help: Recursively publish distributions in all subdirs - $(MAKE) RULE=$@ .recurse - -venv:: - -image:: - -publish:: - -test-image:: - -test:: - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: - -workflow-venv:: - -workflow-test:: - -workflow-build:: - -workflow-upload:: - -set-versions:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse diff --git a/transforms/packaging/README.md b/transforms/packaging/README.md deleted file mode 100644 index e0d23ad52..000000000 --- a/transforms/packaging/README.md +++ /dev/null @@ -1,55 +0,0 @@ -# Transforms Pacakges for both Python and Ray - -Most available Transforms can be published to pypi as a single package. A detailed list of available Python transforms is available at this [link](python/README.md). 
Similarly the following [link](ray/README.md) provide a derailed list and installation instructions for Ray transforms - - - -## Clone folder and update version number -```` -git clone https://github.com/IBM/data-prep-kit.git package-release -cd package-release -```` -in `.make.versions`, Set the values for DPK_MAJOR_VERSION, DPK_MINOR_VERSION and DPK_MICRO_VERSION to specify the DPK library to use and as appropriate, set the value for `DPK_TRANSFORMS_VERSION` that will be used to tag the latest version released to pypi - -`make set-versions` - -## Creating src folder - -Given that the transforms do not currently have their own name spaces, the first step is to copy all the transforms to the same src folder prior to running unit tests of the individual transforms and/or building the distribution: - - -```` -cd transforms/packaging -make clean -make src -```` - -## Build and Test - -This procedure will run all the UT for each individual transforms using a single package configuration: - -```` -cd transforms/packaging -make clean -make src -make test-src -```` - -## Build and Deploy - -This procedure will buid two wheels: one for the python transforms and one for the ray transforms. - -```` -cd transforms/packaging -make clean -make src -make build-dist -```` - -To publish the wheels to pypi.org, run: - -`make publish-dist` - - - - diff --git a/transforms/packaging/python/Makefile b/transforms/packaging/python/Makefile deleted file mode 100644 index 6a0a355de..000000000 --- a/transforms/packaging/python/Makefile +++ /dev/null @@ -1,89 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -include $(REPOROOT)/transforms/.make.transforms -include ../.make.packaging - -PACKAGING_RUN_TIME=python - -ifeq ($(T_SET), all) -# Cannot combine language/html2parquet with pdf2parquet due to: -#The conflict is caused by: -# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 -# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -TRANSFORMS_NAMES = code/code_quality \ - code/code2parquet \ - code/header_cleanser \ - code/proglang_select \ - language/doc_chunk \ - language/doc_quality \ - language/lang_id \ - language/pdf2parquet \ - language/pii_redactor \ - language/text_encoder \ - universal/tokenization \ - universal/ededup \ - /universal/doc_id \ - universal/filter \ - universal/resize -TRANSFORM_PKG = "data_prep_toolkit_transforms" -endif - -ifeq ($(T_SET), lang1) -TRANSFORMS_NAMES = language/doc_quality \ - language/lang_id \ - language/text_encoder \ - language/html2parquet \ - universal/tokenization \ - universal/ededup \ - /universal/doc_id \ - universal/filter \ - universal/resize -TRANSFORM_PKG = "data_prep_toolkit_transforms_lang1" -endif - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions - -test-src:: - $(MAKE) src - $(MAKE) .transforms.python-venv - $(MAKE) run-ut - @# Help: Do any default transform setup before running make src and setting up a test environment - -test-with-pypi: - $(MAKE) src - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install . 
- $(MAKE) run-ut - @# Help: Load dependencies from pypi and run all unit tests: final step in verification BEFORE deploying to pypi) - - -test-wheel: - -rm -fr venv - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install dist/*.whl - $(MAKE) run-ut - @# Help: Load wheel from local folder and run all unit tests - - - -test-latest-patch: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install $(TRANSFORM_PKG) - $(MAKE) run-ut - @# Help: Load wheel from pypi and run all unit tests: final step in verification AFTER deploying to pypi) - - - diff --git a/transforms/packaging/python/pyproject.toml b/transforms/packaging/python/pyproject.toml deleted file mode 100644 index 8d760515a..000000000 --- a/transforms/packaging/python/pyproject.toml +++ /dev/null @@ -1,39 +0,0 @@ -[project] -name = "data_prep_toolkit_transforms" -version = "0.2.2.dev0" -requires-python = ">=3.10,<3.13" -keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -description = "Data Preparation Toolkit Transforms" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Maroun Touma", email = "touma@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] - - - - - - diff --git a/transforms/packaging/python/requirements.all.txt 
b/transforms/packaging/python/requirements.all.txt deleted file mode 100644 index c1246fba9..000000000 --- a/transforms/packaging/python/requirements.all.txt +++ /dev/null @@ -1,51 +0,0 @@ -data-prep-toolkit>=0.2.1 -# code quality -bs4==0.0.2 -transformers==4.38.2 -#pdf2parquet -docling-core==1.3.0 -docling-ibm-models==1.1.7 -deepsearch-glm==0.21.0 -docling==1.11.0, -filetype >=1.2.0, <2.0.0 -#Doc chunking -docling-core==1.3.0, -llama-index-core>=0.11.0,<0.12.0, -#filter -duckdb>=0.10.1 -#langid -fasttext==0.9.2 -langcodes==3.3.0 -huggingface-hub >= 0.21.4, <1.0.0 -numpy==1.26.4 -#fdedup -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy>=1.12.0, <2.0.0 -# ededup -mmh3>=4.1.0 -xxhash==3.4.1 -#code2parquet -pandas -parameterized -#header cleanser -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -#text_encoder -sentence-transformers==3.0.1 -# PII-redactor -presidio-analyzer>=2.2.355 -presidio-anonymizer>=2.2.355 -flair>=0.14.0 -pandas>=2.2.2 -#html2parquet -#INFO: pip is looking at multiple versions of trafilatura to determine which version is compatible with other requirements. This could take a while. 
-#The conflict is caused by: -# docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 -# trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" -#trafilatura==1.12.0 -#tokenization -transformers==4.38.2 - - - diff --git a/transforms/packaging/python/requirements.lang1.txt b/transforms/packaging/python/requirements.lang1.txt deleted file mode 100644 index 1c7289f64..000000000 --- a/transforms/packaging/python/requirements.lang1.txt +++ /dev/null @@ -1,32 +0,0 @@ -data-prep-toolkit>=0.2.1 -#filter -duckdb>=0.10.1 -#langid -fasttext==0.9.2 -langcodes==3.3.0 -huggingface-hub >= 0.21.4, <1.0.0 -numpy==1.26.4 -#fdedup -mmh3>=4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -scipy==1.12.0 -# ededup -mmh3>=4.1.0, -xxhash==3.4.1 -#text_encoder -sentence-transformers>=3.0.1 -#html2parquet -trafilatura==1.12.0 -#tokenization -transformers==4.38.2 - -#ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts. -#data-prep-toolkit-transforms 0.2.2.dev0 requires duckdb==0.10.1, but you have duckdb 1.1.0 which is incompatible. -#data-prep-toolkit-transforms 0.2.2.dev0 requires sentence-transformers==3.0.1, but you have sentence-transformers 3.1.1 which is incompatible. - - - - - - diff --git a/transforms/packaging/ray/Makefile b/transforms/packaging/ray/Makefile deleted file mode 100644 index 0a1d6d911..000000000 --- a/transforms/packaging/ray/Makefile +++ /dev/null @@ -1,66 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../.. -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
- -# $(REPOROOT)/.make.versions file contains the versions - -include $(REPOROOT)/transforms/.make.transforms -include ../.make.packaging - -PACKAGING_RUN_TIME=ray - -# Excluded from build -# ./code/malware/ray - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(DPK_TRANSFORMS_VERSION) TOML_VERSION=$(DPK_TRANSFORMS_VERSION) .transforms.set-versions - - -## Ray Transforms: `find . -name src | grep ray/src` -TRANSFORMS_NAMES = code/proglang_select \ - code/header_cleanser \ - code/code_quality \ - code/repo_level_ordering \ - code/code2parquet \ - language/doc_chunk \ - language/doc_quality \ - language/lang_id \ - language/text_encoder \ - language/pii_redactor \ - language/pdf2parquet \ - universal/fdedup \ - universal/tokenization \ - universal/ededup \ - universal/profiler \ - universal/doc_id \ - universal/filter \ - universal/resize - -# doc chunk has conflict dependencies with pdf2parquet that need to be resolved -# doc_chunk depends on docling>=1.8.2,<2.0.0 -# pdf2parquet depends on docling==1.7.0 - - -test-src:: - $(MAKE) src - $(MAKE) -C ../python src - make .transforms.ray-venv - $(MAKE) run-ut - @# Help: Do any default transform setup before running make src and setting up a test environment - -test-with-python-pypi: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate && cd ../ray && $(MAKE) src && $(PYTHON) -m pip install . 
- $(MAKE) test-src - -test-with-pypi: - $(MAKE) clean - $(MAKE) .defaults.create-venv - source venv/bin/activate; \ - $(PYTHON) -m pip install data_prep_toolkit_transforms_ray==$(DPK_TRANSFORMS_VERSION) - $(MAKE) test-src - diff --git a/transforms/packaging/ray/README.md b/transforms/packaging/ray/README.md deleted file mode 100644 index b7d4cf2eb..000000000 --- a/transforms/packaging/ray/README.md +++ /dev/null @@ -1,41 +0,0 @@ -# DPK Ray Transforms - -## installation - -The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard pyton library available on pypi and can be installed using pip install: - -`python -m pip install data-prep-toolkit-transforms-ray` - -installing the Ray transforms will also install `data_prep_toolkit_transforms` and `data-prep-toolkit-ray` - -## List of Ray Transforms availabe in current package - -Note: This list includes the transforms that are part of the current release for 0.2.1.dev3 and will be maintained on best effort but may may not be always up to date. 
users are encourage to raise an issue in git when they discover missing components - -* code - * [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) - * [proglang_select](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/proglang_select/ray/README.md) - * [header_cleanser (Not available on MacOS)](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) - * [code_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code_quality/ray/README.md) - * [repo_level_ordering](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/repo_level_ordering/ray/README.md) -* language - * [doc_quality](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_quality/ray/README.md) - * [doc_chunk](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_chunk/ray/README.md) - * [lang_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/lang_id/ray/README.md) - * [text_encoder](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/text_encoder/ray/README.md) - * [pdf2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pdf2parquet/ray/README.md) - * [pii_redactor](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/pii_redactor/ray/README.md) -* universal - * [fdedup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/fdedup/ray/README.md) - * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/tokenization/ray/README.md) - * [ededup](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/ededup/ray/README.md) - * [profiler](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/profiler/ray/README.md) - * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/doc_id/ray/README.md) - * [filter](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/filter/ray/README.md) - * 
[resize](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/resize/ray/README.md) - - - - - - diff --git a/transforms/packaging/ray/pyproject.toml b/transforms/packaging/ray/pyproject.toml deleted file mode 100644 index 2f02d4c51..000000000 --- a/transforms/packaging/ray/pyproject.toml +++ /dev/null @@ -1,40 +0,0 @@ -[project] -name = "data_prep_toolkit_transforms_ray" -version = "0.2.2.dev0" -requires-python = ">=3.10,<3.13" -keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -description = "Data Preparation Toolkit Transforms using Ray" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Maroun Touma", email = "touma@us.ibm.com" }, -] -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - - -[options] -package_dir = ["src"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] - - - - - - diff --git a/transforms/packaging/ray/requirements.txt b/transforms/packaging/ray/requirements.txt deleted file mode 100644 index 632bbe670..000000000 --- a/transforms/packaging/ray/requirements.txt +++ /dev/null @@ -1,21 +0,0 @@ -data-prep-toolkit-ray>=0.2.2.dev0 -data-prep-toolkit-transforms>=0.2.2.dev0 -scancode-toolkit==32.1.0 ; platform_system != 'Darwin' -parameterized -tqdm==4.66.3 -mmh3==4.1.0 -xxhash==3.4.1 -tqdm==4.66.3 -#The conflict is caused by: -# ray fdedup depends on scipy==1.12.0 -# docling 1.7.0 depends on scipy<2.0.0 and >=1.14.1 -scipy>=1.12.0 
-networkx==3.3 -colorlog==6.8.2 -func-timeout==4.3.5 -pandas==2.2.2 -emerge-viz==2.0.0 - - - - diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml new file mode 100644 index 000000000..48ca02703 --- /dev/null +++ b/transforms/pyproject.toml @@ -0,0 +1,97 @@ +[project] +name = "data_prep_toolkit_transforms" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" +keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] +description = "Data Preparation Toolkit Transforms using Ray" +license = {text = "Apache-2.0"} +readme = {file = "README-list.md", content-type = "text/markdown"} +authors = [ + { name = "Maroun Touma", email = "touma@us.ibm.com" }, +] +dynamic = ["dependencies","optional-dependencies"] + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + + +[tool.setuptools.dynamic.dependencies] +file = ["requirements.txt"] + +[tool.setuptools.dynamic.optional-dependencies] +dev = { file = ["requirements-dev.txt"]} +ray = { file = ["requirements-ray.txt"]} +all = { file = [ +"code/proglang_select/python/requirements.txt", +"code/header_cleanser/python/requirements.txt", +"code/license_select/python/requirements.txt", +"code/code_quality/python/requirements.txt", +"code/code2parquet/python/requirements.txt", + +"language/doc_quality/python/requirements.txt", +"language/doc_chunk/python/requirements.txt", +##### Cannot include html2parquet until this dependency conflict is resolved: +## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 +## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" +## "language/html2parquet/python/requirements.txt", +##### pii_redactor is excluded because its unit tests seem to be failing +## "language/pii_redactor/python/requirements.txt", +"language/lang_id/python/requirements.txt", +"language/text_encoder/python/requirements.txt", +"language/pdf2parquet/python/requirements.txt", + 
+"universal/hap/python/requirements.txt", +"universal/tokenization/python/requirements.txt", +"universal/ededup/python/requirements.txt", +"universal/profiler/python/requirements.txt", +"universal/doc_id/python/requirements.txt", +"universal/filter/python/requirements.txt", +"universal/resize/python/requirements.txt" +]} + +# pyproject.toml must be in a parent and cannot be in sibling +# i.e. Cannot access '../code/proglang_select/python/.. + +proglang_select = { file = ["code/proglang_select/python/requirements.txt"]} +header_cleanser = {file = ["code/header_cleanser/python/requirements.txt"]} +license_select = { file = ["code/license_select/python/requirements.txt"]} +code_quality = { file = ["code/code_quality/python/requirements.txt"]} +code2parquet = {file = ["code/code2parquet/python/requirements.txt"]} + +doc_quality = { file = ["language/doc_quality/python/requirements.txt"]} +doc_chunk = { file = ["language/doc_chunk/python/requirements.txt"]} +html2parquet = { file = ["language/html2parquet/python/requirements.txt"]} +pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]} +lang_id = { file = ["language/lang_id/python/requirements.txt"]} +text_encoder = { file = ["language/text_encoder/python/requirements.txt"]} +pdf2parquet = { file = ["language/pdf2parquet/python/requirements.txt"]} + +hap = { file = ["universal/hap/python/requirements.txt"]} +tokenization = { file = ["universal/tokenization/python/requirements.txt"]} +ededup = { file = ["universal/ededup/python/requirements.txt"]} +profiler = { file = ["universal/profiler/python/requirements.txt"]} +doc_id = { file = ["universal/doc_id/python/requirements.txt"]} +filter = { file = ["universal/filter/python/requirements.txt"]} +resize = { file = ["universal/resize/python/requirements.txt"]} + +# Does not seem to work for our custom layout +# copy all files to a single src and let automatic discovery find them + +[options] +package_dir = ["src","test"] + +[options.packages.find] +where 
= ["src"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + + + + + + + diff --git a/transforms/requirements-ray.txt b/transforms/requirements-ray.txt new file mode 100644 index 000000000..00e6a157f --- /dev/null +++ b/transforms/requirements-ray.txt @@ -0,0 +1,9 @@ +data-prep-toolkit[ray]>=0.2.2.dev1 +networkx==3.3 +colorlog==6.8.2 +func-timeout==4.3.5 +emerge-viz==2.0.0 + + + + diff --git a/transforms/requirements.txt b/transforms/requirements.txt new file mode 100644 index 000000000..93631f7d4 --- /dev/null +++ b/transforms/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit>=0.2.2.dev1 \ No newline at end of file diff --git a/transforms/transform.config b/transforms/transform.config new file mode 100644 index 000000000..afe747c21 --- /dev/null +++ b/transforms/transform.config @@ -0,0 +1,17 @@ +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=data-prep-kit-transforms + +################################################################################ +# This defines the transforms' package version number as would be used +# when publishing the wheel. In general, only the micro version +# number should be advanced relative to the DPK_VERSION. +# +# If you change the version numbers, be sure to run "make set-versions" to +# update version numbers across the transform (e.g., pyproject.toml). 
+TRANSFORMS_PKG_VERSION=$(DPK_VERSION) diff --git a/transforms/universal/doc_id/python/Dockerfile b/transforms/universal/doc_id/python/Dockerfile index 16a9c0e66..6f478cb33 100644 --- a/transforms/universal/doc_id/python/Dockerfile +++ b/transforms/universal/doc_id/python/Dockerfile @@ -18,7 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md - +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . # copy source data diff --git a/transforms/universal/doc_id/python/pyproject.toml b/transforms/universal/doc_id/python/pyproject.toml index 46d3f79f8..ad4fba0ab 100644 --- a/transforms/universal/doc_id/python/pyproject.toml +++ b/transforms/universal/doc_id/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} @@ -9,14 +9,15 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0" -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/doc_id/python/requirements.txt b/transforms/universal/doc_id/python/requirements.txt new file mode 100644 index 000000000..82723b6ef --- /dev/null +++ b/transforms/universal/doc_id/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev1 \ No newline at end of file diff --git a/transforms/universal/doc_id/ray/pyproject.toml b/transforms/universal/doc_id/ray/pyproject.toml index 
836454098..03530dff2 100644 --- a/transforms/universal/doc_id/ray/pyproject.toml +++ b/transforms/universal/doc_id/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "docid Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk_doc_id_transform_python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0" + "dpk_doc_id_transform_python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1" ] [build-system] diff --git a/transforms/universal/doc_id/spark/pyproject.toml b/transforms/universal/doc_id/spark/pyproject.toml index 485174834..636bbf26e 100644 --- a/transforms/universal/doc_id/spark/pyproject.toml +++ b/transforms/universal/doc_id/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_doc_id_transform_spark" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Doc ID Spark Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-spark==0.2.2.dev0", + "data-prep-toolkit-spark==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/ededup/python/Dockerfile b/transforms/universal/ededup/python/Dockerfile index d3d47e7a4..df9f3ce64 100644 --- a/transforms/universal/ededup/python/Dockerfile +++ b/transforms/universal/ededup/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
diff --git a/transforms/universal/ededup/python/pyproject.toml b/transforms/universal/ededup/python/pyproject.toml index 59d0d72ee..21bfdad41 100644 --- a/transforms/universal/ededup/python/pyproject.toml +++ b/transforms/universal/ededup/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "ededup Python Transform" license = {text = "Apache-2.0"} @@ -9,16 +9,16 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", -] + +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/ededup/python/requirements.txt b/transforms/universal/ededup/python/requirements.txt new file mode 100644 index 000000000..84b4ac832 --- /dev/null +++ b/transforms/universal/ededup/python/requirements.txt @@ -0,0 +1,3 @@ +data-prep-toolkit==0.2.2.dev1 +mmh3>=4.1.0 +xxhash==3.4.1 diff --git a/transforms/universal/ededup/ray/pyproject.toml b/transforms/universal/ededup/ray/pyproject.toml index 886832947..84a892180 100644 --- a/transforms/universal/ededup/ray/pyproject.toml +++ b/transforms/universal/ededup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_ededup_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "ededup Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.2.dev0", - "dpk_ededup_transform_python==0.2.2.dev0", + 
"data-prep-toolkit-ray==0.2.2.dev1", + "dpk_ededup_transform_python==0.2.2.dev1", "tqdm==4.66.3", ] diff --git a/transforms/universal/fdedup/ray/pyproject.toml b/transforms/universal/fdedup/ray/pyproject.toml index 3f2c8ba51..54fd83a00 100644 --- a/transforms/universal/fdedup/ray/pyproject.toml +++ b/transforms/universal/fdedup/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_fdedup_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "fdedup Ray Transform" license = {text = "Apache-2.0"} @@ -10,11 +10,11 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.2.dev0", - "mmh3==4.1.0", + "data-prep-toolkit-ray==0.2.2.dev1", + "mmh3>=4.1.0", "xxhash==3.4.1", "tqdm==4.66.3", - "scipy==1.12.0" + "scipy>=1.12.0, <2.0.0" ] [build-system] diff --git a/transforms/universal/filter/python/Dockerfile b/transforms/universal/filter/python/Dockerfile index 6f60d2813..5df52a36e 100644 --- a/transforms/universal/filter/python/Dockerfile +++ b/transforms/universal/filter/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy the main() entry point to the image diff --git a/transforms/universal/filter/python/pyproject.toml b/transforms/universal/filter/python/pyproject.toml index b9d781573..b93a601e1 100644 --- a/transforms/universal/filter/python/pyproject.toml +++ b/transforms/universal/filter/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Filter Transform for Python" license = {text = "Apache-2.0"} @@ -8,15 +8,16 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "duckdb==0.10.1", -] + +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/filter/python/requirements.txt b/transforms/universal/filter/python/requirements.txt new file mode 100644 index 000000000..56be59c0a --- /dev/null +++ b/transforms/universal/filter/python/requirements.txt @@ -0,0 +1,3 @@ + +data-prep-toolkit==0.2.2.dev1 +duckdb>=0.10.1 diff --git a/transforms/universal/filter/ray/pyproject.toml b/transforms/universal/filter/ray/pyproject.toml index 5c63a90ff..9d2f84325 100644 --- a/transforms/universal/filter/ray/pyproject.toml +++ b/transforms/universal/filter/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Filter Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Constantin Adam", email = "cmadam@us.ibm.com" }, ] dependencies = [ - "dpk-filter-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + 
"dpk-filter-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/filter/spark/pyproject.toml b/transforms/universal/filter/spark/pyproject.toml index a8a0174b6..54a49893e 100644 --- a/transforms/universal/filter/spark/pyproject.toml +++ b/transforms/universal/filter/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_filter_transform_spark" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Filter Spark Transform" license = {text = "Apache-2.0"} @@ -9,7 +9,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-spark==0.2.2.dev0", + "data-prep-toolkit-spark==0.2.2.dev1", ] [project.optional-dependencies] diff --git a/transforms/universal/hap/python/output/metadata.json b/transforms/universal/hap/python/output/metadata.json deleted file mode 100644 index 062fee162..000000000 --- a/transforms/universal/hap/python/output/metadata.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "hap", - "job type": "pure python", - "job id": "job_id", - "start_time": "2024-10-03 21:38:20", - "end_time": "2024-10-03 21:38:29", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "model_name_or_path": "ibm-granite/granite-guardian-hap-38m", - "annotation_column": "hap_score", - "doc_text_column": "contents", - "inference_engine": "CPU", - "max_length": 512, - "batch_size": 128, - "checkpointing": false, - "max_files": -1, - "random_samples": -1, - "files_to_use": [ - ".parquet" - ], - "num_processors": 0 - }, - "job_output_stats": { - "source_files": 2, - "source_size": 12124594, - "transform execution exception": 1, - "result_files": 1, - "result_size": 79822, - "processing_time": 6.932, - "source_doc_count": 50, - "result_doc_count": 50 - }, 
- "source": { - "name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/test-data/input", - "type": "path" - }, - "target": { - "name": "/Users/ian/Desktop/data-prep-kit/transforms/universal/hap/python/output", - "type": "path" - } -} \ No newline at end of file diff --git a/transforms/universal/hap/python/output/test1.parquet b/transforms/universal/hap/python/output/test1.parquet deleted file mode 100644 index c9483e34d..000000000 Binary files a/transforms/universal/hap/python/output/test1.parquet and /dev/null differ diff --git a/transforms/universal/hap/python/pyproject.toml b/transforms/universal/hap/python/pyproject.toml index 17f5346d2..fd775091e 100644 --- a/transforms/universal/hap/python/pyproject.toml +++ b/transforms/universal/hap/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_hap_transform_python" -version = "0.2.2.dev0" -requires-python = ">=3.10" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" description = "HAP Python Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} diff --git a/transforms/universal/hap/python/requirements.txt b/transforms/universal/hap/python/requirements.txt index 5062a733c..efdb8662b 100644 --- a/transforms/universal/hap/python/requirements.txt +++ b/transforms/universal/hap/python/requirements.txt @@ -1,4 +1,4 @@ -data-prep-toolkit==0.2.2.dev0 +data-prep-toolkit==0.2.2.dev1 nltk==3.9.1 transformers==4.38.2 torch==2.4.1 diff --git a/transforms/universal/hap/ray/pyproject.toml b/transforms/universal/hap/ray/pyproject.toml index ff3fc05f0..412df9413 100644 --- a/transforms/universal/hap/ray/pyproject.toml +++ b/transforms/universal/hap/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_hap_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10" description = "HAP Ray Transform" license = {text = "Apache-2.0"} diff --git a/transforms/universal/hap/ray/requirements.txt 
b/transforms/universal/hap/ray/requirements.txt index 36c2b81af..6b7f46c5f 100644 --- a/transforms/universal/hap/ray/requirements.txt +++ b/transforms/universal/hap/ray/requirements.txt @@ -1,5 +1,5 @@ -data-prep-toolkit-ray==0.2.2.dev0 -dpk-hap-transform-python==0.2.2.dev0 +data-prep-toolkit-ray==0.2.2.dev1 +dpk-hap-transform-python==0.2.2.dev1 nltk==3.9.1 transformers==4.38.2 torch==2.4.1 diff --git a/transforms/universal/noop/python/pyproject.toml b/transforms/universal/noop/python/pyproject.toml index 9b1675a69..81a46383c 100644 --- a/transforms/universal/noop/python/pyproject.toml +++ b/transforms/universal/noop/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "NOOP Python Transform" license = {text = "Apache-2.0"} @@ -10,7 +10,7 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit==0.2.2.dev0", + "data-prep-toolkit==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/noop/ray/pyproject.toml b/transforms/universal/noop/ray/pyproject.toml index c4120753f..c73f5c67a 100644 --- a/transforms/universal/noop/ray/pyproject.toml +++ b/transforms/universal/noop/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "NOOP Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-noop-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/noop/spark/pyproject.toml b/transforms/universal/noop/spark/pyproject.toml index 633ee66bd..5068ffa2f 100644 --- a/transforms/universal/noop/spark/pyproject.toml +++ 
b/transforms/universal/noop/spark/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_noop_transform_spark" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "NOOP Spark Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-noop-transform-python==0.2.2.dev0", - "data-prep-toolkit-spark==0.2.2.dev0", + "dpk-noop-transform-python==0.2.2.dev1", + "data-prep-toolkit-spark==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/profiler/python/Dockerfile b/transforms/universal/profiler/python/Dockerfile index a744fc9cd..9aa921f5e 100644 --- a/transforms/universal/profiler/python/Dockerfile +++ b/transforms/universal/profiler/python/Dockerfile @@ -18,6 +18,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml COPY --chown=dpk:root README.md README.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
diff --git a/transforms/universal/profiler/python/pyproject.toml b/transforms/universal/profiler/python/pyproject.toml index 4bc90209f..e1e36f80a 100644 --- a/transforms/universal/profiler/python/pyproject.toml +++ b/transforms/universal/profiler/python/pyproject.toml @@ -1,23 +1,22 @@ [project] name = "dpk_profiler_transform_python" -version = "0.2.2.dev0" -requires-python = ">=3.10" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" description = "profiler Python Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "mmh3==4.1.0", - "xxhash==3.4.1", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/profiler/python/requirements.txt b/transforms/universal/profiler/python/requirements.txt new file mode 100644 index 000000000..638e1b7b5 --- /dev/null +++ b/transforms/universal/profiler/python/requirements.txt @@ -0,0 +1,5 @@ + +data-prep-toolkit==0.2.2.dev1 +mmh3==4.1.0 +xxhash==3.4.1 + diff --git a/transforms/universal/profiler/ray/pyproject.toml b/transforms/universal/profiler/ray/pyproject.toml index bacba9abb..0b3ef4b55 100644 --- a/transforms/universal/profiler/ray/pyproject.toml +++ b/transforms/universal/profiler/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_profiler_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "profiler Ray Transform" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "data-prep-toolkit-ray==0.2.2.dev0", - 
"dpk_profiler_transform_python==0.2.2.dev0", + "data-prep-toolkit-ray==0.2.2.dev1", + "dpk_profiler_transform_python==0.2.2.dev1", "tqdm==4.66.3", ] diff --git a/transforms/universal/profiler/spark/pyproject.toml b/transforms/universal/profiler/spark/pyproject.toml index 9cb3106bd..34003b539 100644 --- a/transforms/universal/profiler/spark/pyproject.toml +++ b/transforms/universal/profiler/spark/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_profiler_transform_spark" -version = "0.2.2.dev0" -requires-python = ">=3.10" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" description = "Profiler Spark Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -9,8 +9,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-profiler-transform-python==0.2.2.dev0", - "data-prep-toolkit-spark==0.2.2.dev0", + "dpk-profiler-transform-python==0.2.2.dev1", + "data-prep-toolkit-spark==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/resize/python/Dockerfile b/transforms/universal/resize/python/Dockerfile index 303e67840..9caa3565c 100644 --- a/transforms/universal/resize/python/Dockerfile +++ b/transforms/universal/resize/python/Dockerfile @@ -19,6 +19,7 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:users src/ src/ COPY --chown=dpk:users pyproject.toml pyproject.toml COPY --chown=dpk:users README.md Readme.md +COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -e . 
# copy the main() entry point to the image diff --git a/transforms/universal/resize/python/pyproject.toml b/transforms/universal/resize/python/pyproject.toml index 2396e5b23..f393b5b0e 100644 --- a/transforms/universal/resize/python/pyproject.toml +++ b/transforms/universal/resize/python/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_python" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "resize Python Transform" license = {text = "Apache-2.0"} @@ -9,14 +9,15 @@ authors = [ { name = "David Wood", email = "dawood@us.ibm.com" }, { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", -] +dynamic = ["dependencies"] [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/resize/python/requirements.txt b/transforms/universal/resize/python/requirements.txt new file mode 100644 index 000000000..82723b6ef --- /dev/null +++ b/transforms/universal/resize/python/requirements.txt @@ -0,0 +1 @@ +data-prep-toolkit==0.2.2.dev1 diff --git a/transforms/universal/resize/ray/pyproject.toml b/transforms/universal/resize/ray/pyproject.toml index 249e40c7d..38043bb7e 100644 --- a/transforms/universal/resize/ray/pyproject.toml +++ b/transforms/universal/resize/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_resize_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Resize Ray Transform" license = {text = "Apache-2.0"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsky@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + 
"dpk-resize-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/resize/spark/pyproject.toml b/transforms/universal/resize/spark/pyproject.toml index 77687ca7e..6b6d0f50b 100644 --- a/transforms/universal/resize/spark/pyproject.toml +++ b/transforms/universal/resize/spark/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_resize_transform_spark" -version = "0.2.2.dev0" -requires-python = ">=3.10" +version = "0.2.2.dev1" +requires-python = ">=3.10,<3.13" description = "Resize Spark Transform" license = {text = "Apache-2.0"} readme = {file = "README.md", content-type = "text/markdown"} @@ -10,8 +10,8 @@ authors = [ { name = "Boris Lublinsky", email = "blublinsk@ibm.com" }, ] dependencies = [ - "dpk-resize-transform-python==0.2.2.dev0", - "data-prep-toolkit-spark==0.2.2.dev0", + "dpk-resize-transform-python==0.2.2.dev1", + "data-prep-toolkit-spark==0.2.2.dev1", ] [build-system] diff --git a/transforms/universal/tokenization/python/Dockerfile b/transforms/universal/tokenization/python/Dockerfile index a1fd159c7..e1eea7e40 100644 --- a/transforms/universal/tokenization/python/Dockerfile +++ b/transforms/universal/tokenization/python/Dockerfile @@ -19,11 +19,10 @@ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=dpk:root src/ src/ COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -e . -#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt - # copy the main() entry point to the image COPY ./src/tokenization_transform_python.py . 
diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml index f69787b3d..51e3cbff9 100644 --- a/transforms/universal/tokenization/python/pyproject.toml +++ b/transforms/universal/tokenization/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "dpk_tokenization_transform_python" keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Python" license = {text = "Apache-2.0"} @@ -9,11 +9,8 @@ readme = {file = "README.md", content-type = "text/markdown"} authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] -dependencies = [ - "data-prep-toolkit==0.2.2.dev0", - "transformers==4.38.2", -] - +# Dependencies are loaded from requirements.txt; this key must stay inside [project]. +dynamic = ["dependencies"] [project_urls] Repository = "https://github.com/IBM/data-prep-kit" @@ -21,10 +18,13 @@ Issues = "https://github.com/IBM/data-prep-kit/issues" Documentation = "https://ibm.github.io/data-prep-kit/" "Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/tokenization" [build-system] requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] build-backend = "setuptools.build_meta" +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + [project.optional-dependencies] dev = [ "twine", diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/python/requirements.txt new file mode 100644 index 000000000..d64bcef48 --- /dev/null +++ b/transforms/universal/tokenization/python/requirements.txt @@ -0,0 +1,2 @@ +data-prep-toolkit==0.2.2.dev1 +transformers==4.38.2 diff --git a/transforms/universal/tokenization/ray/Dockerfile b/transforms/universal/tokenization/ray/Dockerfile index 0199e23b8..8b7e78c27 100644 --- a/transforms/universal/tokenization/ray/Dockerfile 
+++ b/transforms/universal/tokenization/ray/Dockerfile @@ -13,11 +13,9 @@ COPY --chown=ray:users data-processing-lib-python/ data-processing-lib-python/ RUN cd data-processing-lib-python && pip install --no-cache-dir -e . COPY --chown=ray:users data-processing-lib-ray/ data-processing-lib-ray/ RUN cd data-processing-lib-ray && pip install --no-cache-dir -e . -COPY --chown=ray:users python-transform/ python-transform -RUN cd python-transform && pip install --no-cache-dir -e . +COPY --chown=ray:users python-transform/ python-transform +RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e . -#COPY requirements.txt requirements.txt -#RUN pip install --no-cache-dir -r requirements.txt COPY --chown=ray:users src/ src/ COPY --chown=ray:users pyproject.toml pyproject.toml diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml index aa109bbc1..a1ef73dd8 100644 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ b/transforms/universal/tokenization/ray/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dpk_tokenization_transform_ray" -version = "0.2.2.dev0" +version = "0.2.2.dev1" requires-python = ">=3.10,<3.13" description = "Tokenization Transform for Ray" license = {text = "Apache-2.0"} @@ -9,8 +9,8 @@ authors = [ { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, ] dependencies = [ - "dpk-tokenization-transform-python==0.2.2.dev0", - "data-prep-toolkit-ray==0.2.2.dev0", + "dpk-tokenization-transform-python==0.2.2.dev1", + "data-prep-toolkit-ray==0.2.2.dev1", ] [build-system]