diff --git a/.make.defaults b/.make.defaults
index 51eb984ee..80df91c8e 100644
--- a/.make.defaults
+++ b/.make.defaults
@@ -209,7 +209,7 @@ __check_defined = \
 # We create both local and remote tags. Local seems to be needed when using our spark
 # base image. Remote seems to be needed by kfp.
 .PHONY: .defaults.image
-.defaults.image:: # Must be called with a DOCKER_IMAGE= settings.
+.defaults.image:: # Must be called with a DOCKER_IMAGE_NAME= setting.
 	@# Help: Create the docker image $(DOCKER_LOCAL_IMAGE) and a tag for $(DOCKER_REMOTE_IMAGE)
 	$(call check_defined, DOCKER_IMAGE_NAME)
 	# The following touch seems to be needed to work around a docker build problem in which
@@ -222,14 +222,15 @@ __check_defined = \
 	if [ -e pyproject.toml ]; then \
 		touch pyproject.toml; \
 	fi
-	$(DOCKER) build -t $(DOCKER_LOCAL_IMAGE) $(DOCKER_BUILD_EXTRA_ARGS) \
+	$(DOCKER) build -f $(DOCKER_FILE) -t $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(DOCKER_BUILD_EXTRA_ARGS) \
 		--platform $(DOCKER_PLATFORM) \
 		--build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \
 		--build-arg BASE_IMAGE=$(BASE_IMAGE) \
 		--build-arg DPK_WHEEL_FILE_NAME=$(DPK_WHEEL_FILE_NAME) \
 		--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
 		--build-arg GIT_COMMIT=$(shell git log -1 --format=%h) .
-	$(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE)
+	$(DOCKER) tag $(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) $(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
+
 
 # Copy a source tree in LIB_PATH, including src, pyproject.toml to LIB_NAME
 # Generally used to copy source from within the repo into a local directory for use by a Dockerfile
@@ -244,24 +245,25 @@ __check_defined = \
 		cp -p ${LIB_PATH}/requirements.txt ${LIB_NAME}; \
 	fi
 
-
-# Build an image using the local Dockerfile and make the data-processing-lib/python
-# available in the current directory for use by the Dockerfile (i.e. to install the library).
-#.PHONY: .defaults.python-lib-src-image
-#.defaults.python-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
-#	@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-lib/python source
-#ifeq ($(USE_REPO_LIB_SRC), 1)
-#	$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
-#endif
-#	$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
-#	-rm -rf data-processing-lib-python
-
 .PHONY: .default.build-lib-wheel
 .default.build-lib-wheel:
-	make -C $(REPOROOT)/data-processing-lib build-pkg-dist
+	$(MAKE) -C $(REPOROOT)/data-processing-lib build-pkg-dist
 	rm -rf data-processing-dist && mkdir data-processing-dist
 	cp $(REPOROOT)/data-processing-lib/dist/*.whl data-processing-dist
 
+
+# Build an image using the local Dockerfile.
+# Assumes the wheel has already been created.
+.PHONY: .defaults.lib-whl-image
+.defaults.lib-whl-image::
+	# Must be called with a DOCKER_IMAGE_NAME= setting.
+	@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the wheel file for the library
+	@$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl))
+	$(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE)))
+	$(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
+
+
 # Build an image using the local Dockerfile and make the wheel for data-processing-lib
 # available in the current directory for use by the Dockerfile (i.e. to install the library).
 .PHONY: .defaults.python-lib-whl-image
@@ -270,28 +272,9 @@ __check_defined = \
 	@# Help: Build the Python $(DOCKER_LOCAL_IMAGE) using the wheel file for the library
 	@$(eval LIB_WHEEL_FILE := $(shell find data-processing-dist/*.whl))
 	$(eval LIB_WHEEL_FILE := $(shell basename $(LIB_WHEEL_FILE)))
-	$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
+	$(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
 	-rm -rf data-processing-dist
 
-# Build an image using the local Dockerfile and make the data-processing-lib/ray
-# available in the current directory for use by the Dockerfile (i.e. to install the library).
-# Note that this looks for the ../python directory, which is currently only used in the transform projects,
-# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
-#.PHONY: .defaults.ray-lib-src-image
-#.defaults.ray-lib-src-image:: # Must be called with a DOCKER_LOCAL_IMAGE= settings.
-#	@# Help: Build the Ray $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
-#ifeq ($(USE_REPO_LIB_SRC), 1)
-#	$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
-#	$(MAKE) LIB_PATH=$(DPK_RAY_LIB_DIR) LIB_NAME=data-processing-lib-ray .defaults.copy-lib
-#endif
-#	if [ -e ../python ]; then \
-#		$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
-#	fi
-#	$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) .defaults.image
-#	-rm -rf data-processing-lib-python
-#	-rm -rf data-processing-lib-ray
-#	-rm -rf python-transform
-
 # Build an image using the local Dockerfile and make the data-processing wheel
 # available in the current directory for use by the Dockerfile (i.e. to install the library).
@@ -306,7 +289,7 @@ __check_defined = \
 	if [ -e ../python ]; then \
 		$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
 	fi
-	$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
+	$(MAKE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
 	-rm -rf python-transform
 	-rm -rf data-processing-dist
 
@@ -316,24 +299,6 @@ __check_defined = \
 .defaults.spark-lib-base-image:
 	$(MAKE) -C $(DPK_SPARK_LIB_DIR) image
 
-# Note that this looks for the ../python directory, which is currently only used in the transform projects,
-# but we add it here as a convenience to avoid duplicating a lot of this in transforms/.make.transforms.
-# Must be called with a DOCKER_LOCAL_IMAGE= settings.
-#.PHONY: .defaults.spark-lib-src-image
-#.defaults.spark-lib-src-image:: .defaults.spark-lib-base-image
-#	@# Help: Build the Spark $(DOCKER_LOCAL_IMAGE) using the $(DOCKER_FILE), requirements.txt and data-processing-libs source
-#	$(MAKE) IMAGE_NAME_TO_VERIFY=$(DOCKER_SPARK_BASE_IMAGE_NAME) .defaults.verify-image-availability
-#ifeq ($(USE_REPO_LIB_SRC), 1)
-#	$(MAKE) LIB_PATH=$(DPK_PYTHON_LIB_DIR) LIB_NAME=data-processing-lib-python .defaults.copy-lib
-#	$(MAKE) LIB_PATH=$(DPK_SPARK_LIB_DIR) LIB_NAME=data-processing-lib-spark .defaults.copy-lib
-#endif
-#	if [ -e ../python ]; then \
-#		$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
-#	fi
-#	$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) .defaults.image
-#	-rm -rf data-processing-lib-python
-#	-rm -rf data-processing-lib-spark
-#	-rm -rf python-transform
 
 .PHONY: .defaults.spark-lib-whl-image
 .defaults.spark-lib-whl-image:: .default.build-lib-wheel .defaults.spark-lib-base-image
@@ -345,7 +310,7 @@ __check_defined = \
 	if [ -e ../python ]; then \
 		$(MAKE) LIB_PATH=../python LIB_NAME=python-transform .defaults.copy-lib; \
 	fi
-	$(MAKE) DOCKER_IMAGE=$(DOCKER_LOCAL_IMAGE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
+	$(MAKE) BASE_IMAGE=$(DOCKER_SPARK_BASE_IMAGE) DPK_WHEEL_FILE_NAME=$(LIB_WHEEL_FILE) .defaults.image
 	-rm -rf python-transform
 	-rm -rf data-processing-dist
 
diff --git a/transforms/.make.cicd.targets b/transforms/.make.cicd.targets
index 69a5f54fd..e392e8f36 100644
--- a/transforms/.make.cicd.targets
+++ b/transforms/.make.cicd.targets
@@ -7,10 +7,15 @@ include $(REPOROOT)/transforms/.make.transforms
 ######################################################################
 
-## Default setting for TRANSFORM_RUNTIME uses folder name-- Old layout
-TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).transform
-TRANSFORM_RAY_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).ray.transform
-TRANSFORM_PYTHON_RUNTIME_SRC_FILE=-m dpk_$(TRANSFORM_NAME).spark.transform
+## Default setting for TRANSFORM_RUNTIME entry point:
+#     python -m dpk_html2parquet.ray.transform --help
+# or
+#     python -m dpk_html2parquet.transform_python --help
+#
+TRANSFORM_PYTHON_SRC?="-m dpk_$(TRANSFORM_NAME).transform_python"
+TRANSFORM_RAY_SRC?="-m dpk_$(TRANSFORM_NAME).ray.transform"
+TRANSFORM_SPARK_SRC?="-m dpk_$(TRANSFORM_NAME).spark.transform"
+
 
 venv:: .defaults.create-venv
 	source venv/bin/activate && $(PIP) install -e $(REPOROOT)/data-processing-lib[ray,spark]
@@ -19,7 +24,6 @@ venv:: .defaults.create-venv
 		source venv/bin/activate && $(PIP) install -r requirements.txt; \
 	fi;
 
-
 test:: .transforms.test-src test-image
 
 clean:: .transforms.clean
 
@@ -28,62 +32,113 @@ clean:: .transforms.clean
 set-versions::
 	## We need to think how we want to do this going forward
 
-build::
-image::
-	@if [ -e Dockerfile ]; then \
-		$(MAKE) image-default ; \
-	else \
-		echo "Skipping image for $(shell pwd) since no Dockerfile is present"; \
+build:: image
+
+publish:
+	@if [ -e Dockerfile.python ]; then \
+		$(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-python:$(DOCKER_IMAGE_VERSION) \
+			.defaults.publish-image ; \
+	fi
+	@if [ -e Dockerfile.ray ]; then \
+		$(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-ray:$(DOCKER_IMAGE_VERSION) \
+			.defaults.publish-image ; \
+	fi
+	@if [ -e Dockerfile.spark ]; then \
+		$(MAKE) DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(TRANSFORM_NAME)-spark:$(DOCKER_IMAGE_VERSION) \
+			.defaults.publish-image ; \
+	fi
+
+test-image-sequence:: .defaults.lib-whl-image .transforms.test-image-help .transforms.clean
+
+test-image:: .default.build-lib-wheel
+	@if [ -e Dockerfile.python ]; then \
+		$(MAKE) DOCKER_FILE=Dockerfile.python \
+			TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_PYTHON_SRC) \
+			DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
+			test-image-sequence ; \
+	fi
+	@if [ -e Dockerfile.ray ]; then \
+		$(MAKE) DOCKER_FILE=Dockerfile.ray \
+			TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_RAY_SRC) \
+			DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
+			BASE_IMAGE=$(RAY_BASE_IMAGE) \
+			test-image-sequence ; \
 	fi
+	@if [ -e Dockerfile.spark ]; then \
+		$(MAKE) DOCKER_FILE=Dockerfile.spark \
+			TRANSFORM_RUNTIME_SRC_FILE=$(TRANSFORM_SPARK_SRC) \
+			DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
+			BASE_IMAGE=$(SPARK_BASE_IMAGE) \
+			test-image-sequence ; \
+	fi
+	-rm -rf data-processing-dist
+
 
-publish::
-	@if [ -e Dockerfile ]; then \
-		$(MAKE) publish-default ; \
-	else \
-		echo "Skipping publish for $(shell pwd) since no Dockerfile is present"; \
+image-python:
+	@if [ -e Dockerfile.python ]; then \
+		$(MAKE) DOCKER_FILE=Dockerfile.python \
+			DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-python \
+			.defaults.lib-whl-image ; \
 	fi
 
-publish-image::
-	@if [ -e Dockerfile ]; then \
-		$(MAKE) publish-image-default ; \
-	else \
-		echo "Skipping publish-image for $(shell pwd) since no Dockerfile is present"; \
+image-ray:
+	@if [ -e Dockerfile.ray ]; then \
+		$(MAKE) DOCKER_FILE=Dockerfile.ray \
+			DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-ray \
+			BASE_IMAGE=$(RAY_BASE_IMAGE) \
+			.defaults.lib-whl-image ; \
 	fi
 
-test-image::
-	@if [ -e Dockerfile ]; then \
-		$(MAKE) test-image-default ; \
-	else \
-		echo "Skipping test-image for $(shell pwd) since no Dockerfile is present"; \
+image-spark:
+	@if [ -e Dockerfile.spark ]; then \
+		$(MAKE) DOCKER_FILE=Dockerfile.spark \
+			DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-spark \
+			BASE_IMAGE=$(SPARK_BASE_IMAGE) \
+			.defaults.lib-whl-image ; \
 	fi
 
+image:: .default.build-lib-wheel
+	## Build all possible images unless a specific runtime is specified
+	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "python" ]; then \
+		$(MAKE) image-python ; \
+	fi
+	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "ray" ]; then \
+		$(MAKE) image-ray ; \
+	fi
+	@if [ -z "$(BUILD_SPECIFIC_RUNTIME)" ] || [ "$(BUILD_SPECIFIC_RUNTIME)" == "spark" ]; then \
+		$(MAKE) image-spark ; \
+	fi
+	-rm -rf data-processing-dist
+
 test-src:: .transforms.test-src
 
 setup:: .transforms.setup
 
-publish-default:: publish-image
-
-publish-image-default:: .defaults.publish-image
-
-test-image-default:: image .transforms.test-image-help .defaults.test-image-pytest .transforms.clean
-
-build-lib-wheel:
-	make -C $(REPOROOT)/data-processing-lib build-pkg-dist
-
-image-default:: build-lib-wheel
-	@$(eval LIB_WHEEL_FILE := $(shell find $(REPOROOT)/data-processing-lib/dist/*.whl))
-	rm -fr dist && mv $(REPOROOT)/data-processing-lib/dist .
-	$(eval WHEEL_FILE_NAME := $(shell basename $(LIB_WHEEL_FILE)))
-	$(DOCKER) build -t $(DOCKER_IMAGE_NAME) $(DOCKER_BUILD_EXTRA_ARGS) \
-		--platform $(DOCKER_PLATFORM) \
-		--build-arg EXTRA_INDEX_URL=$(EXTRA_INDEX_URL) \
-		--build-arg BASE_IMAGE=$(RAY_BASE_IMAGE) \
-		--build-arg BUILD_DATE=$(shell date -u +'%Y-%m-%dT%H:%M:%SZ') \
-		--build-arg WHEEL_FILE_NAME=$(WHEEL_FILE_NAME) \
-		--build-arg TRANSFORM_NAME=$(TRANSFORM_NAME) \
-		--build-arg GIT_COMMIT=$(shell git log -1 --format=%h) .
-	$(DOCKER) tag $(DOCKER_LOCAL_IMAGE) $(DOCKER_REMOTE_IMAGE)
-	rm -fr dist
+kind-load-image:: .transforms.kind-load-image
+
+.PHONY: workflow-venv
+workflow-venv:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-venv; \
+	fi
+
+.PHONY: workflow-test
+workflow-test:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-test; \
+	fi
+
+.PHONY: workflow-upload
+workflow-upload:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-upload; \
+	fi
+
+.PHONY: workflow-build
+workflow-build:
+	if [ -e kfp_ray ]; then \
+		$(MAKE) -C kfp_ray TRANSFORM_NAME=$(TRANSFORM_NAME) workflow-build; \
+	fi
diff --git a/transforms/language/doc_chunk/python/Dockerfile b/transforms/language/doc_chunk/Dockerfile.python
similarity index 70%
rename from transforms/language/doc_chunk/python/Dockerfile
rename to transforms/language/doc_chunk/Dockerfile.python
index 358f9ca13..2571a065c 100644
--- a/transforms/language/doc_chunk/python/Dockerfile
+++ b/transforms/language/doc_chunk/Dockerfile.python
@@ -20,20 +20,9 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}
 
 # END OF STEPS destined for a data-prep-kit base image
 
-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml
+COPY --chown=dpk:root dpk_doc_chunk/ dpk_doc_chunk/
 COPY --chown=dpk:root requirements.txt requirements.txt
-RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .
-
-# copy transform main() entry point to the image
-COPY ./src/doc_chunk_transform_python.py .
-
-# copy some of the samples in
-COPY ./src/doc_chunk_local.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+RUN pip install ${PIP_INSTALL_EXTRA_ARGS} -r requirements.txt
 
 # Set environment
 ENV PYTHONPATH /home/dpk
diff --git a/transforms/language/doc_chunk/ray/Dockerfile b/transforms/language/doc_chunk/Dockerfile.ray
similarity index 50%
rename from transforms/language/doc_chunk/ray/Dockerfile
rename to transforms/language/doc_chunk/Dockerfile.ray
index c64771cc9..1dbf6cfcf 100644
--- a/transforms/language/doc_chunk/ray/Dockerfile
+++ b/transforms/language/doc_chunk/Dockerfile.ray
@@ -12,26 +12,10 @@ ARG DPK_WHEEL_FILE_NAME
 COPY --chown=ray:users data-processing-dist data-processing-dist
 RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
 
-## Copy the python version of the tansform
-COPY --chown=ray:users python-transform/ python-transform/
-RUN cd python-transform && pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .
-#COPY requirements.txt requirements.txt
-#RUN pip install --no-cache-dir -r requirements.txt
-
-COPY --chown=ray:users src/ src/
-COPY --chown=ray:users pyproject.toml pyproject.toml
-RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/doc_chunk_transform_ray.py .
-
-# copy some of the samples in
-COPY ./src/doc_chunk_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
+COPY --chown=ray:users dpk_doc_chunk/ dpk_doc_chunk/
+COPY --chown=ray:users requirements.txt requirements.txt
+RUN pip install ${PIP_INSTALL_EXTRA_ARGS} --no-cache-dir -r requirements.txt
 
 # Set environment
 ENV PYTHONPATH /home/ray
diff --git a/transforms/language/doc_chunk/Makefile b/transforms/language/doc_chunk/Makefile
index bca6f7e85..8e3af6da6 100644
--- a/transforms/language/doc_chunk/Makefile
+++ b/transforms/language/doc_chunk/Makefile
@@ -1,79 +1,24 @@
 REPOROOT=../../..
 # Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
-
-setup::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-clean::
-	@# Help: Recursively make $@ all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-build::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-venv::
-	@# Help: Recursively make $@ in subdirs
-	$(MAKE) RULE=$@ .recurse
-
-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-set-versions:
-	@# Help: Recursively $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-publish::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-image::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test::
-	@# Help: Recursively make $@ in all subdirs
-	@$(MAKE) RULE=$@ .recurse
-
-test-src::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-kind-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-load-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-docker-save-image::
-	@# Help: Recursively make $@ in all subdirs
-	$(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-venv; \
-	fi
-
-.PHONY: workflow-test
-workflow-test:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-test; \
-	fi
-
-.PHONY: workflow-upload
-workflow-upload:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-upload; \
-	fi
-
-.PHONY: workflow-build
-workflow-build:
-	if [ -e kfp_ray ]; then \
-		$(MAKE) -C kfp_ray workflow-build; \
-	fi
-
+include $(REPOROOT)/transforms/.make.cicd.targets
+
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
+
+################################################################################
+
+
+LINUX_WITH_CPU_TORCH?=true
+OS := $(shell uname -s)
+ifeq ($(OS),Linux)
+	ifeq ($(LINUX_WITH_CPU_TORCH),true)
+		PIP_INSTALL_EXTRA_ARGS=--extra-index-url=https://download.pytorch.org/whl/cpu
+		DOCKER_BUILD_EXTRA_ARGS=--build-arg PIP_INSTALL_EXTRA_ARGS=${PIP_INSTALL_EXTRA_ARGS}
+	endif
+endif
diff --git a/transforms/language/doc_chunk/README.md b/transforms/language/doc_chunk/README.md
index e4a58a3bc..7010439d4 100644
--- a/transforms/language/doc_chunk/README.md
+++ b/transforms/language/doc_chunk/README.md
@@ -1,12 +1,164 @@
 # Chunk documents Transform
 
-This transform is chunking documents. It supports multiple _chunker modules_.
-More details as well as a description of the parameters can be found in the [python/README.md](python/README.md).
-
-* [python](python/README.md) - provides the base python-based transformation
-implementation.
-* [ray](ray/README.md) - enables the running of the base python transformation
-in a Ray runtime
-* [kfp](kfp_ray/README.md) - enables running the ray docker image
-in a kubernetes cluster using a generated `yaml` file.
+Please see the set of
+[transform project conventions](../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Contributors
+
+- Michele Dolfi (dol@zurich.ibm.com)
+
+## Description
+
+This transform chunks documents. It supports multiple _chunker modules_ (see the `chunking_type` parameter).
+
+When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker`
+to chunk according to the document layout segmentation, i.e. respecting the original document components such as paragraphs, tables, enumerations, etc.
+It relies on documents converted with the Docling library in the [pdf2parquet transform](../pdf2parquet/python/README.md) using the option `contents_type: "application/json"`,
+which provides the required JSON structure.
+
+When using documents converted to Markdown, the transform leverages the [Llama Index](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) `MarkdownNodeParser`, which relies on its internal Markdown splitting logic.
+
+### Input
+
+| input column name | data type | description |
+|-|-|-|
+| the one specified in _content_column_name_ configuration | string | the content used in this transform |
+
+### Output format
+
+The output parquet file will contain all the original columns, but the content will be replaced with the individual chunks.
+
+#### Tracing the origin of the chunks
+
+The transform allows tracing the origin of each chunk via the `source_doc_id` column, which is set to the value of the `document_id` column (if present) in the input table.
+The actual column names can be customized with the parameters described below.
+
+## Configuration
+
+The transform can be tuned with the following parameters.
+
+| Parameter | Default | Description |
+|------------|----------|--------------|
+| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), and `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. |
+| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
+| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
+| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
+| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
+| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
+| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
+| `output_jsonpath_column_name` | `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
+| `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. |
+| `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. |
+
+## Usage
+
+### Launched Command Line Options
+
+When invoking the CLI, the parameters must be prefixed with `--doc_chunk_`, e.g. `--doc_chunk_column_name_key=myoutput`.
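+
+For example, a minimal local invocation might look like the following sketch (the
+`--data_local_config` input/output option is assumed here from the common DPK launcher
+conventions, and the folder paths are placeholders):
+
+```shell
+python -m dpk_doc_chunk.transform_python \
+    --data_local_config "{'input_folder': 'test-data/input', 'output_folder': 'output'}" \
+    --doc_chunk_chunking_type dl_json
+```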
+
+### Running the samples
+To run the samples, use the following `make` targets:
+
+* `run-cli-sample` - runs dpk_doc_chunk/transform_python.py using command line args
+* `run-local-sample` - runs dpk_doc_chunk/local.py
+
+These targets will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-cli-sample
+...
+```
+Then
+```shell
+ls output
+```
+to see the results of the transform.
+
+### Code example
+
+TBD (link to the notebook will be provided)
+
+See the sample script [dpk_doc_chunk/local_python.py](dpk_doc_chunk/local_python.py).
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
+## Testing
+
+Following [the testing strategy of data-processing-lib](../../../data-processing-lib/doc/transform-testing.md)
+
+Currently we have:
+- [Unit test](test/test_doc_chunk_python.py)
+
+## Further Resources
+
+- For the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker`
+- For the Markdown chunker in LlamaIndex
+  - [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser)
+- For the Token Text Splitter in LlamaIndex
+  - [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/)
+
+# Chunk documents Ray Transform
+
+## Summary
+This project wraps the doc_chunk python transform with a Ray runtime.
+
+## Configuration and command line Options
+
+Chunk documents configuration and command line options are the same as for the base python transform.
+
+## Running
+
+### Launched Command Line Options
+In addition to those available to the transform as defined above,
+the set of
+[ray launcher](../../../data-processing-lib/doc/ray-launcher-options.md) options are available.
+
+### Running the samples
+To run the samples, use the following `make` targets:
+
+* `run-cli-sample` - runs dpk_doc_chunk/ray/transform.py using command line args
+* `run-local-sample` - runs dpk_doc_chunk/ray/local.py
+* `run-s3-sample` - runs dpk_doc_chunk/ray/s3.py
+  * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html)
+    and [here](https://min.io/docs/minio/linux/index.html)),
+    and invocation of `make minio-start` to load data into local minio for S3 access.
+
+These targets will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-cli-sample
+...
+```
+Then
+```shell
+ls output
+```
+to see the results of the transform.
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
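+
+As a sketch of a local Ray run (the `--run_locally` and `--data_local_config` options
+are assumed from the common DPK Ray launcher conventions; the folder paths are
+placeholders):
+
+```shell
+python -m dpk_doc_chunk.ray.transform \
+    --run_locally True \
+    --data_local_config "{'input_folder': 'test-data/input', 'output_folder': 'output'}" \
+    --doc_chunk_chunking_type li_markdown
+```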
diff --git a/transforms/language/doc_chunk/dpk_doc_chunk/__init__.py b/transforms/language/doc_chunk/dpk_doc_chunk/__init__.py
new file mode 100644
index 000000000..29621e921
--- /dev/null
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/__init__.py
@@ -0,0 +1 @@
+from .transform import *
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py b/transforms/language/doc_chunk/dpk_doc_chunk/chunkers.py
similarity index 100%
rename from transforms/language/doc_chunk/python/src/doc_chunk_chunkers.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/chunkers.py
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local.py b/transforms/language/doc_chunk/dpk_doc_chunk/local.py
similarity index 96%
rename from transforms/language/doc_chunk/python/src/doc_chunk_local.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/local.py
index 8c016bf7d..956205bad 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_local.py
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/local.py
@@ -13,7 +13,7 @@
 import os
 
 from data_processing.data_access import DataAccessLocal
-from doc_chunk_transform import DocChunkTransform
+from dpk_doc_chunk.transform import DocChunkTransform
 
 
 # create parameters
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py b/transforms/language/doc_chunk/dpk_doc_chunk/local_python.py
similarity index 95%
rename from transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/local_python.py
index 0c830ee98..51fd4de50 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_local_python.py
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/local_python.py
@@ -16,8 +16,8 @@
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
-from doc_chunk_transform_python import DocChunkPythonTransformConfiguration
-from doc_chunk_transform import chunking_types
+from dpk_doc_chunk.transform_python import DocChunkPythonTransformConfiguration
+from dpk_doc_chunk.transform import chunking_types
 
 # create parameters
 input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "input"))
diff --git a/transforms/language/doc_chunk/dpk_doc_chunk/ray/__init__.py b/transforms/language/doc_chunk/dpk_doc_chunk/ray/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/transforms/language/doc_chunk/ray/src/doc_chunk_local_ray.py b/transforms/language/doc_chunk/dpk_doc_chunk/ray/local.py
similarity index 96%
rename from transforms/language/doc_chunk/ray/src/doc_chunk_local_ray.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/ray/local.py
index 1e2173150..33f601667 100644
--- a/transforms/language/doc_chunk/ray/src/doc_chunk_local_ray.py
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/ray/local.py
@@ -15,7 +15,7 @@
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
-from doc_chunk_transform_ray import DocChunkRayTransformConfiguration
+from dpk_doc_chunk.ray.transform import DocChunkRayTransformConfiguration
 
 
 # create parameters
diff --git a/transforms/language/doc_chunk/ray/src/doc_chunk_s3_ray.py b/transforms/language/doc_chunk/dpk_doc_chunk/ray/s3.py
similarity index 96%
rename from transforms/language/doc_chunk/ray/src/doc_chunk_s3_ray.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/ray/s3.py
index 929674a61..1519b603f 100644
--- a/transforms/language/doc_chunk/ray/src/doc_chunk_s3_ray.py
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/ray/s3.py
@@ -15,7 +15,7 @@
 from data_processing.utils import ParamsUtils
 from data_processing_ray.runtime.ray import RayTransformLauncher
-from doc_chunk_transform_ray import DocChunkRayTransformConfiguration
+from dpk_doc_chunk.ray.transform import DocChunkRayTransformConfiguration
 
 
 print(os.environ)
diff --git a/transforms/language/doc_chunk/ray/src/doc_chunk_transform_ray.py b/transforms/language/doc_chunk/dpk_doc_chunk/ray/transform.py
similarity index 96%
rename from transforms/language/doc_chunk/ray/src/doc_chunk_transform_ray.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/ray/transform.py
index 1dffdbd49..8ebe326ef 100644
--- a/transforms/language/doc_chunk/ray/src/doc_chunk_transform_ray.py
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/ray/transform.py
@@ -23,7 +23,7 @@
 from data_processing_ray.runtime.ray.runtime_configuration import (
     RayTransformRuntimeConfiguration,
 )
-from doc_chunk_transform import DocChunkTransformConfiguration
+from dpk_doc_chunk.transform import DocChunkTransformConfiguration
 
 logger = get_logger(__name__)
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py b/transforms/language/doc_chunk/dpk_doc_chunk/transform.py
similarity index 99%
rename from transforms/language/doc_chunk/python/src/doc_chunk_transform.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/transform.py
index e64a7c1d1..55c287cc8 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_transform.py
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/transform.py
@@ -11,14 +11,13 @@
 ################################################################################
 
 import enum
-import time
 from argparse import ArgumentParser, Namespace
 from typing import Any
 
 import pyarrow as pa
 from data_processing.transform import AbstractTableTransform, TransformConfiguration
 from data_processing.utils import CLIArgumentProvider, TransformUtils, get_logger
-from doc_chunk_chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter
+from dpk_doc_chunk.chunkers import ChunkingExecutor, DLJsonChunker, LIMarkdown, LITokenTextSplitter
 
 
 short_name = "doc_chunk"
diff --git a/transforms/language/doc_chunk/python/src/doc_chunk_transform_python.py b/transforms/language/doc_chunk/dpk_doc_chunk/transform_python.py
similarity index 96%
rename from transforms/language/doc_chunk/python/src/doc_chunk_transform_python.py
rename to transforms/language/doc_chunk/dpk_doc_chunk/transform_python.py
index 1d2738c3f..f037caeb0 100644
--- a/transforms/language/doc_chunk/python/src/doc_chunk_transform_python.py
+++ b/transforms/language/doc_chunk/dpk_doc_chunk/transform_python.py
@@ -15,7 +15,7 @@
     PythonTransformRuntimeConfiguration,
 )
 from data_processing.utils import get_logger
-from doc_chunk_transform import DocChunkTransformConfiguration
+from dpk_doc_chunk.transform import DocChunkTransformConfiguration
 
 logger = get_logger(__name__)
diff --git a/transforms/language/doc_chunk/kfp_ray/Makefile b/transforms/language/doc_chunk/kfp_ray/Makefile
index 30e912e33..fcc12450d 100644
--- a/transforms/language/doc_chunk/kfp_ray/Makefile
+++ b/transforms/language/doc_chunk/kfp_ray/Makefile
@@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../
 WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
 include $(REPOROOT)/transforms/.make.workflows
 
-# Include the common configuration for this transform
-include ../transform.config
+SRC_DIR=${CURDIR}/../
+# Use the docker image that is built for ray runtime
+TRANSFORM_RUNTIME=ray
+## override settings in .make.defaults as they assume the old structure with ray being the current folder
+DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME)
+DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
 
-SRC_DIR=${CURDIR}/../ray/
+# Only build the image with -f Dockerfile.ray
+BUILD_SPECIFIC_RUNTIME=ray
 
 PYTHON_WF := $(shell find ./ -name '*_wf.py')
 YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF})
@@ -17,27 +22,6 @@ clean:
 	@# Help: Clean up the virtual environment.
 	rm -rf ${REPOROOT}/transforms/venv
 
-venv::
-
-build::
-
-setup::
-
-test::
-
-test-src::
-
-test-image::
-
-publish::
-
-image::
-
-kind-load-image::
-
-docker-load-image::
-
-docker-save-image::
 
 .PHONY: workflow-build
 workflow-build: workflow-venv
@@ -45,10 +29,15 @@ workflow-build: workflow-venv
 
 .PHONY: workflow-test
 workflow-test: workflow-build
-	$(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=doc_chunk_wf.yaml
+	$(MAKE) TRANSFORM_SRC=${SRC_DIR} \
+		TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \
+		TRANSFORM_NAME=$(TRANSFORM_NAME) \
+		BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \
+		DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \
+		PIPELINE_FILE=doc_chunk_wf.yaml .workflows.test-pipeline
 
 .PHONY: workflow-upload
 workflow-upload: workflow-build
 	@for file in $(YAML_WF); do \
 		$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \
-	done
+	done
\ No newline at end of file
diff --git a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py
index 387c3bda7..580443bf3 100644
--- a/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py
+++ b/transforms/language/doc_chunk/kfp_ray/doc_chunk_wf.py
@@ -20,7 +20,7 @@
 task_image = "quay.io/dataprep1/data-prep-kit/doc_chunk-ray:latest"
 
 # the name of the job script
-EXEC_SCRIPT_NAME: str = "doc_chunk_transform_ray.py"
+EXEC_SCRIPT_NAME: str = "-m dpk_doc_chunk.ray.transform"
 
 # components
 base_kfp_image = "quay.io/dataprep1/data-prep-kit/kfp-data-processing:latest"
diff --git a/transforms/language/doc_chunk/python/.dockerignore b/transforms/language/doc_chunk/python/.dockerignore
deleted file mode 100644
index f7275bbbd..000000000
--- a/transforms/language/doc_chunk/python/.dockerignore
+++ /dev/null
@@ -1 +0,0 @@
-venv/
diff --git a/transforms/language/doc_chunk/python/Makefile b/transforms/language/doc_chunk/python/Makefile
deleted file mode 100644
index 2f2a7e789..000000000
--- a/transforms/language/doc_chunk/python/Makefile
+++ /dev/null
@@ -1,74 +0,0 @@
-# Define the root of the local git clone for the common rules to be able
-# know where they are running from.
-REPOROOT=../../../..
-
-# Set this, before including .make.defaults, to
-#   1 if requirements reference the latest code in the data processing library
-#     in this repo (that is not yet published to pypi). This is the default setting.
-#   0 if the transforms DPK dependencies are on wheels published to
-#     pypi (e.g. data-prep-toolkit=0.2.1)
-#USE_REPO_LIB_SRC=1
-
-# Include a library of common .transform.* targets which most
-# transforms should be able to reuse. However, feel free
-# to override/redefine the rules below.
-include $(REPOROOT)/transforms/.make.transforms
-
-# Include the common configuration for this transform
-include ../transform.config
-
-LINUX_WITH_CPU_TORCH?=true
-OS := $(shell uname -s)
-ifeq ($(OS),Linux)
-	ifeq ($(LINUX_WITH_CPU_TORCH),true)
-		PIP_INSTALL_EXTRA_ARGS=--extra-index-url=https://download.pytorch.org/whl/cpu
-		DOCKER_BUILD_EXTRA_ARGS=--build-arg PIP_INSTALL_EXTRA_ARGS=${PIP_INSTALL_EXTRA_ARGS}
-	endif
-endif
-
-
-venv:: .transforms.python-venv
-
-test:: .transforms.python-test
-
-clean:: .transforms.clean
-
-image:: .transforms.python-image
-
-test-src:: .transforms.test-src
-
-setup:: .transforms.setup
-
-build:: build-dist image
-
-publish: publish-image
-
-publish-image:: .transforms.publish-image-python
-
-setup:: .transforms.setup
-
-# distribution versions is the same as image version.
-set-versions:
-	$(MAKE) TRANSFORM_PYTHON_VERSION=$(DOC_CHUNK_PYTHON_VERSION) TOML_VERSION=$(DOC_CHUNK_PYTHON_VERSION) .transforms.set-versions
-
-build-dist:: set-versions .defaults.build-dist
-
-publish-dist:: .defaults.publish-dist
-
-test-image:: .transforms.python-test-image
-
-run-cli-sample: .transforms.run-cli-python-sample
-
-run-local-sample: .transforms.run-local-sample
-
-run-local-python-sample: .transforms.run-local-python-sample
-
-#run-s3-ray-sample: .transforms.run-s3-ray-sample
-
-minio-start: .minio-start
-
-kind-load-image:: .transforms.kind-load-image
-
-docker-load-image: .defaults.docker-load-image
-
-docker-save-image: .defaults.docker-save-image
diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md
deleted file mode 100644
index 1ec3a8080..000000000
--- a/transforms/language/doc_chunk/python/README.md
+++ /dev/null
@@ -1,117 +0,0 @@
-# Chunk documents Transform
-
-Please see the set of
-[transform project conventions](../../../README.md#transform-project-conventions)
-for details on general project conventions, transform configuration,
-testing and IDE set up.
-
-## Contributors
-
-- Michele Dolfi (dol@zurich.ibm.com)
-
-## Description
-
-This transform is chunking documents. It supports multiple _chunker modules_ (see the `chunking_type` parameter).
-
-When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker`
-to chunk according to the document layout segmentation, i.e. respecting the original document components as paragraphs, tables, enumerations, etc.
-It relies on documents converted with the Docling library in the [pdf2parquet transform](../../pdf2parquet/python/README.md) using the option `contents_type: "application/json"`,
-which provides the required JSON structure.
-
-When using documents converted to Markdown, the transform leverages the [Llama Index](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) `MarkdownNodeParser`, which is relying on its internal Markdown splitting logic.
-
-
-### Input
-
-| input column name | data type | description |
-|-|-|-|
-| the one specified in _content_column_name_ configuration | string | the content used in this transform |
-
-
-### Output format
-
-The output parquet file will contain all the original columns, but the content will be replaced with the individual chunks.
-
-
-#### Tracing the origin of the chunks
-
-The transform allows to trace the origin of the chunk with the `source_doc_id` which is set to the value of the `document_id` column (if present) in the input table.
-The actual name of columns can be customized with the parameters described below.
-
-
-## Configuration
-
-The transform can be tuned with the following parameters.
-
-
-| Parameter | Default | Description |
-|------------|----------|--------------|
-| `chunking_type` | `dl_json` | Chunking type to apply. Valid options are `li_markdown` for using the LlamaIndex [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser), `dl_json` for using the [Docling JSON chunking](https://github.com/DS4SD/docling), `li_token_text` for using the LlamaIndex [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/), which chunks the text into fixed-sized windows of tokens. |
-| `content_column_name` | `contents` | Name of the column containing the text to be chunked. |
-| `doc_id_column_name` | `document_id` | Name of the column containing the doc_id to be propagated in the output. |
-| `chunk_size_tokens` | `128` | Size of the chunk in tokens for the token text chunker. |
-| `chunk_overlap_tokens` | `30` | Number of tokens overlapping between chunks for the token text chunker. |
-| `output_chunk_column_name` | `contents` | Column name to store the chunks in the output table. |
-| `output_source_doc_id_column_name` | `source_document_id` | Column name to store the `doc_id` from the input table. |
-| `output_jsonpath_column_name`| `doc_jsonpath` | Column name to store the document path of the chunk in the output table. |
-| `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. |
-| `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. |
-
-
-
-## Usage
-
-### Launched Command Line Options
-
-When invoking the CLI, the parameters must be set as `--doc_chunk_`, e.g. `--doc_chunk_column_name_key=myoutput`.
-
-
-### Running the samples
-To run the samples, use the following `make` targets
-
-* `run-cli-sample` - runs src/doc_chunk_transform.py using command line args
-* `run-local-sample` - runs src/doc_chunk_local.py
-
-These targets will activate the virtual environment and set up any configuration needed.
-Use the `-n` option of `make` to see the detail of what is done to run the sample.
-
-For example,
-```shell
-make run-cli-sample
-...
-```
-Then
-```shell
-ls output
-```
-To see results of the transform.
-
-### Code example
-
-TBD (link to the notebook will be provided)
-
-See the sample script [src/doc_chunk_local_python.py](src/doc_chunk_local_python.py).
-
-
-### Transforming data using the transform image
-
-To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
-substituting the name of this transform image and runtime as appropriate.
-
-## Testing
-
-Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md)
-
-Currently we have:
-- [Unit test](test/test_doc_chunk_python.py)
-
-
-## Further Resource
-
-- For the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker`
-  -
-- For the Markdown chunker in LlamaIndex
-  - [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser)
-- For the Token Text Splitter in LlamaIndex
-  - [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/)
diff --git a/transforms/language/doc_chunk/python/pyproject.toml b/transforms/language/doc_chunk/python/pyproject.toml
deleted file mode 100644
index c9728712e..000000000
--- a/transforms/language/doc_chunk/python/pyproject.toml
+++ /dev/null
@@ -1,47 +0,0 @@
-[project]
-name = "dpk_doc_chunk_transform_python"
-version = "0.3.0"
-requires-python = ">=3.10,<3.13"
-description = "chunk documents Python Transform"
-license = {text = "Apache-2.0"}
-readme = {file = "README.md", content-type = "text/markdown"}
-authors = [
-    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
-    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
-    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
-]
-dynamic = ["dependencies"]
-
-[build-system]
-requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
-build-backend = "setuptools.build_meta"
-
-[tool.setuptools.dynamic]
-dependencies = {file = ["requirements.txt"]}
-
-[project.optional-dependencies]
-dev = [
-    "twine",
-    "pytest>=7.3.2",
-    "pytest-dotenv>=0.5.2",
-    "pytest-env>=1.0.0",
-    "pre-commit>=3.3.2",
-    "pytest-cov>=4.1.0",
-    "pytest-mock>=3.10.0",
-    "moto==5.0.5",
-    "markupsafe==2.0.1",
-]
-
-[options]
-package_dir = ["src","test"]
-
-[options.packages.find]
-where = ["src/"]
-
-[tool.pytest.ini_options]
-# Currently we use low coverage since we have to run tests separately (see makefile)
-#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
-markers = ["unit: unit tests", "integration: integration tests"]
-
-[tool.coverage.run]
-include = ["src/*"]
diff --git a/transforms/language/doc_chunk/ray/Makefile b/transforms/language/doc_chunk/ray/Makefile
deleted file mode 100644
index b4f394f84..000000000
--- a/transforms/language/doc_chunk/ray/Makefile
+++ /dev/null
@@ -1,72 +0,0 @@
-# Define the root of the local git clone for the common rules to be able
-# know where they are running from.
-REPOROOT=../../../..
-
-# Set this, before including .make.defaults, to
-#   1 if requirements reference the latest code in the data processing library
-#     in this repo (that is not yet published to pypi). This is the default setting.
-#   0 if the transforms DPK dependencies are on wheels published to
-#     pypi (e.g. data-prep-toolkit=0.2.1)
-#USE_REPO_LIB_SRC=1
-
-# Include a library of common .transform.* targets which most
-# transforms should be able to reuse. However, feel free
-# to override/redefine the rules below.
-include $(REPOROOT)/transforms/.make.transforms
-
-# Include the common configuration for this transform
-include ../transform.config
-
-LINUX_WITH_CPU_TORCH?=true
-OS := $(shell uname -s)
-ifeq ($(OS),Linux)
-	ifeq ($(LINUX_WITH_CPU_TORCH),true)
-		PIP_INSTALL_EXTRA_ARGS=--extra-index-url=https://download.pytorch.org/whl/cpu
-		DOCKER_BUILD_EXTRA_ARGS=--build-arg PIP_INSTALL_EXTRA_ARGS=${PIP_INSTALL_EXTRA_ARGS}
-	endif
-endif
-
-BASE_IMAGE=${RAY_BASE_IMAGE}
-venv:: .transforms.ray-venv
-
-test:: .transforms.ray-test
-
-clean:: .transforms.clean
-
-image:: .transforms.ray-image
-
-test-src:: .transforms.test-src
-
-setup:: .transforms.setup
-
-test-image:: .transforms.ray-test-image
-
-build:: build-dist image
-
-publish: publish-image
-
-publish-image:: .transforms.publish-image-ray
-
-setup:: .transforms.setup
-
-# set the version of python transform that this depends on.
-set-versions:
-	$(MAKE) TRANSFORM_PYTHON_VERSION=${DOC_CHUNK_PYTHON_VERSION} TOML_VERSION=$(DOC_CHUNK_RAY_VERSION) .transforms.set-versions
-
-build-dist:: set-versions .defaults.build-dist
-
-publish-dist:: .defaults.publish-dist
-
-run-cli-sample: .transforms.run-cli-ray-sample
-
-run-local-sample: .transforms.run-local-ray-sample
-
-run-s3-sample: .transforms.run-s3-ray-sample
-
-minio-start: .minio-start
-
-kind-load-image:: .transforms.kind-load-image
-
-docker-load-image: .defaults.docker-load-image
-
-docker-save-image: .defaults.docker-save-image
diff --git a/transforms/language/doc_chunk/ray/README.md b/transforms/language/doc_chunk/ray/README.md
deleted file mode 100644
index f9bde5a1b..000000000
--- a/transforms/language/doc_chunk/ray/README.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Chunk documents Ray Transform
-Please see the set of
-[transform project conventions](../../../README.md#transform-project-conventions)
-for details on general project conventions, transform configuration,
-testing and IDE set up.
-
-## Summary
-This project wraps the [doc_chunk transform](../python) with a Ray runtime.
-
-## Configuration and command line Options
-
-chunk documents configuration and command line options are the same as for the base python transform.
-
-## Running
-
-### Launched Command Line Options
-In addition to those available to the transform as defined in [here](../python/README.md),
-the set of
-[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available.
-
-### Running the samples
-To run the samples, use the following `make` targets
-
-* `run-cli-sample` - runs src/doc_chunk_transform.py using command line args
-* `run-local-sample` - runs src/doc_chunk_local_ray.py
-* `run-s3-sample` - runs src/doc_chunk_s3_ray.py
-    * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html)
-    and [here](https://min.io/docs/minio/linux/index.html)
-    and invocation of `make minio-start` to load data into local minio for S3 access.
-
-These targets will activate the virtual environment and set up any configuration needed.
-Use the `-n` option of `make` to see the detail of what is done to run the sample.
-
-For example,
-```shell
-make run-cli-sample
-...
-```
-Then
-```shell
-ls output
-```
-To see results of the transform.
-
-### Transforming data using the transform image
-
-To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
-substituting the name of this transform image and runtime as appropriate.
diff --git a/transforms/language/doc_chunk/ray/pyproject.toml b/transforms/language/doc_chunk/ray/pyproject.toml
deleted file mode 100644
index 29b594fac..000000000
--- a/transforms/language/doc_chunk/ray/pyproject.toml
+++ /dev/null
@@ -1,47 +0,0 @@
-[project]
-name = "dpk_doc_chunk_transform_ray"
-version = "0.3.0"
-requires-python = ">=3.10,<3.13"
-description = "chunk documents Ray Transform"
-license = {text = "Apache-2.0"}
-readme = {file = "README.md", content-type = "text/markdown"}
-authors = [
-    { name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
-    { name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
-    { name = "Christoph Auer", email = "cau@zurich.ibm.com" },
-]
-dependencies = [
-    "dpk-doc-chunk-transform-python==0.3.0",
-    "data-prep-toolkit[ray]==0.2.2.dev2",
-]
-
-[build-system]
-requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
-build-backend = "setuptools.build_meta"
-
-[project.optional-dependencies]
-dev = [
-    "twine",
-    "pytest>=7.3.2",
-    "pytest-dotenv>=0.5.2",
-    "pytest-env>=1.0.0",
-    "pre-commit>=3.3.2",
-    "pytest-cov>=4.1.0",
-    "pytest-mock>=3.10.0",
-    "moto==5.0.5",
-    "markupsafe==2.0.1",
-]
-
-[options]
-package_dir = ["src","test"]
-
-[options.packages.find]
-where = ["src/"]
-
-[tool.pytest.ini_options]
-# Currently we use low coverage since we have to run tests separately (see makefile)
-#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
-markers = ["unit: unit tests", "integration: integration tests"]
-
-[tool.coverage.run]
-include = ["src/*"]
diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/requirements.txt
similarity index 100%
rename from transforms/language/doc_chunk/python/requirements.txt
rename to transforms/language/doc_chunk/requirements.txt
diff --git a/transforms/language/doc_chunk/python/test-data/expected/metadata.json b/transforms/language/doc_chunk/test-data/expected/metadata.json
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/expected/metadata.json
rename to transforms/language/doc_chunk/test-data/expected/metadata.json
diff --git a/transforms/language/doc_chunk/python/test-data/expected/test1.parquet b/transforms/language/doc_chunk/test-data/expected/test1.parquet
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/expected/test1.parquet
rename to transforms/language/doc_chunk/test-data/expected/test1.parquet
diff --git a/transforms/language/doc_chunk/python/test-data/expected_md/2206.01062.parquet b/transforms/language/doc_chunk/test-data/expected_md/2206.01062.parquet
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/expected_md/2206.01062.parquet
rename to transforms/language/doc_chunk/test-data/expected_md/2206.01062.parquet
diff --git a/transforms/language/doc_chunk/python/test-data/expected_md/metadata.json b/transforms/language/doc_chunk/test-data/expected_md/metadata.json
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/expected_md/metadata.json
rename to transforms/language/doc_chunk/test-data/expected_md/metadata.json
diff --git a/transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json b/transforms/language/doc_chunk/test-data/expected_token_text/metadata.json
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/expected_token_text/metadata.json
rename to transforms/language/doc_chunk/test-data/expected_token_text/metadata.json
diff --git a/transforms/language/doc_chunk/python/test-data/expected_token_text/sample1.parquet b/transforms/language/doc_chunk/test-data/expected_token_text/sample1.parquet
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/expected_token_text/sample1.parquet
rename to transforms/language/doc_chunk/test-data/expected_token_text/sample1.parquet
diff --git a/transforms/language/doc_chunk/python/test-data/input/test1.parquet b/transforms/language/doc_chunk/test-data/input/test1.parquet
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/input/test1.parquet
rename to transforms/language/doc_chunk/test-data/input/test1.parquet
diff --git a/transforms/language/doc_chunk/python/test-data/input_md/2206.01062.parquet b/transforms/language/doc_chunk/test-data/input_md/2206.01062.parquet
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/input_md/2206.01062.parquet
rename to transforms/language/doc_chunk/test-data/input_md/2206.01062.parquet
diff --git a/transforms/language/doc_chunk/python/test-data/input_token_text/sample1.parquet b/transforms/language/doc_chunk/test-data/input_token_text/sample1.parquet
similarity index 100%
rename from transforms/language/doc_chunk/python/test-data/input_token_text/sample1.parquet
rename to transforms/language/doc_chunk/test-data/input_token_text/sample1.parquet
diff --git a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py b/transforms/language/doc_chunk/test/test_doc_chunk_python.py
similarity index 95%
rename from transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
rename to transforms/language/doc_chunk/test/test_doc_chunk_python.py
index 5ecfa49a2..9f9c9b796 100644
--- a/transforms/language/doc_chunk/python/test/test_doc_chunk_python.py
+++ b/transforms/language/doc_chunk/test/test_doc_chunk_python.py
@@ -16,12 +16,12 @@
 from data_processing.test_support.launch.transform_test import (
     AbstractTransformLauncherTest,
 )
-from doc_chunk_transform import (
+from dpk_doc_chunk.transform import (
     chunking_type_cli_param,
     output_chunk_column_name_cli_param,
     chunking_types
 )
-from doc_chunk_transform_python import DocChunkPythonTransformConfiguration
+from dpk_doc_chunk.transform_python import DocChunkPythonTransformConfiguration
 
 
 class TestPythonDocChunkTransform(AbstractTransformLauncherTest):
diff --git a/transforms/language/doc_chunk/ray/test/test_doc_chunk_ray.py b/transforms/language/doc_chunk/test/test_doc_chunk_ray.py
similarity index 95%
rename from transforms/language/doc_chunk/ray/test/test_doc_chunk_ray.py
rename to transforms/language/doc_chunk/test/test_doc_chunk_ray.py
index 847101587..c8083ab7d 100644
--- a/transforms/language/doc_chunk/ray/test/test_doc_chunk_ray.py
+++ b/transforms/language/doc_chunk/test/test_doc_chunk_ray.py
@@ -16,7 +16,7 @@
     AbstractTransformLauncherTest,
 )
 from data_processing_ray.runtime.ray import RayTransformLauncher
-from doc_chunk_transform_ray import DocChunkRayTransformConfiguration
+from dpk_doc_chunk.ray.transform import DocChunkRayTransformConfiguration
 
 
 class TestRayDocChunkTransform(AbstractTransformLauncherTest):
diff --git a/transforms/pyproject.toml b/transforms/pyproject.toml
index 2357553e4..00f3d0433 100644
--- a/transforms/pyproject.toml
+++ b/transforms/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_transforms"
-version = "0.2.2.dev3"
+version = "1.0.1.dev0"
 requires-python = ">=3.10,<3.13"
"ai", "fine-tuning", "llmapps" ] description = "Data Preparation Toolkit Transforms using Ray" @@ -30,7 +30,7 @@ all = { file = [ "code/code2parquet/python/requirements.txt", "language/doc_quality/python/requirements.txt", -"language/doc_chunk/python/requirements.txt", +"language/doc_chunk/requirements.txt", ##### Cannot have html2parquet until we solve ## docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1 ## trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8" @@ -61,7 +61,7 @@ code_quality = { file = ["code/code_quality/python/requirements.txt"]} code2parquet = {file = ["code/code2parquet/python/requirements.txt"]} doc_quality = { file = ["language/doc_quality/python/requirements.txt"]} -doc_chunk = { file = ["language/doc_chunk/python/requirements.txt"]} +doc_chunk = { file = ["language/doc_chunk/requirements.txt"]} html2parquet = { file = ["language/html2parquet/python/requirements.txt"]} pii_redactor = { file = ["language/pii_redactor/python/requirements.txt"]} lang_id = { file = ["language/lang_id/python/requirements.txt"]} @@ -80,21 +80,18 @@ web2parquet = { file = ["universal/web2parquet/requirements.txt"]} # Does not seem to work for our custom layout # copy all files to a single src and let automatic discovery find them -[tool.setuptools.package-data] -"*" = ["*.txt"] - -[tool.setuptools.packages.find] -where = ["src"] +#[tool.setuptools.package-data] +#"*" = ["*.txt"] +# To include this, comment out the package.find section, +# uncomment the package-dir section and rerun the build +# while keeping the build folder from previous run #[tool.setuptools.package-dir] +#dpk_pdf2parquet = "language/html2parquet/dpk_pdf2parquet" +#dpk_doc_chunck = "universal/doc_chunck/dpk_web2parquet" +#dpk_html2parquet = "language/html2parquet/dpk_html2parquet" #dpk_web2parquet = "universal/web2parquet/dpk_web2parquet" -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src"] - [tool.pytest.ini_options] # Currently we use low coverage since we have to run tests separately (see makefile) #addopts = "--cov --cov-report term-missing --cov-fail-under 25"