diff --git a/.github/workflows/test-code-code2parquet-kfp.yml b/.github/workflows/test-code-code2parquet-kfp.yml index 710654571..882e5b65e 100644 --- a/.github/workflows/test-code-code2parquet-kfp.yml +++ b/.github/workflows/test-code-code2parquet-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code2parquet/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-code_quality-kfp.yml b/.github/workflows/test-code-code_quality-kfp.yml index c07bc1d2d..f6ccb3a9e 100644 --- a/.github/workflows/test-code-code_quality-kfp.yml +++ b/.github/workflows/test-code-code_quality-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code_quality/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/code_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-header_cleanser-kfp.yml b/.github/workflows/test-code-header_cleanser-kfp.yml index 7c419fd14..755393e5d 100644 --- a/.github/workflows/test-code-header_cleanser-kfp.yml +++ b/.github/workflows/test-code-header_cleanser-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/header_cleanser/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/header_cleanser/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-license_select-kfp.yml b/.github/workflows/test-code-license_select-kfp.yml index d72a85dd4..7b6b69ef9 100644 --- a/.github/workflows/test-code-license_select-kfp.yml +++ b/.github/workflows/test-code-license_select-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/license_select/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/license_select/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-malware-kfp.yml b/.github/workflows/test-code-malware-kfp.yml index 89bf47239..e68eb175e 100644 --- a/.github/workflows/test-code-malware-kfp.yml +++ b/.github/workflows/test-code-malware-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/malware/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/malware/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-proglang_select-kfp.yml 
b/.github/workflows/test-code-proglang_select-kfp.yml index 31328f3d5..edbf09ea1 100644 --- a/.github/workflows/test-code-proglang_select-kfp.yml +++ b/.github/workflows/test-code-proglang_select-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/proglang_select/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/proglang_select/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-code-repo_level_ordering-kfp.yml b/.github/workflows/test-code-repo_level_ordering-kfp.yml index 4e328f53e..26374677a 100644 --- a/.github/workflows/test-code-repo_level_ordering-kfp.yml +++ b/.github/workflows/test-code-repo_level_ordering-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/repo_level_ordering/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/code/repo_level_ordering/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-kfp-transform.template b/.github/workflows/test-kfp-transform.template index f12511118..bed645bed 100644 --- a/.github/workflows/test-kfp-transform.template +++ b/.github/workflows/test-kfp-transform.template @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "@TARGET_TRANSFORM_DIR@/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "@TARGET_TRANSFORM_DIR@/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-doc_chunk-kfp.yml b/.github/workflows/test-language-doc_chunk-kfp.yml index 985f79b97..fe347dc61 100644 --- a/.github/workflows/test-language-doc_chunk-kfp.yml +++ b/.github/workflows/test-language-doc_chunk-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_chunk/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_chunk/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-doc_quality-kfp.yml b/.github/workflows/test-language-doc_quality-kfp.yml index 6842a1859..33910778a 100644 --- a/.github/workflows/test-language-doc_quality-kfp.yml +++ b/.github/workflows/test-language-doc_quality-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_quality/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/doc_quality/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-html2parquet-kfp.yml 
b/.github/workflows/test-language-html2parquet-kfp.yml index c7a1cecc5..5da046347 100644 --- a/.github/workflows/test-language-html2parquet-kfp.yml +++ b/.github/workflows/test-language-html2parquet-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/html2parquet/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/html2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-lang_id-kfp.yml b/.github/workflows/test-language-lang_id-kfp.yml index 936ba8e45..562c38362 100644 --- a/.github/workflows/test-language-lang_id-kfp.yml +++ b/.github/workflows/test-language-lang_id-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/lang_id/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/lang_id/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-pdf2parquet-kfp.yml b/.github/workflows/test-language-pdf2parquet-kfp.yml index f232d78e4..a304e22ec 100644 --- a/.github/workflows/test-language-pdf2parquet-kfp.yml +++ b/.github/workflows/test-language-pdf2parquet-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pdf2parquet/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pdf2parquet/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-pii_redactor-kfp.yml b/.github/workflows/test-language-pii_redactor-kfp.yml index 451ac7961..3982c2f67 100644 --- a/.github/workflows/test-language-pii_redactor-kfp.yml +++ b/.github/workflows/test-language-pii_redactor-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pii_redactor/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/pii_redactor/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-language-text_encoder-kfp.yml b/.github/workflows/test-language-text_encoder-kfp.yml index 96b8308d1..fdc085058 100644 --- a/.github/workflows/test-language-text_encoder-kfp.yml +++ b/.github/workflows/test-language-text_encoder-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/text_encoder/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/language/text_encoder/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-doc_id-kfp.yml 
b/.github/workflows/test-universal-doc_id-kfp.yml index fadcc6403..194b5b65f 100644 --- a/.github/workflows/test-universal-doc_id-kfp.yml +++ b/.github/workflows/test-universal-doc_id-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/doc_id/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/doc_id/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-ededup-kfp.yml b/.github/workflows/test-universal-ededup-kfp.yml index 225d7539e..48d3a1469 100644 --- a/.github/workflows/test-universal-ededup-kfp.yml +++ b/.github/workflows/test-universal-ededup-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/ededup/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/ededup/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-fdedup-kfp.yml b/.github/workflows/test-universal-fdedup-kfp.yml index b36df964d..cb2055bfc 100644 --- a/.github/workflows/test-universal-fdedup-kfp.yml +++ b/.github/workflows/test-universal-fdedup-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/fdedup/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/fdedup/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-filter-kfp.yml b/.github/workflows/test-universal-filter-kfp.yml index cab769f11..2fe6d33f7 100644 --- a/.github/workflows/test-universal-filter-kfp.yml +++ b/.github/workflows/test-universal-filter-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/filter/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/filter/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-hap-kfp.yml b/.github/workflows/test-universal-hap-kfp.yml index d5d6aa63c..c69077368 100644 --- a/.github/workflows/test-universal-hap-kfp.yml +++ b/.github/workflows/test-universal-hap-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/hap/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/hap/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-noop-kfp.yml b/.github/workflows/test-universal-noop-kfp.yml index 9322ea9b6..2be2a6e87 100644 --- a/.github/workflows/test-universal-noop-kfp.yml +++ 
b/.github/workflows/test-universal-noop-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/noop/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/noop/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-profiler-kfp.yml b/.github/workflows/test-universal-profiler-kfp.yml index f31543ad3..63631e4d6 100644 --- a/.github/workflows/test-universal-profiler-kfp.yml +++ b/.github/workflows/test-universal-profiler-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/profiler/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/profiler/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-resize-kfp.yml b/.github/workflows/test-universal-resize-kfp.yml index 3e099bba5..c502cb4b9 100644 --- a/.github/workflows/test-universal-resize-kfp.yml +++ b/.github/workflows/test-universal-resize-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/resize/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/resize/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/.github/workflows/test-universal-tokenization-kfp.yml b/.github/workflows/test-universal-tokenization-kfp.yml index e4a5b5693..887dd4eb3 100644 --- a/.github/workflows/test-universal-tokenization-kfp.yml +++ b/.github/workflows/test-universal-tokenization-kfp.yml @@ -13,6 +13,7 @@ on: - "*" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/tokenization/**" - "!kfp/**" # This is tested in separate workflow @@ -27,6 +28,7 @@ on: - "releases/**" paths: - ".make.*" + - "scripts/k8s-setup/requirements.env" - "transforms/.make.workflows" - "transforms/universal/tokenization/**" - "!data-processing-lib/**" # This is tested in separate workflow diff --git a/README.md b/README.md index 2c1caa04e..716f3b0f2 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,14 @@ Explore more examples [here](examples/notebooks). ### Run your first data prep pipeline -Now that you have run a single transform, the next step is to explore how to put these transforms together to run a data prep pipeline for an end to end use case like fine tuning model or building a RAG application. This [notebook](examples/notebooks/fine%20tuning/code/sample-notebook.ipynb) gives an example of how to build an end to end data prep pipeline for fine tuning for code LLMs. You can also explore how to build a RAG pipeline [here](examples/notebooks/rag). +Now that you have run a single transform, the next step is to explore how to put these transforms +together to run a data prep pipeline for an end to end use case like fine tuning a model or building +a RAG application. 
+This [notebook](examples/notebooks/fine%20tuning/code/sample-notebook.ipynb) gives an example of +how to build an end to end data prep pipeline for fine tuning for code LLMs. Similarly, this +[notebook](examples/notebooks/fine%20tuning/language/demo_with_launcher.ipynb) shows an end-to-end +sample data prep pipeline for fine tuning on language datasets. +You can also explore how to build a RAG pipeline [here](examples/notebooks/rag). ### Current list of transforms The matrix below shows the the combination of modules and supported runtimes. All the modules can be accessed [here](transforms) and can be combined to form data processing pipelines, as shown in the [examples](examples) folder. @@ -133,7 +140,8 @@ The matrix below shows the the combination of modules and supported runtimes. Al | **Data Ingestion** | | | | | | [Code (from zip) to Parquet](transforms/code/code2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [PDF to Parquet](transforms/language/pdf2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | -| [HTML to Parquet](transforms/language/html2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | | +| [HTML to Parquet](transforms/language/html2parquet/python/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | +| [Web to Parquet](transforms/universal/web2parquet/README.md) | :white_check_mark: | | | | | **Universal (Code & Language)** | | | | | | [Exact dedup filter](transforms/universal/ededup/ray/README.md) | :white_check_mark: | :white_check_mark: | | :white_check_mark: | | [Fuzzy dedup filter](transforms/universal/fdedup/ray/README.md) | | :white_check_mark: | | :white_check_mark: | @@ -223,11 +231,11 @@ If you use Data Prep Kit in your research, please cite our paper: @misc{wood2024dataprepkitgettingdataready, title={Data-Prep-Kit: getting your data ready for LLM application development}, author={David Wood and Boris Lublinsky and Alexy Roytman and Shivdeep Singh - and Abdulhamid Adebayo and Revital Eres and Mohammad Nassar and Hima Patel - and Yousaf Shah and Constantin Adam and Petros Zerfos and Nirmit Desai - and Daiki Tsuzuku and Takuya Goto and Michele Dolfi and Saptha Surendran - and Paramesvaran Selvam and Sungeun An and Yuan Chi Chang and Dhiraj Joshi - and Hajar Emami-Gohari and Xuan-Hong Dang and Yan Koyfman and Shahrokh Daijavad}, + and Constantin Adam and Abdulhamid Adebayo and Sungeun An and Yuan Chi Chang + and Xuan-Hong Dang and Nirmit Desai and Michele Dolfi and Hajar Emami-Gohari + and Revital Eres and Takuya Goto and Dhiraj Joshi and Yan Koyfman + and Mohammad Nassar and Hima Patel and Paramesvaran Selvam and Yousaf Shah + and Saptha Surendran and Daiki Tsuzuku and Petros Zerfos and Shahrokh Daijavad}, year={2024}, eprint={2409.18164}, archivePrefix={arXiv}, diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py index ac5e32689..28f36acf7 100644 --- a/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py +++ b/kfp/kfp_support_lib/kfp_v2_workflow_support/src/workflow_support/compile_utils/component.py @@ -103,7 +103,7 @@ def _add_node_selector() -> None: def set_s3_env_vars_to_component( task: dsl.PipelineTask, secret: str = "", - env2key: Dict[str, str] = {"s3-key": "S3_KEY", "s3-secret": "S3_SECRET", "s3-endpoint": "ENDPOINT"},
+ env2key: Dict[str, str] = None, prefix: str = None, ) -> None: """ @@ -113,6 +113,8 @@ def set_s3_env_vars_to_component( :param env2key: dict with mapping each env variable to a key in the secret :param prefix: prefix to add to env name """ + if env2key is None: + env2key = {"s3-key": "S3_KEY", "s3-secret": "S3_SECRET", "s3-endpoint": "ENDPOINT"} if prefix is not None: for secret_key, _ in env2key.items(): diff --git a/scripts/k8s-setup/requirements.env b/scripts/k8s-setup/requirements.env index 756cd69e4..9fbc5f9de 100644 --- a/scripts/k8s-setup/requirements.env +++ b/scripts/k8s-setup/requirements.env @@ -1,5 +1,5 @@ KUBERAY_OPERATOR=1.0.0 -KUBERAY_APISERVER=1.1.0 +KUBERAY_APISERVER=1.2.2 KIND_VERSION=0.22.0 HELM_VERSION=3.10.0 diff --git a/scripts/k8s-setup/tools/install_kuberay.sh b/scripts/k8s-setup/tools/install_kuberay.sh index 341ad2c85..30563b931 100755 --- a/scripts/k8s-setup/tools/install_kuberay.sh +++ b/scripts/k8s-setup/tools/install_kuberay.sh @@ -9,10 +9,10 @@ MAX_RETRIES="${MAX_RETRIES:-5}" EXIT_CODE=0 deploy() { - sed -i.back "s/tag: v[0-9].*/tag: v${KUBERAY_APISERVER}/" ${K8S_SETUP_SCRIPTS}/ray_api_server_values.yaml helm repo add kuberay https://ray-project.github.io/kuberay-helm/ helm repo update kuberay helm install kuberay-operator kuberay/kuberay-operator -n kuberay --version ${KUBERAY_OPERATOR} --set image.pullPolicy=IfNotPresent --create-namespace + sed -i.back "s/tag: v[0-9].*/tag: v${KUBERAY_APISERVER}/" ${K8S_SETUP_SCRIPTS}/ray_api_server_values.yaml helm install -f ${K8S_SETUP_SCRIPTS}/ray_api_server_values.yaml kuberay-apiserver kuberay/kuberay-apiserver -n kuberay --version ${KUBERAY_APISERVER} --set image.pullPolicy=IfNotPresent echo "Finished KubeRay deployment." } diff --git a/transforms/README-list.md b/transforms/README-list.md index 3e70b6b62..8040dc7a9 100644 --- a/transforms/README-list.md +++ b/transforms/README-list.md @@ -36,8 +36,13 @@ Note: This list includes the transforms that were part of the release starting w * [tokenization](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/tokenization/python/README.md) * [doc_id](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/doc_id/python/README.md) * [web2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/universal/web2parquet/README.md) + +## Release notes: - +### 0.2.2.dev3 +* web2parquet +### 0.2.2.dev2 +* pdf2parquet now supports HTML,DOCX,PPTX, ... in addition to PDF diff --git a/transforms/language/doc_chunk/doc_chunk.ipynb b/transforms/language/doc_chunk/doc_chunk.ipynb new file mode 100644 index 000000000..3a8466037 --- /dev/null +++ b/transforms/language/doc_chunk/doc_chunk.ipynb @@ -0,0 +1,192 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. 
Example for transform developers working from git clone:\n", + "```\n", + "make venv\n", + "source venv/bin/activate && pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. We will only show the use of data_files_to_use and doc_chunk_chunking_type. For a complete list of parameters, please refer to the README.md for this transform\n", + "##### \n", + "| parameter:type | value | Description |\n", + "| --- | --- | --- |\n", + "|data_files_to_use: list | .parquet | Process all parquet files in the input folder |\n", + "| doc_chunk_chunking_type: str | dl_json | |\n" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required Classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from doc_chunk_transform_python import DocChunkPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "# create parameters\n", + "input_folder = os.path.join(\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.parquet']\"),\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + " \"doc_chunk_chunking_type\": \"dl_json\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "15:19:48 INFO - pipeline id pipeline_id\n", + "15:19:48 INFO - code location None\n", + "15:19:48 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "15:19:48 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:19:48 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint 
['.parquet']\n", + "15:19:48 INFO - orchestrator doc_chunk started at 2024-11-20 15:19:48\n", + "15:19:48 INFO - Number of files is 1, source profile {'max_file_size': 0.011513710021972656, 'min_file_size': 0.011513710021972656, 'total_file_size': 0.011513710021972656}\n", + "15:19:48 INFO - Completed 1 files (100.0%) in 0.001 min\n", + "15:19:48 INFO - Done processing 1 files, waiting for flush() completion.\n", + "15:19:48 INFO - done flushing in 0.0 sec\n", + "15:19:48 INFO - Completed execution in 0.001 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=DocChunkPythonTransformConfiguration())\n", + "launcher.launch()\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/doc_chunk/python/README.md b/transforms/language/doc_chunk/python/README.md index 9abca2b79..1ec3a8080 100644 --- a/transforms/language/doc_chunk/python/README.md +++ b/transforms/language/doc_chunk/python/README.md @@ -1,5 +1,16 @@ # Chunk documents Transform +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Contributors + +- Michele Dolfi (dol@zurich.ibm.com) + +## Description + This transform is chunking documents. It supports multiple _chunker modules_ (see the `chunking_type` parameter). When using documents converted to JSON, the transform leverages the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker` @@ -9,20 +20,26 @@ which provides the required JSON structure. When using documents converted to Markdown, the transform leverages the [Llama Index](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) `MarkdownNodeParser`, which is relying on its internal Markdown splitting logic. -## Output format + +### Input + +| input column name | data type | description | +|-|-|-| +| the one specified in _content_column_name_ configuration | string | the content used in this transform | + + +### Output format The output parquet file will contain all the original columns, but the content will be replaced with the individual chunks. 
-### Tracing the origin of the chunks +#### Tracing the origin of the chunks The transform allows to trace the origin of the chunk with the `source_doc_id` which is set to the value of the `document_id` column (if present) in the input table. The actual name of columns can be customized with the parameters described below. -## Running - -### Parameters +## Configuration The transform can be tuned with the following parameters. @@ -40,6 +57,12 @@ The transform can be tuned with the following parameters. | `output_pageno_column_name` | `page_number` | Column name to store the page number of the chunk in the output table. | | `output_bbox_column_name` | `bbox` | Column name to store the bbox of the chunk in the output table. | + + +## Usage + +### Launched Command Line Options + When invoking the CLI, the parameters must be set as `--doc_chunk_`, e.g. `--doc_chunk_column_name_key=myoutput`. @@ -63,8 +86,32 @@ ls output ``` To see results of the transform. +### Code example + +TBD (link to the notebook will be provided) + +See the sample script [src/doc_chunk_local_python.py](src/doc_chunk_local_python.py). + + ### Transforming data using the transform image To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. + +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](test/test_doc_chunk_python.py) + + +## Further Resource + +- For the [Docling Core](https://github.com/DS4SD/docling-core) `HierarchicalChunker` + - +- For the Markdown chunker in LlamaIndex + - [Markdown chunking](https://docs.llamaindex.ai/en/stable/module_guides/loading/node_parsers/modules/#markdownnodeparser) +- For the Token Text Splitter in LlamaIndex + - [Token Text Splitter](https://docs.llamaindex.ai/en/stable/api_reference/node_parsers/token_text_splitter/) diff --git a/transforms/language/doc_chunk/python/requirements.txt b/transforms/language/doc_chunk/python/requirements.txt index dd076d0e0..c24d0c3e2 100644 --- a/transforms/language/doc_chunk/python/requirements.txt +++ b/transforms/language/doc_chunk/python/requirements.txt @@ -1,3 +1,4 @@ data-prep-toolkit==0.2.2.dev2 docling-core==2.3.0 +pydantic>=2.0.0,<2.10.0 llama-index-core>=0.11.22,<0.12.0 diff --git a/transforms/language/html2parquet/notebooks/html2parquet.ipynb b/transforms/language/html2parquet/notebooks/html2parquet.ipynb new file mode 100644 index 000000000..669a4d30d --- /dev/null +++ b/transforms/language/html2parquet/notebooks/html2parquet.ipynb @@ -0,0 +1,235 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8435e1f7-0c2e-49f4-a77a-b525ee6c532b", + "metadata": {}, + "source": [ + "# Html2Parquet Transform Sample Notebook" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d9420989-ec8a-4fde-9a93-dc25096389f1", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install data-prep-toolkit==0.2.2.dev2\n", + "!pip install 'data-prep-toolkit-transforms[html2parquet]==0.2.2.dev2'\n", + "!pip install pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "20663a67-5aa1-4b61-b989-94201613e41f", + "metadata": {}, + "outputs": [], + "source": [ + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "\n", + "from 
html2parquet_transform_python import Html2ParquetPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "markdown", + "id": "6d85491b-0093-46e7-8653-ca8052ea59f0", + "metadata": {}, + "source": [ + "## Specify input/output folders and parameters" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e75f6922-eb0f-4164-a536-f96393e04604", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "\n", + "# create parameters\n", + "local_conf = {\n", + " \"input_folder\": \"/path/to/your/input/folder\", # For the sample input files, refer to the 'python/test-data/input' folder\n", + " \"output_folder\": \"/path/to/your/output/folder\",\n", + "}\n", + "\n", + "params = {\n", + " # Data access. Only required parameters are specified\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"data_files_to_use\": ast.literal_eval(\"['.zip', '.html']\"),\n", + "}\n" + ] + }, + { + "cell_type": "markdown", + "id": "0dcd1249-1eb8-4b33-9827-626f90c840b4", + "metadata": {}, + "source": [ + "## Invoke the html2parquet transformation" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4d2354db-1bb3-4a71-98df-f0f148af3a02", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "17:09:40 INFO - html2parquet parameters are : {'output_format': , 'favor_precision': , 'favor_recall': }\n", + "17:09:40 INFO - pipeline id pipeline_id\n", + "17:09:40 INFO - code location None\n", + "17:09:40 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", + "17:09:40 INFO - data factory data_ max_files -1, n_sample -1\n", + "17:09:40 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n", + "17:09:40 INFO - orchestrator html2parquet started at 2024-11-13 17:09:40\n", + "17:09:40 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n", + "17:09:47 INFO - Completed 1 files (100.0%) in 0.111 min\n", + "17:09:47 INFO - Done processing 1 files, waiting for flush() completion.\n", + "17:09:47 INFO - done flushing in 0.0 sec\n", + "17:09:47 INFO - Completed execution in 0.111 min, execution result 0\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.argv = ParamsUtils.dict_to_req(d=(params))\n", + "# create launcher\n", + "launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())\n", + "# launch\n", + "return_code = launcher.launch()\n" + ] + }, + { + "cell_type": "markdown", + "id": "3c66468d-703f-427f-a1dd-a758edd334de", + "metadata": {}, + "source": [ + "## Checking the output Parquet file" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e2bee8da-c566-4e45-bca1-354dfd04b0df", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
titledocumentcontentsdocument_idsizedate_acquired
0ai-alliance-index.htmlai-alliance-index.html![](https://images.prismic.io/ai-alliance/Ztf3...f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...3942024-11-13T17:09:40.947095
\n", + "
" + ], + "text/plain": [ + " title document \\\n", + "0 ai-alliance-index.html ai-alliance-index.html \n", + "\n", + " contents \\\n", + "0 ![](https://images.prismic.io/ai-alliance/Ztf3... \n", + "\n", + " document_id size \\\n", + "0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n", + "\n", + " date_acquired \n", + "0 2024-11-13T17:09:40.947095 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyarrow.parquet as pq\n", + "import pandas as pd\n", + "table = pq.read_table('/path/to/your/output/folder/sample.parquet')\n", + "table.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "cde6e37d-c437-490f-8e01-f4f51a123484", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "table.to_pandas()['contents'][0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/language/html2parquet/python/README.md b/transforms/language/html2parquet/python/README.md index 0d25553e1..35e781007 100644 --- a/transforms/language/html2parquet/python/README.md +++ b/transforms/language/html2parquet/python/README.md @@ -1,25 +1,52 @@ -# html2parquet Transform +# HTML to Parquet Transform -This tranforms iterate through zip of HTML files or single HTML files and generates parquet files containing the converted document in string. +--- -The HTML conversion is using the [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html). +## Description -## Output format +This transform iterates through zipped collections of HTML files or single HTML files and generates Parquet files containing the extracted content, leveraging the [Trafilatura library](https://trafilatura.readthedocs.io/en/latest/usage-python.html) for extraction of text, tables, images, and other components. -The output format will contain the following colums +--- + +## Contributors + +- Sungeun An (sungeun.an@ibm.com) +- Syed Zawad (szawad@ibm.com) + +--- + +## Date + +**Last updated:** 10/16/24 +**Update details:** Enhanced table and image extraction features by adding the corresponding Trafilatura parameters. + +--- + +## Input and Output + +### Input +- Accepted Formats: Single HTML files or zipped collections of HTML files. 
+- Sample Input Files: [sample html files](test-data/input) + +### Output +- Format: Parquet files with the following structure: ```jsonc { - "title": "string", // the member filename - "document": "string", // the base of the source archive - "contents": "string", // the content of the HTML + "title": "string", // the member filename + "document": "string", // the base of the source archive + "contents": "string", // the content of the HTML "document_id": "string", // the document id, a hash of `contents` "size": "string", // the size of `contents` "date_acquired": "date", // the date when the transform was executing } ``` + + ## Parameters +### User-Configurable Parameters + The table below provides the parameters that users can adjust to control the behavior of the extraction: | Parameter | Default | Description | @@ -28,6 +55,8 @@ The table below provides the parameters that users can adjust to control the beh | `favor_precision` | `True` | Prefers less content but more accurate extraction. Options: `True`, `False`. | | `favor_recall` | `True` | Extracts more content when uncertain. Options: `True`, `False`. | +### Default Parameters + The table below provides the parameters that are enabled by default to ensure a comprehensive extraction process: | Parameter | Default | Description | @@ -43,6 +72,7 @@ The table below provides the parameters that are enabled by default to ensure a - To prioritize extracting more content over accuracy, set `favor_recall=True` and `favor_precision=False`. - When invoking the CLI, use the following syntax for these parameters: `--html2parquet_`. For example: `--html2parquet_output_format='markdown'`. + ## Example ### Sample HTML @@ -155,3 +185,27 @@ Chicago | ## Contact Us ``` +## Usage + +### Command-Line Interface (CLI) + +Run the transform with the following command: + +``` +python ../html2parquet/python/src/html2parquet_transform_python.py \ + --data_local_config "{'input_folder': '../html2parquet/python/test-data/input', 'output_folder': '../html2parquet/python/test-data/expected'}" \ + --data_files_to_use '[".html", ".zip"]' +``` + +- When invoking the CLI, use the following syntax for these parameters: `--html2parquet_`. For example: `--html2parquet_output_format='markdown'`. + + +### Sample Notebook + +See the [sample notebook](../notebooks/html2parquet.ipynb) +for an example. + + +## Further Resources + +- [Trafilatura](https://trafilatura.readthedocs.io/en/latest/usage-python.html). diff --git a/transforms/language/pdf2parquet/README.md b/transforms/language/pdf2parquet/README.md index 16e318679..f483e9d11 100644 --- a/transforms/language/pdf2parquet/README.md +++ b/transforms/language/pdf2parquet/README.md @@ -1,11 +1,22 @@ -# Ingest PDF to Parquet +# Pdf2Parquet Transform -This tranforms iterate through document files or zip of files and generates parquet files +The Pdf2Parquet transform iterates through PDF, Docx, Pptx, and image files, or zip archives of files, and generates parquet files containing the converted document in Markdown or JSON format. -The PDF conversion is using the [Docling package](https://github.com/DS4SD/docling). -The Docling configuration in DPK is tuned for best results when running large batch ingestions. -For more details on the multiple configuration options, please refer to the official [Docling documentation](https://ds4sd.github.io/docling/). +The conversion is using the [Docling package](https://github.com/DS4SD/docling).
+ +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + + +## Contributors + +- Michele Dolfi (dol@zurich.ibm.com) + + +## Input files This transform supports the following input formats: @@ -17,37 +28,38 @@ This transform supports the following input formats: - Markdown documents - ASCII Docs documents +The input documents can be provided in a folder structure, or as a zip archive. +Please see the configuration section for specifying the input files. + ## Output format -The output format will contain all the columns of the metadata CSV file, -with the addition of the following columns +The output table will contain following columns -```jsonc -{ - "source_filename": "string", // the basename of the source archive or file - "filename": "string", // the basename of the PDF file - "contents": "string", // the content of the PDF - "document_id": "string", // the document id, a random uuid4 - "document_hash": "string", // the document hash of the input content - "ext": "string", // the detected file extension - "hash": "string", // the hash of the `contents` column - "size": "string", // the size of `contents` - "date_acquired": "date", // the date when the transform was executing - "num_pages": "number", // number of pages in the PDF - "num_tables": "number", // number of tables in the PDF - "num_doc_elements": "number", // number of document elements in the PDF - "pdf_convert_time": "float", // time taken to convert the document in seconds -} -``` +| output column name | data type | description | +|-|-|-| +| source_filename | string | the basename of the source archive or file | +| filename | string | the basename of the PDF file | +| contents | string | the content of the PDF | +| document_id | string | the document id, a random uuid4 | +| document_hash | string | the document hash of the input content | +| ext | string | the detected file extension | +| hash | string | the hash of the `contents` column | +| size | string | the size of `contents` | +| date_acquired | date | the date when the transform was executing | +| num_pages | number | number of pages in the PDF | +| num_tables | number | number of tables in the PDF | +| num_doc_elements | number | number of document elements in the PDF | +| pdf_convert_time | float | time taken to convert the document in seconds | -## Parameters +## Configuration The transform can be initialized with the following parameters. | Parameter | Default | Description | |------------|----------|--------------| +| `data_files_to_use` | - | The files extensions to be considered when running the transform. Example value `['.pdf','.docx','.pptx','.zip']`. For all the supported input formats, see the section above. | | `batch_size` | -1 | Number of documents to be saved in the same result table. A value of -1 will generate one result file for each input file. | | `artifacts_path` | | Path where to Docling models artifacts are located, if unset they will be downloaded and fetched from the [HF_HUB_CACHE](https://huggingface.co/docs/huggingface_hub/en/guides/manage-cache) folder. | | `contents_type` | `text/markdown` | The output type for the `contents` column. Valid types are `text/markdown`, `text/plain` and `application/json`. | @@ -58,14 +70,123 @@ The transform can be initialized with the following parameters. | `pdf_backend` | `dlparse_v2` | The PDF backend to use. 
Valid values are `dlparse_v2`, `dlparse_v1`, `pypdfium2`. | | `double_precision` | `8` | If set, all floating points (e.g. bounding boxes) are rounded to this precision. For tests it is advised to use 0. | + +Example + +```py +{ + "data_files_to_use": ast.literal_eval("['.pdf','.docx','.pptx','.zip']"), + "contents_type": "application/json", + "do_ocr": True, +} +``` + + +## Usage + +### Launched Command Line Options + When invoking the CLI, the parameters must be set as `--pdf2parquet_`, e.g. `--pdf2parquet_do_ocr=true`. -# PDF2PARQUET Ray Transform +### Running the samples +To run the samples, use the following `make` targets + +* `run-cli-sample` - runs src/pdf2parquet_transform_python.py using command line args +* `run-local-sample` - runs src/pdf2parquet_local.py +* `run-local-python-sample` - runs src/pdf2parquet_local_python.py + +These targets will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-local-python-sample +... +``` +Then +```shell +ls output +``` +To see results of the transform. + + +### Code example + +TBD (link to the notebook will be provided) + +See the sample script [src/pdf2parquet_local_python.py](src/pdf2parquet_local_python.py). + + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py) +- [Integration test](transforms/language/pdf2parquet/python/test/test_pdf2parquet.py) + + + + +# Pdf2parquet Ray Transform This module implements the ray version of the [pdf2parquet transform](../python/). +## Summary +This project wraps the [Ingest PDF to Parquet transform](../python) with a Ray runtime. + + +## Configuration and Command Line Options + +Ingest PDF to Parquet configuration and command line options are the same as for the base python transform. + + +## Running + +### Launched Command Line Options +When running the transform with the Ray launcher (i.e., TransformLauncher), +in addition to the options available to the transform as defined [here](../python/README.md), +the set of +[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) options is available. + +### Running the samples +To run the samples, use the following `make` targets + +* `run-cli-sample` - runs src/pdf2parquet_transform_ray.py using command line args +* `run-local-sample` - runs src/pdf2parquet_local_ray.py +* `run-s3-sample` - runs src/pdf2parquet_s3_ray.py + * Requires prior invocation of `make minio-start` to load data into local minio for S3 access. + +These targets will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-cli-sample +... +``` +Then +```shell +ls output +``` +To see results of the transform.
+ + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + + ## Prometheus metrics The transform will produce the following statsd metrics: @@ -78,7 +199,7 @@ The transform will produce the following statsd metrics: | worker_pdf_convert_time | Time spent converting a single document | -## Credits +# Credits The PDF document conversion is developed by the AI for Knowledge group in IBM Research Zurich. The main package is [Docling](https://github.com/DS4SD/docling). diff --git a/transforms/language/pdf2parquet/notebooks/input/redp5110-ch1.pdf b/transforms/language/pdf2parquet/notebooks/input/redp5110-ch1.pdf deleted file mode 100644 index ea85731a2..000000000 Binary files a/transforms/language/pdf2parquet/notebooks/input/redp5110-ch1.pdf and /dev/null differ diff --git a/transforms/language/pdf2parquet/notebooks/pdf2parquet.ipynb b/transforms/language/pdf2parquet/pdf2parquet.ipynb similarity index 80% rename from transforms/language/pdf2parquet/notebooks/pdf2parquet.ipynb rename to transforms/language/pdf2parquet/pdf2parquet.ipynb index d330114de..0bef7fcf6 100644 --- a/transforms/language/pdf2parquet/notebooks/pdf2parquet.ipynb +++ b/transforms/language/pdf2parquet/pdf2parquet.ipynb @@ -12,7 +12,8 @@ "%%capture\n", "#!pip install data-prep-toolkit[ray]==0.2.2.dev2\n", "#!pip install 'data-prep-toolkit-transforms[pdf2parquet]'\n", - "!pip install pandas" + "!pip install pandas\n", + "!pip install -U ipywidgets" ] }, { @@ -39,24 +40,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "18:53:57 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", - "18:53:57 INFO - pipeline id pipeline_id\n", - "18:53:57 INFO - code location None\n", - "18:53:57 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", - "18:53:57 INFO - actor creation delay 0\n", - "18:53:57 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", - "18:53:57 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n", - "18:53:57 INFO - data factory data_ max_files -1, n_sample -1\n", - "18:53:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", - "18:53:57 INFO - Running locally\n", - "2024-11-19 18:54:02,814\tINFO worker.py:1777 -- Started a local Ray instance. 
View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", - "18:55:07 INFO - Completed execution in 1.173 min, execution result 0\n" + "08:48:36 INFO - pdf2parquet parameters are : {'batch_size': -1, 'artifacts_path': None, 'contents_type': , 'do_table_structure': True, 'do_ocr': True, 'ocr_engine': , 'bitmap_area_threshold': 0.05, 'pdf_backend': , 'double_precision': 8}\n", + "08:48:36 INFO - pipeline id pipeline_id\n", + "08:48:36 INFO - code location None\n", + "08:48:36 INFO - number of workers 2 worker options {'num_cpus': 1, 'memory': 2147483648, 'max_restarts': -1}\n", + "08:48:36 INFO - actor creation delay 0\n", + "08:48:36 INFO - job details {'job category': 'preprocessing', 'job name': 'pdf2parquet', 'job type': 'ray', 'job id': 'job_id'}\n", + "08:48:36 INFO - data factory data_ is using local data access: input_folder - test-data/input output_folder - output\n", + "08:48:36 INFO - data factory data_ max_files -1, n_sample -1\n", + "08:48:36 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.pdf'], files to checkpoint ['.parquet']\n", + "08:48:36 INFO - Running locally\n", + "2024-11-24 08:48:38,466\tINFO worker.py:1777 -- Started a local Ray instance. View the dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265 \u001b[39m\u001b[22m\n", + "08:49:36 INFO - Completed execution in 1.011 min, execution result 0\n" ] } ], "source": [ "%%capture\n", - "x=Pdf2ParquetRuntime(input_folder= \"input\", \n", + "x=Pdf2ParquetRuntime(input_folder= \"test-data/input\", \n", " output_folder= \"output\", \n", " data_files_to_use=['.pdf'],\n", " run_locally= True,\n", @@ -116,13 +117,13 @@ " 5\n", " 0\n", " 48\n", - " 95004329-0741-4cb2-93bf-f4f4b31e2dd3\n", + " 058d5c86-4d02-434e-9c45-b431d6171e09\n", " 74198560999363607\n", " pdf\n", - " b3d1d0ff2295cb06e5b91364c72a01e11544b5bda52614...\n", - " 35182\n", - " 2024-11-19T18:54:57.646155\n", - " 5.723686\n", + " 63938f3cfc14037a64a9009e233fe982da6981002035c7...\n", + " 35186\n", + " 2024-11-24T08:49:26.795515\n", + " 22.769895\n", " redp5110-ch1.pdf\n", " \n", " \n", @@ -137,13 +138,13 @@ "0 5 0 48 \n", "\n", " document_id document_hash ext \\\n", - "0 95004329-0741-4cb2-93bf-f4f4b31e2dd3 74198560999363607 pdf \n", + "0 058d5c86-4d02-434e-9c45-b431d6171e09 74198560999363607 pdf \n", "\n", " hash size \\\n", - "0 b3d1d0ff2295cb06e5b91364c72a01e11544b5bda52614... 35182 \n", + "0 63938f3cfc14037a64a9009e233fe982da6981002035c7... 35186 \n", "\n", " date_acquired pdf_convert_time source_filename \n", - "0 2024-11-19T18:54:57.646155 5.723686 redp5110-ch1.pdf " + "0 2024-11-24T08:49:26.795515 22.769895 redp5110-ch1.pdf " ] }, "execution_count": 4, diff --git a/transforms/language/text_encoder/python/README.md b/transforms/language/text_encoder/python/README.md index 4c927d1ed..fa9c54ada 100644 --- a/transforms/language/text_encoder/python/README.md +++ b/transforms/language/text_encoder/python/README.md @@ -1,14 +1,36 @@ # Text Encoder Transform -## Summary +Please see the set of +[transform project conventions](../../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Contributors + +- Michele Dolfi (dol@zurich.ibm.com) + +## Description + This transform is using [sentence encoder models](https://en.wikipedia.org/wiki/Sentence_embedding) to create embedding vectors of the text in each row of the input .parquet table. 
The embeddings vectors generated by the transform are useful for tasks like sentence similarity, features extraction, etc which are also at the core of retrieval-augmented generation (RAG) applications. +### Input + +| input column name | data type | description | +|-|-|-| +| the one specified in _content_column_name_ configuration | string | the content used in this transform | + + +### Output columns + + +| output column name | data type | description | +|-|-|-| +| the one specified in _output_embeddings_column_name_ configuration | `array[float]` | the embeddings vectors of the content | -## Running -### Parameters +## Configuration The transform can be tuned with the following parameters. @@ -18,7 +40,11 @@ The transform can be tuned with the following parameters. | `model_name` | `BAAI/bge-small-en-v1.5` | The HF model to use for encoding the text. | | `content_column_name` | `contents` | Name of the column containing the text to be encoded. | | `output_embeddings_column_name` | `embeddings` | Column name to store the embeddings in the output table. | -| `output_path_column_name` | `doc_path` | Column name to store the document path of the chunk in the output table. | + + +## Usage + +### Launched Command Line Options When invoking the CLI, the parameters must be set as `--text_encoder_`, e.g. `--text_encoder_column_name_key=myoutput`. @@ -43,8 +69,20 @@ ls output ``` To see results of the transform. +### Code example + +TBD (link to the notebook will be provided) + + ### Transforming data using the transform image To use the transform image to transform your data, please refer to the [running images quickstart](../../../../doc/quick-start/run-transform-image.md), substituting the name of this transform image and runtime as appropriate. + +## Testing + +Following [the testing strategy of data-processing-lib](../../../../data-processing-lib/doc/transform-testing.md) + +Currently we have: +- [Unit test](test/test_text_encoder_python.py) \ No newline at end of file diff --git a/transforms/language/text_encoder/text_encoder.ipynb b/transforms/language/text_encoder/text_encoder.ipynb new file mode 100644 index 000000000..aca309594 --- /dev/null +++ b/transforms/language/text_encoder/text_encoder.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + "source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "#!pip install data-prep-toolkit\n", + "#!pip install data-prep-toolkit-transforms\n", + "#!pip install data-prep-connector" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. For this notebook, we use all the default parameters. 
For a complete list of parameters, please refer to the README.md for this transform.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c2a12abc-9460-4e45-8961-873b48a9ab19", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "\n", + "from data_processing.runtime.pure_python import PythonTransformLauncher\n", + "from data_processing.utils import ParamsUtils\n", + "from text_encoder_transform_python import TextEncoderPythonTransformConfiguration\n" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e90a853e-412f-45d7-af3d-959e755aeebb", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "input_folder = os.path.join (\"python\", \"test-data\", \"input\")\n", + "output_folder = os.path.join( \"python\", \"output\")\n", + "local_conf = {\n", + " \"input_folder\": input_folder,\n", + " \"output_folder\": output_folder,\n", + "}\n", + "params = {\n", + " \"data_local_config\": ParamsUtils.convert_to_ast(local_conf),\n", + " \"runtime_pipeline_id\": \"pipeline_id\",\n", + " \"runtime_job_id\": \"job_id\",\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "7949f66a-d207-45ef-9ad7-ad9406f8d42a", + "metadata": {}, + "source": [ + "##### ***** Use python runtime to invoke the transform" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0775e400-7469-49a6-8998-bd4772931459", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "15:44:57 INFO - pipeline id pipeline_id\n", + "15:44:57 INFO - code location None\n", + "15:44:57 INFO - data factory data_ is using local data access: input_folder - python/test-data/input output_folder - python/output\n", + "15:44:57 INFO - data factory data_ max_files -1, n_sample -1\n", + "15:44:57 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "15:44:57 INFO - orchestrator text_encoder started at 2024-11-20 15:44:57\n", + "15:44:57 INFO - Number of files is 1, source profile {'max_file_size': 0.0010089874267578125, 'min_file_size': 0.0010089874267578125, 'total_file_size': 0.0010089874267578125}\n", + "15:44:58 INFO - Completed 1 files (100.0%) in 0.003 min\n", + "15:44:58 INFO - Done processing 1 files, waiting for flush() completion.\n", + "15:44:58 INFO - done flushing in 0.0 sec\n", + "15:44:58 INFO - Completed execution in 0.017 min, execution result 0\n" + ] + } + ], + "source": [ + "%%capture\n", + "sys.argv = ParamsUtils.dict_to_req(d=params)\n", + "launcher = PythonTransformLauncher(runtime_config=TextEncoderPythonTransformConfiguration())\n", + "launcher.launch()\n" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['python/output/metadata.json', 'python/output/test1.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"python/output/*\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/ededup/README.md b/transforms/universal/ededup/README.md index 9a112e816..0390cc19c 100644 --- a/transforms/universal/ededup/README.md +++ b/transforms/universal/ededup/README.md @@ -1,4 +1,4 @@ -# Exect Deduplification Transform +# Exact Deduplication Transform ## Summary diff --git a/transforms/universal/web2parquet/README.md b/transforms/universal/web2parquet/README.md index 1841403a7..1a8ecb408 100644 --- a/transforms/universal/web2parquet/README.md +++ b/transforms/universal/web2parquet/README.md @@ -21,16 +21,24 @@ For configuring the crawl, users need to specify the following parameters: The transform can be installed directly from pypi and has a dependency on the data-prep-toolkit and the data-prep-connector +Set up the local environment to run the Jupyter notebook: +``` +python -m venv venv +source venv/bin/activate +pip install jupyterlab +``` +Install prerequisites: + ``` pip install data-prep-connector pip install data-prep-toolkit>=0.2.2.dev2 -pip install data-prep-toolkit-transform[web2parquet]>=0.2.2.dev3 +pip install 'data-prep-toolkit-transforms[web2parquet]>=0.2.2.dev3' ``` If working from a fork in the git repo, from the root folder of the git repo, do the following: ``` -cd transform/universal/web2parquet +cd transforms/universal/web2parquet make venv source venv/bin/activate pip install -r requirements.txt @@ -49,4 +57,4 @@ Web2Parquet(urls= ['https://thealliance.ai/'], depth=2, downloads=10, folder='downloads').transform() -```` \ No newline at end of file +```` diff --git a/transforms/universal/web2parquet/web2parquet.ipynb b/transforms/universal/web2parquet/web2parquet.ipynb index 2bd55f0bc..ea802d734 100644 --- a/transforms/universal/web2parquet/web2parquet.ipynb +++ b/transforms/universal/web2parquet/web2parquet.ipynb @@ -5,12 +5,12 @@ "id": "afd55886-5f5b-4794-838e-ef8179fb0394", "metadata": {}, "source": [ - "##### **** These pip install need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release\n", + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, the venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "##### \n", "\n", - "##### **** example: \n", "```\n", - "python -m venv && source venv/bin/activate\n", - "pip install -r requirements.txt\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" ]
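Editor's note on the text_encoder README changes above: the new "Code example" section is still marked TBD. A minimal sketch of driving the transform from Python, mirroring the `text_encoder.ipynb` notebook added in this PR, could look like the following. The `text_encoder_model_name` override is illustrative only; it assumes the `text_encoder_` parameter prefix described in the README also applies to launcher arguments built with `ParamsUtils.dict_to_req`.

```python
# Sketch based on the notebook added in this PR; run from transforms/language/text_encoder
# with data-prep-toolkit and the text_encoder transform installed in the venv.
import os
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from text_encoder_transform_python import TextEncoderPythonTransformConfiguration

local_conf = {
    "input_folder": os.path.join("python", "test-data", "input"),
    "output_folder": os.path.join("python", "output"),
}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "runtime_pipeline_id": "pipeline_id",
    "runtime_job_id": "job_id",
    # Illustrative override; transform parameters use the text_encoder_ prefix per the README.
    "text_encoder_model_name": "BAAI/bge-small-en-v1.5",
}

# dict_to_req converts the dict into the CLI-style arguments the launcher expects.
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = PythonTransformLauncher(runtime_config=TextEncoderPythonTransformConfiguration())
launcher.launch()
```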
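Similarly, the web2parquet README hunk above ends with a bare `Web2Parquet(...).transform()` call. A slightly fuller sketch is given below; the import path is an assumption (it is not shown in this diff and may differ in the released `data-prep-toolkit-transforms[web2parquet]` package), while the constructor arguments are the ones shown in the README.

```python
# Sketch only: the module path below is assumed, not confirmed by this diff.
from dpk_web2parquet.transform import Web2Parquet

# Crawl the given seed URL(s) and write the downloaded pages as parquet files;
# see the README's parameter table for the exact meaning of each argument.
Web2Parquet(
    urls=["https://thealliance.ai/"],  # seed URL(s) for the crawl
    depth=2,                           # crawl depth
    downloads=10,                      # download limit
    folder="downloads",                # output folder
).transform()
```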