Merge branch 'dev' into ian-cho-patch-1

IBM · Dec 3, 2024 · 7102f05 · 7102f05
2 parents 2a374d1 + 19600d9
commit 7102f05
Show file tree

Hide file tree

Showing 386 changed files with 8,176 additions and 2,402 deletions.
diff --git a/.make.versions b/.make.versions
@@ -16,10 +16,10 @@ DPK_MAJOR_VERSION=0
 # The minor version is incremented manually when significant features have been added that are backward compatible with the previous major.minor release.
 DPK_MINOR_VERSION=2
 # The micro version is incremented AUTOMATICALLY by the release.sh script when a new release is set.
-DPK_MICRO_VERSION=2
+DPK_MICRO_VERSION=3
 # The suffix is generally always set in the main/development branch and only nulled out when creating release branches.
 # It can be manually incremented, for example, to allow publishing a new intermediate version wheel to pypi. 
-DPK_VERSION_SUFFIX=.dev2
+DPK_VERSION_SUFFIX=.dev0
 
 DPK_VERSION=$(DPK_MAJOR_VERSION).$(DPK_MINOR_VERSION).$(DPK_MICRO_VERSION)$(DPK_VERSION_SUFFIX)
 
@@ -39,7 +39,7 @@ DPK_LIB_KFP_SHARED=$(DPK_VERSION)
 KFP_DOCKER_VERSION=$(DOCKER_IMAGE_VERSION)
 KFP_DOCKER_VERSION_v2=$(DOCKER_IMAGE_VERSION)
 
-DPK_CONNECTOR_VERSION=0.2.3.dev0
+DPK_CONNECTOR_VERSION=0.2.4.dev0
 
 ################## ################## ################## ################## ################## ##################
 # Begin versions that the repo depends on. 
@@ -59,3 +59,11 @@ else
         WORKFLOW_SUPPORT_LIB=kfp_v1_workflow_support
 endif
 
+################################################################################
+# This defines the transforms' package version number as would be used
+# when publishing the wheel.  In general, only the micro version
+# number should be advanced relative to the DPK_VERSION. 
+#
+# If you change the version numbers, be sure to run "make set-versions" to
+# update version numbers across the transforms (e.g., pyproject.toml).
+TRANSFORMS_PKG_VERSION=0.2.3.dev0
diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_connector"
-version = "0.2.3.dev1"
+version = "0.2.4.dev0"
 requires-python = ">=3.10,<3.13"
 keywords = [
     "data",

diff --git a/data-processing-lib/pyproject.toml b/data-processing-lib/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 keywords = ["data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Toolkit Library for Ray and Python"
@@ -16,7 +16,6 @@ dynamic = ["dependencies", "optional-dependencies"]
 Repository = "https://github.com/IBM/data-prep-kit"
 Issues = "https://github.com/IBM/data-prep-kit/issues"
 Documentation = "https://ibm.github.io/data-prep-kit/doc"
-"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/noop"
 
 [build-system]
 requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]

diff --git a/data-processing-lib/python/requirements.txt b/data-processing-lib/python/requirements.txt
@@ -4,3 +4,4 @@
   argparse
   mmh3
   psutil
+  polars>=1.9.0
diff --git a/data-processing-lib/python/src/data_processing/utils/transform_utils.py b/data-processing-lib/python/src/data_processing/utils/transform_utils.py
@@ -11,6 +11,7 @@
 ################################################################################
 
 import hashlib
+import io
 import os
 import string
 import sys
@@ -144,8 +145,21 @@ def convert_binary_to_arrow(data: bytes, schema: pa.schema = None) -> pa.Table:
             table = pq.read_table(reader, schema=schema)
             return table
         except Exception as e:
-            logger.error(f"Failed to convert byte array to arrow table, exception {e}. Skipping it")
-            return None
+            logger.warning(f"Could not convert bytes to pyarrow: {e}")
+
+        # We have seen this exception before when using pyarrow, but polars does not throw it.
+        # "Nested data conversions not implemented for chunked array outputs"
+        # See issue 816 https://github.com/IBM/data-prep-kit/issues/816.
+        logger.info(f"Attempting read of pyarrow Table using polars")
+        try:
+            import polars
+
+            df = polars.read_parquet(io.BytesIO(data))
+            table = df.to_arrow()
+        except Exception as e:
+            logger.error(f"Could not convert bytes to pyarrow using polars: {e}. Skipping.")
+            table = None
+        return table
 
     @staticmethod
     def convert_arrow_to_binary(table: pa.Table) -> bytes:

diff --git a/data-processing-lib/spark/Makefile b/data-processing-lib/spark/Makefile
@@ -11,9 +11,14 @@ setup::
 
 set-versions: .check-env
 	$(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml
-	sed -e 's/"pyspark...*",/"pyspark>=${SPARK_VERSION}",/'				\
-	    pyproject.toml > tt.toml
-	mv tt.toml pyproject.toml
+# Pin pyspark to $(SPARK_VERSION) in whichever dependency file exists. sed writes to a temp
+# file that replaces the original only when sed succeeds (&&), so a failure aborts the recipe.
+	if [ -e pyproject.toml ]; then \
+		sed -e 's/"pyspark...*",/"pyspark>=$(SPARK_VERSION)",/' pyproject.toml > tt.toml && mv tt.toml pyproject.toml; \
+	fi
+	if [ -e requirements.txt ]; then \
+		sed -e 's/^\([[:space:]]*\)pyspark[=<>~!].*/\1pyspark>=$(SPARK_VERSION)/' requirements.txt > tt.txt && mv tt.txt requirements.txt; \
+	fi
 
 build:: build-dist 
 
@@ -26,7 +31,7 @@ publish-dist :: .check-env .defaults.publish-dist
 
 publish-image:: .defaults.publish-image
 
-venv::  pyproject.toml
+venv::
 	$(MAKE) .defaults.spark-lib-src-venv
 	pip install pytest pytest-cov 
 

diff --git a/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v1_workflow_support/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_kfp_v1"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Kit Library. KFP support"
 license = {text = "Apache-2.0"}
@@ -13,7 +13,7 @@ authors = [
 ]
 dependencies = [
     "kfp==1.8.22",
-    "data-prep-toolkit-kfp-shared==0.2.2.dev2",
+    "data-prep-toolkit-kfp-shared==0.2.3.dev0",
 ]
 
 [build-system]

diff --git a/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml b/kfp/kfp_support_lib/kfp_v2_workflow_support/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_kfp_v2"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Kit Library. KFP support"
 license = {text = "Apache-2.0"}
@@ -14,7 +14,7 @@ authors = [
 dependencies = [
     "kfp==2.8.0",
     "kfp-kubernetes==1.2.0",
-    "data-prep-toolkit-kfp-shared==0.2.2.dev2",
+    "data-prep-toolkit-kfp-shared==0.2.3.dev0",
 ]
 
 [build-system]

diff --git a/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml b/kfp/kfp_support_lib/shared_workflow_support/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "data_prep_toolkit_kfp_shared"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "Data Preparation Kit Library. KFP support"
 license = {text = "Apache-2.0"}
@@ -14,7 +14,7 @@ authors = [
 dependencies = [
     "requests",
     "kubernetes",
-    "data-prep-toolkit[ray]==0.2.2.dev2",
+    "data-prep-toolkit[ray]==0.2.3.dev0",
 ]
 
 [build-system]

diff --git a/release-notes.md b/release-notes.md
@@ -1,5 +1,42 @@
 # Data Prep Kit Release notes
 
+## Release 0.2.2 - 11/25/2024
+
+### General 
+1. Update RAG example to use granite model 
+1. Updated transforms with Docling 2
+1. Added single package for dpk with extra for \[spark\] and \[ray\]
+1. Added single package for transforms with extra for \[all\] or \[individual-transform-name\]
+
+
+### data-prep-toolkit libraries (python, ray, spark) 
+
+1. Fix metadata logging even when actors crash 
+1. Add multilock for ray workers downloads/cleanup
+1. Multiple updates to spark runtime
+1. Added support for python 3.12
+1. refactoring of data access code
+
+
+### KFP Workloads 
+
+1. Modify superpipeline params type Str/json
+1. Set kuberay apiserver version 
+1. Add Super pipeline for code transforms
+
+
+### Transforms
+
+1. Enhance pdf2parquet with docling2 support for extracting HTML, DOCS, etc.
+1. Added web2parquet transform
+1. Added HAP transform
+
+### HTTP Connector 0.2.3
+
+1. Enhanced parameter/configuration allows the user to customize crawler settings 
+1. implement subdomain focus feature in data-prep-connector 
+
+
 ## Release 0.2.2- HTTP Connector Module - 10/23/2024
 
 ### General 

diff --git a/resources.md b/resources.md
@@ -1,3 +1,8 @@
+# New Features & Enhancements
+
+- Support for Docling 2.0 added to DPK in the [pdf2parquet](https://github.com/IBM/data-prep-kit/tree/dev/transforms/language/pdf2parquet/python) transform. The new updates allow DPK users to ingest other types of documents, e.g. MS Word, MS PowerPoint, images, Markdown, AsciiDoc, etc.
+- Released [Web2parquet](https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/web2parquet) transform for crawling the web.
+
 # Data Prep Kit Resources
 
 ## 📄 Papers
@@ -7,24 +12,43 @@
 3. [Scaling Granite Code Models to 128K Context](https://arxiv.org/abs/2407.13739)
 
 
-## 🎤 Talks
+## 🎤 External Events and Showcase
 
 1. **"Building Successful LLM Apps: The Power of high quality data"** - [Video](https://www.youtube.com/watch?v=u_2uiZBBVIE)  |   [Slides](https://www.slideshare.net/slideshow/data_prep_techniques_challenges_methods-pdf-a190/271527890)
 2. **"Hands on session for fine tuning LLMs"** - [Video](https://www.youtube.com/watch?v=VEHIA3E64DM)
 3. **"Build your own data preparation module using data-prep-kit"** - [Video](https://www.youtube.com/watch?v=0WUMG6HIgMg)
 4. **"Data Prep Kit: A Comprehensive Cloud-Native Toolkit for Scalable Data Preparation in GenAI App"** - [Video](https://www.youtube.com/watch?v=WJ147TGULwo) | [Slides](https://ossaidevjapan24.sched.com/event/1jKBm)
+5. **"RAG with Data Prep Kit" Workshop** @ Mountain View, CA, USA - [info](https://github.com/sujee/data-prep-kit-examples/blob/main/events/2024-09-21__RAG-workshop-data-riders.md)
+6. **Tech Educator summit** [IBM CSR Event](https://www.linkedin.com/posts/aanchalaggarwal_github-ibmdata-prep-kit-open-source-project-activity-7254062098295472128-OA_x?utm_source=share&utm_medium=member_desktop)
+7. **Talk and Hands on session** at [MIT Bangalore](https://www.linkedin.com/posts/saptha-surendran-71a4a0ab_ibmresearch-dataprepkit-llms-activity-7261987741087801346-h0no?utm_source=share&utm_medium=member_desktop)
+8. **PyData NYC 2024** - [90 mins Tutorial](https://nyc2024.pydata.org/cfp/talk/AWLTZP/)
+9. **Open Source AI** [Demo Night](https://lu.ma/oss-ai?tk=A8BgIt)
+10. [**Data Exchange Podcast with Ben Lorica**](https://thedataexchange.media/ibm-data-prep-kit/)
+11. Unstructured Data Meetup - SF, NYC, Silicon Valley
+12. IBM TechXchange Las Vegas
+13. Open Source [**RAG Pipeline workshop**](https://www.linkedin.com/posts/sujeemaniyam_dataprepkit-workshop-llm-activity-7256176802383986688-2UKc?utm_source=share&utm_medium=member_desktop) with Data Prep Kit at TechEquity's AI Summit in Silicon Valley
+14. **Data Science Dojo Meetup** - [video](https://datasciencedojo.com/tutorial/data-preparation-toolkit/)
+15. [**DPK tutorial and hands on session at IIIT Delhi**](https://www.linkedin.com/posts/cai-iiitd-97a6a4232_datascience-datapipelines-machinelearning-activity-7263121565125349376-FG8E?utm_source=share&utm_medium=member_desktop)
+
 
 ## Example Code
+Find example code in the README of each transform, and sample Jupyter notebooks for getting started [**here**](examples/notebooks).
 
 ## Blogs / Tutorials
 
 - [**IBM Developer Blog**](https://developer.ibm.com/blogs/awb-unleash-potential-llms-data-prep-kit/) 
+- [**Introductory Blog on DPK**](https://www.linkedin.com/pulse/unleashing-potential-large-language-models-through-data-aanchal-goyal-fgtff)
+- [**DPK Header Cleanser Module Blog by external contributor**](https://www.linkedin.com/pulse/enhancing-data-quality-developing-header-cleansing-tool-kalathiya-i1ohc/?trackingId=6iAeBkBBRrOLijg3LTzIGA%3D%3D) 
+
 
-## Workshops
+# Relevant online communities
 
-- **2024-09-21: "RAG with Data Prep Kit" Workshop** @ Mountain View, CA, USA - [info](https://github.com/sujee/data-prep-kit-examples/blob/main/events/2024-09-21__RAG-workshop-data-riders.md)
+- [**Data Prep Kit Discord Channel**](https://discord.com/channels/1276554812359442504/1303454647427661866)
+- [**DPK is now listed in Github Awesome-LLM under LLM Data section**](https://github.com/Hannibal046/Awesome-LLM)
+- [**DPK is now up for access via IBM Skills Build Download**](https://academic.ibm.com/a2mt/downloads/artificial_intelligence#/)
+- [**DPK added to the Application Hub of “AI Sustainability Catalog”**](https://enterprise-neurosystem.github.io/Sustainability-Catalog/)
 
-## Discord
+## We Want Your Feedback!
+ Feel free to contribute to discussions or create a new one to share your [feedback](https://github.com/IBM/data-prep-kit/discussions)
 
-- [**Data Prep Kit Discord Channel**](https://discord.com/channels/1276554812359442504/1286046139921207476)
 
diff --git a/scripts/check-workflows.sh b/scripts/check-workflows.sh
@@ -17,7 +17,7 @@ if [ ! -d transforms ]; then
     echo Please run this script from the top of the repository
     exit 1
 fi
-KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser"
+KFP_BLACK_LIST="doc_chunk pdf2parquet pii_redactor text_encoder license_select repo_level_ordering header_cleanser fdedup"
 while [ $# -ne 0 ]; do
    case $1 in
         -show-kfp-black-list)    echo $KFP_BLACK_LIST; exit 0;

diff --git a/transforms/code/code2parquet/python/pyproject.toml b/transforms/code/code2parquet/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_code2parquet_transform_python"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "code2parquet Python Transform"
 license = {text = "Apache-2.0"}

diff --git a/transforms/code/code2parquet/python/requirements.txt b/transforms/code/code2parquet/python/requirements.txt
@@ -1,3 +1,3 @@
-data-prep-toolkit==0.2.2.dev2
+data-prep-toolkit==0.2.3.dev0
 parameterized
 pandas
diff --git a/transforms/code/code2parquet/ray/pyproject.toml b/transforms/code/code2parquet/ray/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_code2parquet_transform_ray"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "code2parquet Ray Transform"
 license = {text = "Apache-2.0"}
@@ -10,8 +10,8 @@ authors = [
     { name = "Boris Lublinsky", email = "[email protected]" },
 ]
 dependencies = [
-    "data-prep-toolkit[ray]==0.2.2.dev2",
-    "dpk-code2parquet-transform-python==0.2.2.dev2",
+    "data-prep-toolkit[ray]==0.2.3.dev0",
+    "dpk-code2parquet-transform-python==0.2.3.dev0",
     "parameterized",
     "pandas",
 ]

diff --git a/transforms/code/code_profiler/python/pyproject.toml b/transforms/code/code_profiler/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_code_profiler_transform_python"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "Code Profiler Python Transform"
 license = {text = "Apache-2.0"}

diff --git a/transforms/code/code_profiler/python/requirements.txt b/transforms/code/code_profiler/python/requirements.txt
@@ -1,4 +1,4 @@
-data-prep-toolkit==0.2.2.dev2
+data-prep-toolkit==0.2.3.dev0
 parameterized
 pandas
 aiolimiter==1.1.0

diff --git a/transforms/code/code_profiler/ray/pyproject.toml b/transforms/code/code_profiler/ray/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_code_profiler_transform_ray"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "Code Profiler Ray Transform"
 license = {text = "Apache-2.0"}
@@ -9,8 +9,8 @@ authors = [
     { name = "Pankaj Thorat", email = "[email protected]" },
 ]
 dependencies = [
-	"dpk-code-profiler-transform-python==0.2.2.dev2",
-    "data-prep-toolkit[ray]==0.2.2.dev2",
+	"dpk-code-profiler-transform-python==0.2.3.dev0",
+    "data-prep-toolkit[ray]==0.2.3.dev0",
 	]
 
 [build-system]

diff --git a/transforms/code/code_quality/python/pyproject.toml b/transforms/code/code_quality/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_code_quality_transform_python"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "Code Quality Python Transform"
 license = {text = "Apache-2.0"}

diff --git a/transforms/code/code_quality/python/requirements.txt b/transforms/code/code_quality/python/requirements.txt
@@ -1,3 +1,3 @@
-data-prep-toolkit==0.2.2.dev2
+data-prep-toolkit==0.2.3.dev0
 bs4==0.0.2
 transformers==4.38.2
diff --git a/transforms/code/code_quality/ray/pyproject.toml b/transforms/code/code_quality/ray/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_code_quality_transform_ray"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "Code Quality Ray Transform"
 license = {text = "Apache-2.0"}
@@ -9,8 +9,8 @@ authors = [
     { name = "Shivdeep Singh", email = "[email protected]" },
 ]
 dependencies = [
-    "dpk-code-quality-transform-python==0.2.2.dev2",
-    "data-prep-toolkit[ray]==0.2.2.dev2",
+    "dpk-code-quality-transform-python==0.2.3.dev0",
+    "data-prep-toolkit[ray]==0.2.3.dev0",
 ]
 
 [build-system]

diff --git a/transforms/code/header_cleanser/python/pyproject.toml b/transforms/code/header_cleanser/python/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dpk_header_cleanser_transform_python"
-version = "0.2.2.dev2"
+version = "0.2.3.dev0"
 requires-python = ">=3.10,<3.13"
 description = "License and Copyright Removal Transform for Python"
 license = {text = "Apache-2.0"}

diff --git a/transforms/code/header_cleanser/python/requirements.txt b/transforms/code/header_cleanser/python/requirements.txt
@@ -1,3 +1,3 @@
-data-prep-toolkit==0.2.2.dev2
+data-prep-toolkit==0.2.3.dev0
 scancode-toolkit==32.1.0 ; platform_system != 'Darwin'
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,4 @@ @@
       argparse
       mmh3
       psutil
+      polars>=1.9.0