diff --git a/transforms/Makefile.transform.template b/transforms/Makefile.transform.template new file mode 100644 index 000000000..72feb87cf --- /dev/null +++ b/transforms/Makefile.transform.template @@ -0,0 +1,16 @@ +REPOROOT=../../.. +# Use make help, to see the available rules +include $(REPOROOT)/transforms/.make.cicd.targets + +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) + +################################################################################ + + diff --git a/transforms/universal/tokenization/python/Dockerfile b/transforms/universal/tokenization/Dockerfile.python similarity index 72% rename from transforms/universal/tokenization/python/Dockerfile rename to transforms/universal/tokenization/Dockerfile.python index 9f2c9dc38..35552e198 100644 --- a/transforms/universal/tokenization/python/Dockerfile +++ b/transforms/universal/tokenization/Dockerfile.python @@ -18,21 +18,10 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root dpk_tokenization/ dpk_tokenization/ COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt -RUN pip install --no-cache-dir -e . -# copy the main() entry point to the image -COPY ./src/tokenization_transform_python.py . - -# copy some of the samples in -COPY src/tokenization_local_python.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ # Set environment ENV PYTHONPATH /home/dpk diff --git a/transforms/universal/tokenization/ray/Dockerfile b/transforms/universal/tokenization/Dockerfile.ray similarity index 56% rename from transforms/universal/tokenization/ray/Dockerfile rename to transforms/universal/tokenization/Dockerfile.ray index 223b0c483..5462e48e8 100644 --- a/transforms/universal/tokenization/ray/Dockerfile +++ b/transforms/universal/tokenization/Dockerfile.ray @@ -13,24 +13,11 @@ ARG DPK_WHEEL_FILE_NAME COPY --chown=ray:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -## Copy the python version of the tansform -COPY --chown=ray:users python-transform/ python-transform -RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e . +COPY --chown=ray:users dpk_tokenization/ dpk_tokenization/ +COPY --chown=ray:users requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt -COPY --chown=ray:users src/ src/ -COPY --chown=ray:users pyproject.toml pyproject.toml -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY ./src/tokenization_transform_ray.py . 
- -# copy some of the samples in -COPY src/tokenization_local_ray.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile index bca6f7e85..09feb6f02 100644 --- a/transforms/universal/tokenization/Makefile +++ b/transforms/universal/tokenization/Makefile @@ -1,79 +1,21 @@ REPOROOT=../../.. # Use make help, to see the available rules -include $(REPOROOT)/.make.defaults +include $(REPOROOT)/transforms/.make.cicd.targets -setup:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) -clean:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +################################################################################ -build:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -venv:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse - -image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -set-versions: - @# Help: Recursively $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -publish:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-save-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ - fi - -.PHONY: workflow-test -workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ - fi - -.PHONY: workflow-upload -workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ - fi - -.PHONY: workflow-build -workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ - fi +run-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \ + --data_local_config "{ 'input_folder' : 'test-data/ds01/input', 'output_folder' : 'output'}" diff --git a/transforms/universal/tokenization/README.md b/transforms/universal/tokenization/README.md index 3fd4571ff..b80ed2c99 100644 --- a/transforms/universal/tokenization/README.md +++ b/transforms/universal/tokenization/README.md @@ -1,13 +1,122 @@ -# Tokenization Transform -The tokenization transform annotates pyarrow tables and parquet files -to add a column containing tokens for the document column. -Per the set of +

Distributed tokenization module for data sets using any Hugging Face compatible tokenizer. +
+

+ + +## Contributors + +- Xuan-Hong Dang (xuan-hong.dang@ibm.com) + +# Data Tokenization +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: - -* [python](python/README.md) - provides the core python-based transformation -implementation. -* [ray](ray/README.md) - enables the running of the python-based transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -the transform in a kubernetes cluster using a generated `yaml` file. +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary +The data tokenization transform operates by converting a (non-empty) input table into an output table +using a pre-trained tokenizer. The input table is required to have a minimum of two columns, +named `document_id` and `contents` by default. However, alternate column names can be specified using +`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents. +It is essential for the values within the `document_id` column to be unique across the dataset, +while the `contents` column stores the respective document content. To execute the example demonstrations within this directory, +a machine with `64GiB` of RAM is recommended. + +To specify a pre-trained tokenizer, use the `--tkn_tokenizer` parameter. +This parameter accepts the name of a tokenizer ready for download from Hugging Face, +such as `hf-internal-testing/llama-tokenizer`, `bigcode/starcoder`, or any other tokenizer compatible +with the Hugging Face AutoTokenizer library. Additionally, you can use the `--tkn_tokenizer_args` parameter +to pass extra arguments specific to the chosen tokenizer. +For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which necessitates an access token, +you can specify `use_auth_token=` in `--tkn_tokenizer_args`. + +The tokenization transform applies the specified tokenizer to each row of the input table, +assuming each row represents a document, and saves the resulting token IDs to a corresponding row in the output table. +The output table generally consists of four columns: `tokens`, `document_id`, `document_length`, and `token_count`. + +The `tokens` column stores the sequence of token IDs generated by the tokenizer for the document. +The `document_id` column (or the designated name specified in `--tkn_doc_id_column`) contains the document ID, +while `document_length` and `token_count` record the length of the document and the total count of generated tokens, respectively. +During tokenization, the tokenizer will disregard empty documents (rows) in the input table, +as well as documents that yield no tokens or fail during tokenization. +The count of such documents is stored in the `num_empty_rows` field of the `metadata` file. + + +In certain cases, the tokenization process of some tokenizers may be slow, +particularly when handling lengthy documents containing millions of characters. +To address this, you can use the `--tkn_chunk_size` parameter to define the length of the chunks to tokenize at a given time. +For English text (`en`), a chunk size of `20,000` characters (roughly `15` pages of text) is recommended. +The tokenizer will then tokenize each chunk separately and combine the resulting token IDs. +By default, `--tkn_chunk_size` is `0`, meaning each document is tokenized as a whole, regardless of its length.
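The same parameters can also be supplied from Python through the `Tokenization` wrapper class added in `dpk_tokenization/transform_python.py`. Below is a minimal sketch mirroring the sample notebook in this change; the folder paths are illustrative and assume local parquet input with `document_id` and `contents` columns.

```python
# Minimal sketch: run the tokenization transform from Python via the
# Tokenization wrapper class (dpk_tokenization/transform_python.py).
# Folder paths are illustrative.
from dpk_tokenization.transform_python import Tokenization

Tokenization(
    input_folder="test-data/ds01/input",                  # parquet files to tokenize
    output_folder="output",                               # where tokenized parquet files are written
    tkn_tokenizer="hf-internal-testing/llama-tokenizer",  # any AutoTokenizer-compatible name or path
    tkn_chunk_size=20_000,                                # tokenize long documents in 20k-character chunks
).transform()
```

The `transform()` call returns the launcher's exit code (`0` on success), and the output folder will contain the tokenized parquet files along with a `metadata.json` summary.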
+ + + +## Running + +### CLI Options +The following command line arguments are available in addition to +the options provided by the [python launcher](../../../data-processing-lib/doc/python-launcher-options.md). +``` + --tkn_tokenizer TKN_TOKENIZER + Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By default, `hf-internal-testing/llama-tokenizer` from HuggingFace is used + --tkn_tokenizer_args TKN_TOKENIZER_ARGS + Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace + --tkn_doc_id_column TKN_DOC_ID_COLUMN + Column contains document id which values should be unique across dataset + --tkn_doc_content_column TKN_DOC_CONTENT_COLUMN + Column contains document content + --tkn_text_lang TKN_TEXT_LANG + Specify language used in the text content for better text splitting if needed + --tkn_chunk_size TKN_CHUNK_SIZE + Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words) +``` + +### Running the samples +To run the samples, use the following `make` target: + +* `run-cli-sample` - runs dpk_tokenization/transform_python.py using command line args + + +This target will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-cli-sample +... +``` +Then +```shell +ls output +``` +to see the results of the transform. + +### Code example +Here is a sample [notebook](tokenization.ipynb). + + + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +# Tokenization Transform for Ray +Please see the set of +[transform project conventions](../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary +This project wraps the tokenization transform with a Ray runtime. + +## Configuration and command line Options + +Configuration and command line options are the same as for the base python transform. + +### Launched Command Line Options +In addition to those available to the transform as defined here, +the set of +[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is available.
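For the Ray runtime, the launch pattern is analogous but uses the Ray launcher and configuration. Below is a minimal sketch patterned after `dpk_tokenization/ray/local.py` from this change; the folder paths are illustrative, and `run_locally` is assumed (as in the previous Makefile sample) to start an ephemeral local Ray cluster for the run.

```python
# Minimal sketch: run the tokenization transform on a local Ray cluster,
# patterned after dpk_tokenization/ray/local.py. Folder paths are illustrative.
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from dpk_tokenization.ray.transform import TokenizationRayConfiguration

local_conf = {
    "input_folder": "test-data/ds01/input",
    "output_folder": "output",
}
params = {
    "run_locally": True,  # assumed flag: run on an ephemeral local Ray cluster
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "tkn_tokenizer": "hf-internal-testing/llama-tokenizer",
}
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = RayTransformLauncher(TokenizationRayConfiguration())
launcher.launch()
```

The same module can also be run from the command line as `python -m dpk_tokenization.ray.transform`, which is how the KFP workflow in `kfp_ray/tokenization_wf.py` invokes it.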
+ diff --git a/transforms/universal/tokenization/python/src/tokenization_local_python.py b/transforms/universal/tokenization/dpk_tokenization/local.py similarity index 91% rename from transforms/universal/tokenization/python/src/tokenization_local_python.py rename to transforms/universal/tokenization/dpk_tokenization/local.py index eb4766d60..7978e4dee 100644 --- a/transforms/universal/tokenization/python/src/tokenization_local_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/local.py @@ -15,12 +15,12 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py b/transforms/universal/tokenization/dpk_tokenization/local_long_doc.py similarity index 92% rename from transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py rename to transforms/universal/tokenization/dpk_tokenization/local_long_doc.py index 788ec0d08..f657d946d 100644 --- a/transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/local_long_doc.py @@ -15,12 +15,12 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds02", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds02")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds02", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds02")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/tokenization/ray/src/tokenization_local_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/local.py similarity index 92% rename from transforms/universal/tokenization/ray/src/tokenization_local_ray.py rename to transforms/universal/tokenization/dpk_tokenization/ray/local.py index bd92415a3..45a32880a 100644 --- a/transforms/universal/tokenization/ray/src/tokenization_local_ray.py +++ b/transforms/universal/tokenization/dpk_tokenization/ray/local.py @@ -15,12 +15,12 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from tokenization_transform_ray import TokenizationRayConfiguration +from dpk_tokenization.ray.transform import TokenizationRayConfiguration # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input")) 
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/tokenization/ray/src/tokenization_s3_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/s3.py similarity index 96% rename from transforms/universal/tokenization/ray/src/tokenization_s3_ray.py rename to transforms/universal/tokenization/dpk_tokenization/ray/s3.py index 4ad450912..8777e00e9 100644 --- a/transforms/universal/tokenization/ray/src/tokenization_s3_ray.py +++ b/transforms/universal/tokenization/dpk_tokenization/ray/s3.py @@ -15,7 +15,7 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from tokenization_transform_ray import TokenizationRayConfiguration +from dpk_tokenization.ray.transform import TokenizationRayConfiguration print(os.environ) diff --git a/transforms/universal/tokenization/ray/src/tokenization_transform_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/transform.py similarity index 94% rename from transforms/universal/tokenization/ray/src/tokenization_transform_ray.py rename to transforms/universal/tokenization/dpk_tokenization/ray/transform.py index c7d210417..b95d2d30b 100644 --- a/transforms/universal/tokenization/ray/src/tokenization_transform_ray.py +++ b/transforms/universal/tokenization/dpk_tokenization/ray/transform.py @@ -15,7 +15,7 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) -from tokenization_transform import TokenizationTransformConfiguration +from dpk_tokenization.transform import TokenizationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py b/transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py similarity index 96% rename from transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py rename to transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py index 90e3cc29e..fffb2bbb4 100644 --- a/transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py @@ -14,7 +14,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration # create parameters diff --git a/transforms/universal/tokenization/python/src/tokenization_transform.py b/transforms/universal/tokenization/dpk_tokenization/transform.py similarity index 99% rename from transforms/universal/tokenization/python/src/tokenization_transform.py rename to transforms/universal/tokenization/dpk_tokenization/transform.py index aedb5ca08..277c333fa 100644 --- a/transforms/universal/tokenization/python/src/tokenization_transform.py +++ b/transforms/universal/tokenization/dpk_tokenization/transform.py @@ -21,7 +21,7 @@ import pyarrow as pa from data_processing.transform import AbstractTableTransform, TransformConfiguration -from tokenization_utils import is_valid_argument_string, load_tokenizer, split_text +from 
dpk_tokenization.utils import is_valid_argument_string, load_tokenizer, split_text CHUNK_CHECKPOINT_INTERVAL = 100 diff --git a/transforms/universal/tokenization/python/src/tokenization_transform_python.py b/transforms/universal/tokenization/dpk_tokenization/transform_python.py similarity index 52% rename from transforms/universal/tokenization/python/src/tokenization_transform_python.py rename to transforms/universal/tokenization/dpk_tokenization/transform_python.py index 2d22a52cb..8efed547d 100644 --- a/transforms/universal/tokenization/python/src/tokenization_transform_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/transform_python.py @@ -10,11 +10,14 @@ # limitations under the License. ################################################################################ +import sys + from data_processing.runtime.pure_python import ( PythonTransformLauncher, PythonTransformRuntimeConfiguration, ) -from tokenization_transform import TokenizationTransformConfiguration +from data_processing.utils import ParamsUtils +from dpk_tokenization.transform import TokenizationTransformConfiguration class TokenizationPythonConfiguration(PythonTransformRuntimeConfiguration): @@ -22,6 +25,29 @@ def __init__(self): super().__init__(transform_config=TokenizationTransformConfiguration()) +class Tokenization: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = PythonTransformLauncher(TokenizationPythonConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + if __name__ == "__main__": launcher = PythonTransformLauncher(TokenizationPythonConfiguration()) launcher.launch() diff --git a/transforms/universal/tokenization/python/src/tokenization_utils.py b/transforms/universal/tokenization/dpk_tokenization/utils.py similarity index 100% rename from transforms/universal/tokenization/python/src/tokenization_utils.py rename to transforms/universal/tokenization/dpk_tokenization/utils.py diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile index c43105ff1..858db1b0a 100644 --- a/transforms/universal/tokenization/kfp_ray/Makefile +++ b/transforms/universal/tokenization/kfp_ray/Makefile @@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows -# Include the common configuration for this transform -include ../transform.config +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -15,29 +20,8 @@ workflow-venv: .check_python_version 
${WORKFLOW_VENV_ACTIVATE} .PHONY: clean clean: @# Help: Clean up the virtual environment. - rm -rf ${REPOROOT}/transforms/venv + rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -test-image:: - -publish:: - -image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +29,19 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=tokenization_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ done + + + + diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c131d11ea..c9fb6f2e9 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -18,7 +18,7 @@ # the name of the job script -EXEC_SCRIPT_NAME: str = "tokenization_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_tokenization.ray.transform" task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" @@ -112,7 +112,14 @@ def tokenization( ray_name: str = "tkn-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/tokenization/ds01/input/', 'output_folder': 'test/tokenization/ds01/output/'}", @@ -120,9 +127,9 @@ def tokenization( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # tokenizer parameters tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", tkn_doc_id_column: str = "document_id", @@ -175,7 +182,9 @@ def tokenization( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/universal/tokenization/python/.dockerignore b/transforms/universal/tokenization/python/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- 
a/transforms/universal/tokenization/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/tokenization/python/Makefile b/transforms/universal/tokenization/python/Makefile deleted file mode 100644 index 8f4f7fbf5..000000000 --- a/transforms/universal/tokenization/python/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(TOKENIZATION_PYTHON_VERSION) TOML_VERSION=$(TOKENIZATION_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \ - RUN_ARGS="--data_local_config \"{ 'input_folder' : '../test-data/ds01/input', 'output_folder' : '../output'}\" \ - " .transforms.run-src-file - -run-local-sample: .transforms.run-local-python-sample - -#run-s3-sample: .transforms.run-s3-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/tokenization/python/README.md b/transforms/universal/tokenization/python/README.md deleted file mode 100644 index 0c470bb73..000000000 --- a/transforms/universal/tokenization/python/README.md +++ /dev/null @@ -1,100 +0,0 @@ -

Distributed tokenization module for data sets using any Hugging Face compatible tokenizer. -
-

- -## 📝 Table of Contents -- [Summary](#Summary) -- [Running](#Running) -- [CLI Options](#cli_options) - -# Data Tokenization -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary -The data tokenization transform operates by converting a (non-empty) input table into an output table -using a pre-trained tokenizer. The input table is required to have a minimum of two columns, -named `document_id` and `contents` by default. However, alternate column names can be specified using -`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents. -It is essential for the values within the `document_id` column to be unique across the dataset, -while the `contents` column stores their respective document content. To execute example demonstrations within this directory, -a machine with `64GiB` of RAM is recommended. - -To specify a pre-trained tokenizer, utilize the `--tkn_tokenizer` parameter. -This parameter accepts the name of a tokenizer ready for download from Hugging Face, -such as `hf-internal-testing/llama-tokenizer, bigcode/starcoder`, or any other tokenizer compatible -with the Hugging Face AutoTokenizer library. Additionally, you can employ the `--tkn_tokenizer_args` parameter -to include extra arguments specific to the chosen tokenizer. -For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which necessitate an access token, -you can specify `use_auth_token=` in `--tkn_tokenizer`. - -The tokenization transformer utilizes the specified tokenizer to tokenize each row, -assuming each row represents a document, in the input table and save it to a corresponding row in the output table. -The output table generally consists of four columns: `tokens, document_id, document_length`, and `token_count`. - -The `tokens` stores the sequence of token IDs generated by the tokenizer during the document tokenization process. -The `document_id` (or the designated name specified in `--tkn_doc_id_column`) contains the document ID, -while `document_length` and `token_count` respectively record the length of the document and the total count of generated tokens. -During tokenization, the tokenizer will disregard empty documents (rows) in the input table, -as well as documents that yield no tokens or encounter failure during tokenization. -The count of such documents will be stored in the `num_empty_rows` field of the `metadata` file. - - -In certain cases, the tokenization process of some tokenizers may be sluggish, -particularly when handling lengthy documents containing millions of characters. -To address this, you can employ the `--tkn_chunk_size` parameter to define the length of chunks to tokenize at a given time. -For English text (`en`), it is recommended to set the chunk size to `20,000`, roughly equivalent to `15` pages of text. -The tokenizer will then tokenize each chunk separately and combine their resulting token IDs. -By default, the value of `--tkn_chunk_size` is `0`, indicating that each document is tokenized as a whole, regardless of its length. - - - -## Running - -### CLI Options -The following command line arguments are available in addition to -the options provided by the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md) -and the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). 
-``` - --tkn_tokenizer TKN_TOKENIZER - Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By defaut, `hf-internal-testing/llama-tokenizer` from HuggingFace is used - --tkn_tokenizer_args TKN_TOKENIZER_ARGS - Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace - --tkn_doc_id_column TKN_DOC_ID_COLUMN - Column contains document id which values should be unique across dataset - --tkn_doc_content_column TKN_DOC_CONTENT_COLUMN - Column contains document content - --tkn_text_lang TKN_TEXT_LANG - Specify language used in the text content for better text splitting if needed - --tkn_chunk_size TKN_CHUNK_SIZE - Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words) -``` - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/tokenization_transform_python.py using command line args -* `run-local-sample` - runs src/tokenization_local_python.py - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml deleted file mode 100644 index fb9ee0c48..000000000 --- a/transforms/universal/tokenization/python/pyproject.toml +++ /dev/null @@ -1,53 +0,0 @@ -[project] -name = "dpk_tokenization_transform_python" -keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.3.dev2" -requires-python = ">=3.10,<3.13" -description = "Tokenization Transform for Python" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/tokenization" - -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] 
diff --git a/transforms/universal/tokenization/ray/.dockerignore b/transforms/universal/tokenization/ray/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/tokenization/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/tokenization/ray/.gitignore b/transforms/universal/tokenization/ray/.gitignore deleted file mode 100644 index 3ea7fd4ab..000000000 --- a/transforms/universal/tokenization/ray/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/universal/tokenization/ray/Makefile b/transforms/universal/tokenization/ray/Makefile deleted file mode 100644 index 0a4e3a370..000000000 --- a/transforms/universal/tokenization/ray/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(TOKENIZATION_PYTHON_VERSION) TOML_VERSION=$(TOKENIZATION_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.ray-test-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/ds01/input', 'output_folder' : '../output'}\" \ - " .transforms.run-src-file - -#run-local-sample: .transforms.run-local-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/tokenization/ray/README.md b/transforms/universal/tokenization/ray/README.md deleted file mode 100644 index 1181d6878..000000000 --- a/transforms/universal/tokenization/ray/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Tokenization Transform for Ray -Please see the set of -[transform project conventions](../../../README.md#transform-project-conventions) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary -This project wraps the [tokenization transform](../python) with a Ray runtime. - -## Configuration and command line Options - -Noop configuration and command line options are the same as for the base python transform. - -## Running - -### Launched Command Line Options -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/tokenization_transform_ray.py using command line args -* `run-local-sample` - runs src/tokenization_local_ray.py -* `run-s3-sample` - runs src/filter_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml deleted file mode 100644 index 0829e002c..000000000 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_tokenization_transform_ray" -version = "0.2.3.dev2" -requires-python = ">=3.10,<3.13" -description = "Tokenization Transform for Ray" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, -] -dependencies = [ - "dpk-tokenization-transform-python==0.2.3.dev2", - "data-prep-toolkit[ray]>=0.2.3.dev2", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet deleted file mode 100644 index e452fbf9a..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet deleted file mode 100644 index bff7a5ed9..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet deleted file mode 100644 index fbdeaf64b..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json b/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json deleted file mode 100644 index e6c190807..000000000 --- a/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "Tokenization", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-29 13:30:56", - "end_time": "2024-03-29 13:30:57", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "tokenizer": "hf-internal-testing/llama-tokenizer", - "tokenizer_args": null, - "doc_id_column": "document_id", - "doc_content_column": "contents", - "text_lang": "en", - "chunk_size": 0, - 
"checkpointing": false, - "max_files": -1, - "number of workers": 5, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 10, - "gpus": 0, - "memory": 27.31659088190645, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 5, - "source_size": 450, - "result_files": 3, - "result_size": 842, - "table_processing": 0.03880786895751953, - "num_files": 3, - "num_rows": 6, - "num_tokenized_rows": 6, - "num_tokens": 85, - "num_chars": 384, - "skipped empty tables": 2 - }, - "source": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds01/input", - "type": "path" - }, - "target": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds01", - "type": "path" - } -} diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet deleted file mode 100644 index 83bdac0b4..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet deleted file mode 100644 index 5a86a7b13..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet deleted file mode 100644 index 5a86a7b13..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet deleted file mode 100644 index 07fd2adfe..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet deleted file mode 100644 index 879fcf0f0..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet b/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet deleted file mode 100644 index e81983916..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json b/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json deleted file mode 100644 index dc9813beb..000000000 --- a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - 
"pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "Tokenization", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-29 14:03:15", - "end_time": "2024-03-29 14:03:32", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "tokenizer": "hf-internal-testing/llama-tokenizer", - "tokenizer_args": null, - "doc_id_column": "document_id", - "doc_content_column": "contents", - "text_lang": "en", - "chunk_size": 20000, - "checkpointing": false, - "max_files": -1, - "number of workers": 5, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 10, - "gpus": 0, - "memory": 27.180484008975327, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 16863266, - "result_files": 1, - "result_size": 37109764, - "table_processing": 15.886597871780396, - "num_files": 1, - "num_rows": 1, - "num_tokenized_rows": 1, - "num_tokens": 4638717, - "num_chars": 16836009 - }, - "source": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds02/input", - "type": "path" - }, - "target": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds02", - "type": "path" - } -} diff --git a/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet b/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet deleted file mode 100644 index b7f3df71b..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/requirements.txt similarity index 100% rename from transforms/universal/tokenization/python/requirements.txt rename to transforms/universal/tokenization/requirements.txt diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq01.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq01.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq01.parquet rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq01.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq02.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq02.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq02.parquet rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq02.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/metadata.json b/transforms/universal/tokenization/test-data/ds01/expected/metadata.json similarity index 100% 
rename from transforms/universal/tokenization/python/test-data/ds01/expected/metadata.json rename to transforms/universal/tokenization/test-data/ds01/expected/metadata.json diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq01.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq01.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq01.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/pq01.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq02.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq02.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq02.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/pq02.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds02/expected/df_17m.parquet b/transforms/universal/tokenization/test-data/ds02/expected/df_17m.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds02/expected/df_17m.parquet rename to transforms/universal/tokenization/test-data/ds02/expected/df_17m.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds02/expected/metadata.json b/transforms/universal/tokenization/test-data/ds02/expected/metadata.json similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds02/expected/metadata.json rename to transforms/universal/tokenization/test-data/ds02/expected/metadata.json diff --git a/transforms/universal/tokenization/python/test-data/ds02/input/df_17m.parquet b/transforms/universal/tokenization/test-data/ds02/input/df_17m.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds02/input/df_17m.parquet rename to transforms/universal/tokenization/test-data/ds02/input/df_17m.parquet diff 
--git a/transforms/universal/tokenization/python/test/test_tokenization.py b/transforms/universal/tokenization/test/test_tokenization.py similarity index 97% rename from transforms/universal/tokenization/python/test/test_tokenization.py rename to transforms/universal/tokenization/test/test_tokenization.py index e4f13fd13..3cb53a047 100644 --- a/transforms/universal/tokenization/python/test/test_tokenization.py +++ b/transforms/universal/tokenization/test/test_tokenization.py @@ -16,7 +16,7 @@ from data_processing.test_support.transform.table_transform_test import ( AbstractTableTransformTest, ) -from tokenization_transform import TokenizationTransform +from dpk_tokenization.transform import TokenizationTransform """ diff --git a/transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py b/transforms/universal/tokenization/test/test_tokenization_long_doc_python.py similarity index 95% rename from transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py rename to transforms/universal/tokenization/test/test_tokenization_long_doc_python.py index ef68ce549..5fe98689e 100644 --- a/transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py +++ b/transforms/universal/tokenization/test/test_tokenization_long_doc_python.py @@ -16,7 +16,7 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration tkn_params = { diff --git a/transforms/universal/tokenization/python/test/test_tokenization_python.py b/transforms/universal/tokenization/test/test_tokenization_python.py similarity index 95% rename from transforms/universal/tokenization/python/test/test_tokenization_python.py rename to transforms/universal/tokenization/test/test_tokenization_python.py index c198d561a..00a3a6f58 100644 --- a/transforms/universal/tokenization/python/test/test_tokenization_python.py +++ b/transforms/universal/tokenization/test/test_tokenization_python.py @@ -16,7 +16,7 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration tkn_params = { diff --git a/transforms/universal/tokenization/ray/test/test_tokenization_ray.py b/transforms/universal/tokenization/test/test_tokenization_ray.py similarity index 95% rename from transforms/universal/tokenization/ray/test/test_tokenization_ray.py rename to transforms/universal/tokenization/test/test_tokenization_ray.py index 17791e370..6ef31a914 100644 --- a/transforms/universal/tokenization/ray/test/test_tokenization_ray.py +++ b/transforms/universal/tokenization/test/test_tokenization_ray.py @@ -16,7 +16,7 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from tokenization_transform_ray import TokenizationRayConfiguration +from dpk_tokenization.ray.transform import TokenizationRayConfiguration tkn_params = { diff --git a/transforms/universal/tokenization/tokenization.ipynb b/transforms/universal/tokenization/tokenization.ipynb new file mode 100644 index 000000000..8b59e6885 --- /dev/null +++ b/transforms/universal/tokenization/tokenization.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + 
"source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "!pip install data-prep-toolkit\n", + "!pip install data-prep-toolkit-transforms[tokenization]" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding DocIDTransform configuration for values are as follows: \n", + "| Name | Description|\n", + "| -----|------------|\n", + "|tkn_tokenizer | Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By defaut, `hf-internal-testing/llama-tokenizer` from HuggingFace is used |\n", + "|tkn_tokenizer_args |Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace|\n", + "|tkn_doc_id_column|Column contains document id which values should be unique across dataset|\n", + "|tkn_doc_content_column|Column contains document content|\n", + "|tkn_text_lang|Specify language used in the text content for better text splitting if needed|\n", + "|tkn_chunk_size|Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)|\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + } + ], + "source": [ + "from dpk_tokenization.transform_python import Tokenization" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "16:52:19 INFO - pipeline id pipeline_id\n", + "16:52:19 INFO - code location None\n", + "16:52:19 INFO - data factory data_ is using local data access: input_folder - test-data/ds02/input output_folder - output\n", + "16:52:19 INFO - data factory data_ max_files -1, n_sample -1\n", + "16:52:19 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "16:52:19 INFO - orchestrator Tokenization started at 2024-12-11 16:52:19\n", + "16:52:19 INFO - Number of files is 1, source profile {'max_file_size': 6.856490135192871, 'min_file_size': 6.856490135192871, 'total_file_size': 6.856490135192871}\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (5256 > 2048). Running this sequence through the model will result in indexing errors\n", + "16:52:33 INFO - Completed 1 files (100.0%) in 0.228 min\n", + "16:52:33 INFO - Done processing 1 files, waiting for flush() completion.\n", + "16:52:33 INFO - done flushing in 0.0 sec\n", + "16:52:33 INFO - Completed execution in 0.235 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Tokenization(input_folder= \"test-data/ds02/input\",\n", + " output_folder= \"output\",\n", + " tkn_tokenizer= \"hf-internal-testing/llama-tokenizer\",\n", + " tkn_chunk_size= 20_000).transform()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['output/lang=en', 'output/metadata.json', 'output/df_17m.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/tokenization/transform.config b/transforms/universal/tokenization/transform.config deleted file mode 100644 index 04f517d42..000000000 --- a/transforms/universal/tokenization/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=tokenization - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION) -TOKENIZATION_RAY_VERSION=$(TOKENIZATION_PYTHON_VERSION) -TOKENIZATION_SPARK_VERSION=$(TOKENIZATION_PYTHON_VERSION) -
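For reference (not part of the diff above), here is a minimal sketch of how the `tkn_*` parameters documented in the notebook's table could be passed to the Python API shown in the notebook. It assumes the `Tokenization` class accepts the `tkn_*` keys as keyword arguments; the notebook itself only exercises `tkn_tokenizer` and `tkn_chunk_size`, and the column names `document_id` and `contents` below are illustrative assumptions, not values taken from this PR.

```python
# Hypothetical end-to-end sketch: run the tokenization transform with explicit
# tkn_* parameters, then list the output folder. Column names are assumptions;
# adjust them to match the schema of your input parquet files.
from dpk_tokenization.transform_python import Tokenization

result = Tokenization(
    input_folder="test-data/ds02/input",                  # local parquet input, as in the notebook
    output_folder="output",                               # tokenized parquet + metadata.json land here
    tkn_tokenizer="hf-internal-testing/llama-tokenizer",  # default HF tokenizer per the table above
    tkn_doc_id_column="document_id",                      # assumed id column name
    tkn_doc_content_column="contents",                    # assumed content column name
    tkn_chunk_size=20_000,                                # chunk long documents, as in the notebook
).transform()
print("exit code:", result)  # 0 on success, matching the notebook output

import glob
print(glob.glob("output/*"))  # transformed parquet files plus metadata.json
```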