diff --git a/transforms/Makefile.transform.template b/transforms/Makefile.transform.template
new file mode 100644
index 000000000..72feb87cf
--- /dev/null
+++ b/transforms/Makefile.transform.template
@@ -0,0 +1,16 @@
+REPOROOT=../../..
+# Use make help to see the available rules
+include $(REPOROOT)/transforms/.make.cicd.targets
+
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
+
+################################################################################
+
+
diff --git a/transforms/universal/tokenization/python/Dockerfile b/transforms/universal/tokenization/Dockerfile.python
similarity index 72%
rename from transforms/universal/tokenization/python/Dockerfile
rename to transforms/universal/tokenization/Dockerfile.python
index 9f2c9dc38..35552e198 100644
--- a/transforms/universal/tokenization/python/Dockerfile
+++ b/transforms/universal/tokenization/Dockerfile.python
@@ -18,21 +18,10 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}
# END OF STEPS destined for a data-prep-kit base image
-COPY --chown=dpk:root src/ src/
-COPY --chown=dpk:root pyproject.toml pyproject.toml
+COPY --chown=dpk:root dpk_tokenization/ dpk_tokenization/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
-RUN pip install --no-cache-dir -e .
-# copy the main() entry point to the image
-COPY ./src/tokenization_transform_python.py .
-
-# copy some of the samples in
-COPY src/tokenization_local_python.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
# Set environment
ENV PYTHONPATH /home/dpk
diff --git a/transforms/universal/tokenization/ray/Dockerfile b/transforms/universal/tokenization/Dockerfile.ray
similarity index 56%
rename from transforms/universal/tokenization/ray/Dockerfile
rename to transforms/universal/tokenization/Dockerfile.ray
index 223b0c483..5462e48e8 100644
--- a/transforms/universal/tokenization/ray/Dockerfile
+++ b/transforms/universal/tokenization/Dockerfile.ray
@@ -13,24 +13,11 @@ ARG DPK_WHEEL_FILE_NAME
COPY --chown=ray:users data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]
-## Copy the python version of the tansform
-COPY --chown=ray:users python-transform/ python-transform
-RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e .
+COPY --chown=ray:users dpk_tokenization/ dpk_tokenization/
+COPY --chown=ray:users requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
-COPY --chown=ray:users src/ src/
-COPY --chown=ray:users pyproject.toml pyproject.toml
-RUN pip install --no-cache-dir -e .
-
-# copy the main() entry point to the image
-COPY ./src/tokenization_transform_ray.py .
-
-# copy some of the samples in
-COPY src/tokenization_local_ray.py local/
-
-# copy test
-COPY test/ test/
-COPY test-data/ test-data/
# Set environment
ENV PYTHONPATH /home/ray
diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile
index bca6f7e85..09feb6f02 100644
--- a/transforms/universal/tokenization/Makefile
+++ b/transforms/universal/tokenization/Makefile
@@ -1,79 +1,21 @@
REPOROOT=../../..
# Use make help, to see the available rules
-include $(REPOROOT)/.make.defaults
+include $(REPOROOT)/transforms/.make.cicd.targets
-setup::
- @# Help: Recursively make $@ all subdirs
- $(MAKE) RULE=$@ .recurse
+#
+# This is intended to be included across the Makefiles provided within
+# a given transform's directory tree, so must use compatible syntax.
+#
+################################################################################
+# This defines the name of the transform and is used to match against
+# expected files and is used to define the transform's image name.
+TRANSFORM_NAME=$(shell basename `pwd`)
-clean::
- @# Help: Recursively make $@ all subdirs
- $(MAKE) RULE=$@ .recurse
+################################################################################
-build::
- @# Help: Recursively make $@ in subdirs
- $(MAKE) RULE=$@ .recurse
-venv::
- @# Help: Recursively make $@ in subdirs
- $(MAKE) RULE=$@ .recurse
-
-image::
- @# Help: Recursively make $@ in all subdirs
- @$(MAKE) RULE=$@ .recurse
-
-set-versions:
- @# Help: Recursively $@ in all subdirs
- @$(MAKE) RULE=$@ .recurse
-
-publish::
- @# Help: Recursively make $@ in all subdirs
- @$(MAKE) RULE=$@ .recurse
-
-test-image::
- @# Help: Recursively make $@ in all subdirs
- @$(MAKE) RULE=$@ .recurse
-
-test::
- @# Help: Recursively make $@ in all subdirs
- @$(MAKE) RULE=$@ .recurse
-
-test-src::
- @# Help: Recursively make $@ in all subdirs
- $(MAKE) RULE=$@ .recurse
-
-kind-load-image::
- @# Help: Recursively make $@ in all subdirs
- $(MAKE) RULE=$@ .recurse
-
-docker-load-image::
- @# Help: Recursively make $@ in all subdirs
- $(MAKE) RULE=$@ .recurse
-
-docker-save-image::
- @# Help: Recursively make $@ in all subdirs
- $(MAKE) RULE=$@ .recurse
-
-.PHONY: workflow-venv
-workflow-venv:
- if [ -e kfp_ray ]; then \
- $(MAKE) -C kfp_ray workflow-venv; \
- fi
-
-.PHONY: workflow-test
-workflow-test:
- if [ -e kfp_ray ]; then \
- $(MAKE) -C kfp_ray workflow-test; \
- fi
-
-.PHONY: workflow-upload
-workflow-upload:
- if [ -e kfp_ray ]; then \
- $(MAKE) -C kfp_ray workflow-upload; \
- fi
-
-.PHONY: workflow-build
-workflow-build:
- if [ -e kfp_ray ]; then \
- $(MAKE) -C kfp_ray workflow-build; \
- fi
+run-cli-sample:
+ make venv
+ source venv/bin/activate && \
+ $(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
+ --data_local_config "{ 'input_folder' : 'test-data/ds01/input', 'output_folder' : 'output'}"
diff --git a/transforms/universal/tokenization/README.md b/transforms/universal/tokenization/README.md
index 3fd4571ff..b80ed2c99 100644
--- a/transforms/universal/tokenization/README.md
+++ b/transforms/universal/tokenization/README.md
@@ -1,13 +1,122 @@
-# Tokenization Transform
-The tokenization transform annotates pyarrow tables and parquet files
-to add a column containing tokens for the document column.
-Per the set of
+
Distributed tokenization module for data sets using any Hugging Face compatible tokenizer.
+
+
+
+
+## Contributors
+
+- Xuan-Hong Dang (xuan-hong.dang@ibm.com)
+
+# Data Tokenization
+Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
-the following runtimes are available:
-
-* [python](python/README.md) - provides the core python-based transformation
-implementation.
-* [ray](ray/README.md) - enables the running of the python-based transformation
-in a Ray runtime
-* [kfp](kfp_ray/README.md) - enables running the ray docker image
-the transform in a kubernetes cluster using a generated `yaml` file.
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+The data tokenization transform operates by converting a (non-empty) input table into an output table
+using a pre-trained tokenizer. The input table is required to have a minimum of two columns,
+named `document_id` and `contents` by default. However, alternate column names can be specified using
+`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents.
+It is essential for the values within the `document_id` column to be unique across the dataset,
+while the `contents` column stores their respective document content. To execute example demonstrations within this directory,
+a machine with `64GiB` of RAM is recommended.
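+
+For illustration only, a minimal input table with the two required columns (the values below are hypothetical) could be built as:
+
+```python
+import pyarrow as pa
+
+# two documents: unique ids in `document_id`, raw text in `contents`
+table = pa.table(
+    {
+        "document_id": ["doc-001", "doc-002"],
+        "contents": ["The quick brown fox.", "Lorem ipsum dolor sit amet."],
+    }
+)
+```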
+
+To specify a pre-trained tokenizer, utilize the `--tkn_tokenizer` parameter.
+This parameter accepts the name of a tokenizer ready for download from Hugging Face,
+such as `hf-internal-testing/llama-tokenizer, bigcode/starcoder`, or any other tokenizer compatible
+with the Hugging Face AutoTokenizer library. Additionally, you can employ the `--tkn_tokenizer_args` parameter
+to include extra arguments specific to the chosen tokenizer.
+For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which necessitates an access token,
+you can specify `use_auth_token=` in `--tkn_tokenizer_args`.
+
+The tokenization transform uses the specified tokenizer to tokenize each row of the input table
+(assuming each row represents a document) and saves the result to the corresponding row in the output table.
+The output table generally consists of four columns: `tokens, document_id, document_length`, and `token_count`.
+
+The `tokens` column stores the sequence of token IDs generated by the tokenizer during the document tokenization process.
+The `document_id` (or the designated name specified in `--tkn_doc_id_column`) contains the document ID,
+while `document_length` and `token_count` respectively record the length of the document and the total count of generated tokens.
+During tokenization, the tokenizer will disregard empty documents (rows) in the input table,
+as well as documents that yield no tokens or encounter failure during tokenization.
+The count of such documents will be stored in the `num_empty_rows` field of the `metadata` file.
+
+
+In certain cases, the tokenization process of some tokenizers may be sluggish,
+particularly when handling lengthy documents containing millions of characters.
+To address this, you can employ the `--tkn_chunk_size` parameter to define the length of chunks to tokenize at a given time.
+For English text (`en`), it is recommended to set the chunk size to `20,000`, roughly equivalent to `15` pages of text.
+The tokenizer will then tokenize each chunk separately and combine their resulting token IDs.
+By default, the value of `--tkn_chunk_size` is `0`, indicating that each document is tokenized as a whole, regardless of its length.
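+
+The sketch below illustrates the chunking idea only; it is not the transform's actual code path (which also rounds chunk
+boundaries to whole words) and assumes the `transformers` package is installed:
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
+CHUNK_SIZE = 20_000  # characters per chunk, roughly 15 pages of English text
+
+def tokenize_in_chunks(text: str) -> list[int]:
+    token_ids: list[int] = []
+    # tokenize each chunk separately and combine the resulting token IDs
+    for start in range(0, len(text), CHUNK_SIZE):
+        chunk = text[start : start + CHUNK_SIZE]
+        token_ids.extend(tokenizer(chunk, add_special_tokens=False)["input_ids"])
+    return token_ids
+```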
+
+
+
+## Running
+
+### CLI Options
+The following command line arguments are available in addition to
+the options provided by the [python launcher](../../../data-processing-lib/doc/python-launcher-options.md).
+```
+ --tkn_tokenizer TKN_TOKENIZER
+ Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By defaut, `hf-internal-testing/llama-tokenizer` from HuggingFace is used
+ --tkn_tokenizer_args TKN_TOKENIZER_ARGS
+ Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace
+ --tkn_doc_id_column TKN_DOC_ID_COLUMN
+ Column contains document id which values should be unique across dataset
+ --tkn_doc_content_column TKN_DOC_CONTENT_COLUMN
+ Column contains document content
+ --tkn_text_lang TKN_TEXT_LANG
+ Specify language used in the text content for better text splitting if needed
+ --tkn_chunk_size TKN_CHUNK_SIZE
+ Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)
+```
+
+### Running the samples
+To run the samples, use the following `make` target
+
+* `run-cli-sample` - runs dpk_tokenization/transform_python.py using command line args
+
+
+This target will activate the virtual environment and set up any configuration needed.
+Use the `-n` option of `make` to see the detail of what is done to run the sample.
+
+For example,
+```shell
+make run-cli-sample
+...
+```
+Then run
+```shell
+ls output
+```
+to see the results of the transform.
+
+### Code example
+Here is a sample [notebook](tokenization.ipynb)
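+
+A minimal sketch of invoking the transform programmatically through the `Tokenization` wrapper
+(the folder paths and tokenizer settings below are illustrative):
+
+```python
+from dpk_tokenization.transform_python import Tokenization
+
+Tokenization(
+    input_folder="test-data/ds01/input",
+    output_folder="output",
+    tkn_tokenizer="hf-internal-testing/llama-tokenizer",
+    tkn_chunk_size=0,
+).transform()
+```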
+
+
+
+### Transforming data using the transform image
+
+To use the transform image to transform your data, please refer to the
+[running images quickstart](../../../doc/quick-start/run-transform-image.md),
+substituting the name of this transform image and runtime as appropriate.
+
+# Tokenization Transform for Ray
+Please see the set of
+[transform project conventions](../../README.md#transform-project-conventions)
+for details on general project conventions, transform configuration,
+testing and IDE set up.
+
+## Summary
+This project wraps the tokenization transform with a Ray runtime.
+
+## Configuration and command line Options
+
+Configuration and command line options are the same as for the base python transform.
+
+### Launched Command Line Options
+In addition to those available to the transform as defined here,
+the set of
+[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is also available.
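+
+As a sketch, the Ray runtime can be launched from Python the same way the provided samples do
+(parameters are taken from the command line, as with the python runtime):
+
+```python
+from data_processing_ray.runtime.ray import RayTransformLauncher
+from dpk_tokenization.ray.transform import TokenizationRayConfiguration
+
+launcher = RayTransformLauncher(TokenizationRayConfiguration())
+launcher.launch()
+```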
+
diff --git a/transforms/universal/tokenization/python/src/tokenization_local_python.py b/transforms/universal/tokenization/dpk_tokenization/local.py
similarity index 91%
rename from transforms/universal/tokenization/python/src/tokenization_local_python.py
rename to transforms/universal/tokenization/dpk_tokenization/local.py
index eb4766d60..7978e4dee 100644
--- a/transforms/universal/tokenization/python/src/tokenization_local_python.py
+++ b/transforms/universal/tokenization/dpk_tokenization/local.py
@@ -15,12 +15,12 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
-from tokenization_transform_python import TokenizationPythonConfiguration
+from dpk_tokenization.transform_python import TokenizationPythonConfiguration
# create parameters
-input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input"))
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01"))
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
diff --git a/transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py b/transforms/universal/tokenization/dpk_tokenization/local_long_doc.py
similarity index 92%
rename from transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py
rename to transforms/universal/tokenization/dpk_tokenization/local_long_doc.py
index 788ec0d08..f657d946d 100644
--- a/transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py
+++ b/transforms/universal/tokenization/dpk_tokenization/local_long_doc.py
@@ -15,12 +15,12 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
-from tokenization_transform_python import TokenizationPythonConfiguration
+from dpk_tokenization.transform_python import TokenizationPythonConfiguration
# create parameters
-input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds02", "input"))
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds02"))
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds02", "input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds02"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
diff --git a/transforms/universal/tokenization/ray/src/tokenization_local_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/local.py
similarity index 92%
rename from transforms/universal/tokenization/ray/src/tokenization_local_ray.py
rename to transforms/universal/tokenization/dpk_tokenization/ray/local.py
index bd92415a3..45a32880a 100644
--- a/transforms/universal/tokenization/ray/src/tokenization_local_ray.py
+++ b/transforms/universal/tokenization/dpk_tokenization/ray/local.py
@@ -15,12 +15,12 @@
from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
-from tokenization_transform_ray import TokenizationRayConfiguration
+from dpk_tokenization.ray.transform import TokenizationRayConfiguration
# create parameters
-input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input"))
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01"))
+input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input"))
+output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
diff --git a/transforms/universal/tokenization/ray/src/tokenization_s3_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/s3.py
similarity index 96%
rename from transforms/universal/tokenization/ray/src/tokenization_s3_ray.py
rename to transforms/universal/tokenization/dpk_tokenization/ray/s3.py
index 4ad450912..8777e00e9 100644
--- a/transforms/universal/tokenization/ray/src/tokenization_s3_ray.py
+++ b/transforms/universal/tokenization/dpk_tokenization/ray/s3.py
@@ -15,7 +15,7 @@
from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
-from tokenization_transform_ray import TokenizationRayConfiguration
+from dpk_tokenization.ray.transform import TokenizationRayConfiguration
print(os.environ)
diff --git a/transforms/universal/tokenization/ray/src/tokenization_transform_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/transform.py
similarity index 94%
rename from transforms/universal/tokenization/ray/src/tokenization_transform_ray.py
rename to transforms/universal/tokenization/dpk_tokenization/ray/transform.py
index c7d210417..b95d2d30b 100644
--- a/transforms/universal/tokenization/ray/src/tokenization_transform_ray.py
+++ b/transforms/universal/tokenization/dpk_tokenization/ray/transform.py
@@ -15,7 +15,7 @@
from data_processing_ray.runtime.ray.runtime_configuration import (
RayTransformRuntimeConfiguration,
)
-from tokenization_transform import TokenizationTransformConfiguration
+from dpk_tokenization.transform import TokenizationTransformConfiguration
logger = get_logger(__name__)
diff --git a/transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py b/transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py
similarity index 96%
rename from transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py
rename to transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py
index 90e3cc29e..fffb2bbb4 100644
--- a/transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py
+++ b/transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py
@@ -14,7 +14,7 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
-from tokenization_transform_python import TokenizationPythonConfiguration
+from dpk_tokenization.transform_python import TokenizationPythonConfiguration
# create parameters
diff --git a/transforms/universal/tokenization/python/src/tokenization_transform.py b/transforms/universal/tokenization/dpk_tokenization/transform.py
similarity index 99%
rename from transforms/universal/tokenization/python/src/tokenization_transform.py
rename to transforms/universal/tokenization/dpk_tokenization/transform.py
index aedb5ca08..277c333fa 100644
--- a/transforms/universal/tokenization/python/src/tokenization_transform.py
+++ b/transforms/universal/tokenization/dpk_tokenization/transform.py
@@ -21,7 +21,7 @@
import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
-from tokenization_utils import is_valid_argument_string, load_tokenizer, split_text
+from dpk_tokenization.utils import is_valid_argument_string, load_tokenizer, split_text
CHUNK_CHECKPOINT_INTERVAL = 100
diff --git a/transforms/universal/tokenization/python/src/tokenization_transform_python.py b/transforms/universal/tokenization/dpk_tokenization/transform_python.py
similarity index 52%
rename from transforms/universal/tokenization/python/src/tokenization_transform_python.py
rename to transforms/universal/tokenization/dpk_tokenization/transform_python.py
index 2d22a52cb..8efed547d 100644
--- a/transforms/universal/tokenization/python/src/tokenization_transform_python.py
+++ b/transforms/universal/tokenization/dpk_tokenization/transform_python.py
@@ -10,11 +10,14 @@
# limitations under the License.
################################################################################
+import sys
+
from data_processing.runtime.pure_python import (
PythonTransformLauncher,
PythonTransformRuntimeConfiguration,
)
-from tokenization_transform import TokenizationTransformConfiguration
+from data_processing.utils import ParamsUtils
+from dpk_tokenization.transform import TokenizationTransformConfiguration
class TokenizationPythonConfiguration(PythonTransformRuntimeConfiguration):
@@ -22,6 +25,29 @@ def __init__(self):
super().__init__(transform_config=TokenizationTransformConfiguration())
+class Tokenization:
+ def __init__(self, **kwargs):
+ self.params = {}
+ for key in kwargs:
+ self.params[key] = kwargs[key]
+        # if input_folder and output_folder are specified, assume they represent data_local_config
+ try:
+ local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")}
+ self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf)
+ del self.params["input_folder"]
+ del self.params["output_folder"]
+ except:
+ pass
+
+ def transform(self):
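+        # convert the stored parameters into command-line style arguments for the launcher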
+ sys.argv = ParamsUtils.dict_to_req(d=(self.params))
+ # create launcher
+ launcher = PythonTransformLauncher(TokenizationPythonConfiguration())
+ # launch
+ return_code = launcher.launch()
+ return return_code
+
+
if __name__ == "__main__":
launcher = PythonTransformLauncher(TokenizationPythonConfiguration())
launcher.launch()
diff --git a/transforms/universal/tokenization/python/src/tokenization_utils.py b/transforms/universal/tokenization/dpk_tokenization/utils.py
similarity index 100%
rename from transforms/universal/tokenization/python/src/tokenization_utils.py
rename to transforms/universal/tokenization/dpk_tokenization/utils.py
diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile
index c43105ff1..858db1b0a 100644
--- a/transforms/universal/tokenization/kfp_ray/Makefile
+++ b/transforms/universal/tokenization/kfp_ray/Makefile
@@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../
WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate
include $(REPOROOT)/transforms/.make.workflows
-# Include the common configuration for this transform
-include ../transform.config
+SRC_DIR=${CURDIR}/../
+# Use the docker image that is built for ray runtime
+TRANSFORM_RUNTIME=ray
+## Override settings in .make.defaults as they assume the old structure with ray being the current folder
+DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME)
+DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION)
-SRC_DIR=${CURDIR}/../ray/
+# Only build the image with -f Dockerfile.ray
+BUILD_SPECIFIC_RUNTIME=ray
PYTHON_WF := $(shell find ./ -name '*_wf.py')
YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF})
@@ -15,29 +20,8 @@ workflow-venv: .check_python_version ${WORKFLOW_VENV_ACTIVATE}
.PHONY: clean
clean:
@# Help: Clean up the virtual environment.
- rm -rf ${REPOROOT}/transforms/venv
+ rm -rf ${REPOROOT}/transforms/venv
-venv::
-
-build::
-
-setup::
-
-test::
-
-test-src::
-
-test-image::
-
-publish::
-
-image::
-
-kind-load-image::
-
-docker-load-image::
-
-docker-save-image::
.PHONY: workflow-build
workflow-build: workflow-venv
@@ -45,10 +29,19 @@ workflow-build: workflow-venv
.PHONY: workflow-test
workflow-test: workflow-build
- $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=tokenization_wf.yaml
+ $(MAKE) TRANSFORM_SRC=${SRC_DIR} \
+ TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \
+ TRANSFORM_NAME=$(TRANSFORM_NAME) \
+ BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \
+ DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \
+ PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline
.PHONY: workflow-upload
workflow-upload: workflow-build
@for file in $(YAML_WF); do \
$(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \
done
+
+
+
+
diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
index c131d11ea..c9fb6f2e9 100644
--- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
+++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py
@@ -18,7 +18,7 @@
# the name of the job script
-EXEC_SCRIPT_NAME: str = "tokenization_transform_ray.py"
+EXEC_SCRIPT_NAME: str = "-m dpk_tokenization.ray.transform"
task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest"
@@ -112,7 +112,14 @@ def tokenization(
ray_name: str = "tkn-kfp-ray", # name of Ray cluster
# Add image_pull_secret and image_pull_policy to ray workers if needed
ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image},
- ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image},
+ ray_worker_options: dict = {
+ "replicas": 2,
+ "max_replicas": 2,
+ "min_replicas": 2,
+ "cpu": 2,
+ "memory": 4,
+ "image": task_image,
+ },
server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888",
# data access
data_s3_config: str = "{'input_folder': 'test/tokenization/ds01/input/', 'output_folder': 'test/tokenization/ds01/output/'}",
@@ -120,9 +127,9 @@ def tokenization(
data_max_files: int = -1,
data_num_samples: int = -1,
# orchestrator
- runtime_actor_options: dict = {'num_cpus': 0.8},
+ runtime_actor_options: dict = {"num_cpus": 0.8},
runtime_pipeline_id: str = "pipeline_id",
- runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'},
+ runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"},
# tokenizer parameters
tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer",
tkn_doc_id_column: str = "document_id",
@@ -175,7 +182,9 @@ def tokenization(
:return: None
"""
# create clean_up task
- clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params)
+ clean_up_task = cleanup_ray_op(
+ ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params
+ )
ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2)
# pipeline definition
with dsl.ExitHandler(clean_up_task):
diff --git a/transforms/universal/tokenization/python/.dockerignore b/transforms/universal/tokenization/python/.dockerignore
deleted file mode 100644
index f7275bbbd..000000000
--- a/transforms/universal/tokenization/python/.dockerignore
+++ /dev/null
@@ -1 +0,0 @@
-venv/
diff --git a/transforms/universal/tokenization/python/Makefile b/transforms/universal/tokenization/python/Makefile
deleted file mode 100644
index 8f4f7fbf5..000000000
--- a/transforms/universal/tokenization/python/Makefile
+++ /dev/null
@@ -1,65 +0,0 @@
-# Define the root of the local git clone for the common rules to be able
-# know where they are running from.
-REPOROOT=../../../..
-
-# Set this, before including .make.defaults, to
-# 1 if requirements reference the latest code in the data processing library
-# in this repo (that is not yet published to pypi). This is the default setting.
-# 0 if the transforms DPK dependencies are on wheels published to
-# pypi (e.g. data-prep-toolkit=0.2.1)
-#USE_REPO_LIB_SRC=1
-
-# Include a library of common .transform.* targets which most
-# transforms should be able to reuse. However, feel free
-# to override/redefine the rules below.
-include $(REPOROOT)/transforms/.make.transforms
-
-# Include the common configuration for this transform
-include ../transform.config
-
-venv:: .transforms.python-venv
-
-test:: .transforms.python-test
-
-clean:: .transforms.clean
-
-image:: .transforms.python-image
-
-test-src:: .transforms.test-src
-
-setup:: .transforms.setup
-
-build:: build-dist image
-
-publish: publish-image
-
-publish-image:: .transforms.publish-image-python
-
-setup:: .transforms.setup
-
-# distribution versions is the same as image version.
-set-versions:
- $(MAKE) TRANSFORM_PYTHON_VERSION=$(TOKENIZATION_PYTHON_VERSION) TOML_VERSION=$(TOKENIZATION_PYTHON_VERSION) .transforms.set-versions
-
-build-dist:: .defaults.build-dist
-
-publish-dist:: .defaults.publish-dist
-
-test-image:: .transforms.python-test-image
-
-run-cli-sample:
- $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \
- RUN_ARGS="--data_local_config \"{ 'input_folder' : '../test-data/ds01/input', 'output_folder' : '../output'}\" \
- " .transforms.run-src-file
-
-run-local-sample: .transforms.run-local-python-sample
-
-#run-s3-sample: .transforms.run-s3-sample
-
-minio-start: .minio-start
-
-kind-load-image:: .transforms.kind-load-image
-
-docker-load-image: .defaults.docker-load-image
-
-docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/tokenization/python/README.md b/transforms/universal/tokenization/python/README.md
deleted file mode 100644
index 0c470bb73..000000000
--- a/transforms/universal/tokenization/python/README.md
+++ /dev/null
@@ -1,100 +0,0 @@
- Distributed tokenization module for data sets using any Hugging Face compatible tokenizer.
-
-
-
-## 📝 Table of Contents
-- [Summary](#Summary)
-- [Running](#Running)
-- [CLI Options](#cli_options)
-
-# Data Tokenization
-Please see the set of
-[transform project conventions](../../../README.md)
-for details on general project conventions, transform configuration,
-testing and IDE set up.
-
-## Summary
-The data tokenization transform operates by converting a (non-empty) input table into an output table
-using a pre-trained tokenizer. The input table is required to have a minimum of two columns,
-named `document_id` and `contents` by default. However, alternate column names can be specified using
-`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents.
-It is essential for the values within the `document_id` column to be unique across the dataset,
-while the `contents` column stores their respective document content. To execute example demonstrations within this directory,
-a machine with `64GiB` of RAM is recommended.
-
-To specify a pre-trained tokenizer, utilize the `--tkn_tokenizer` parameter.
-This parameter accepts the name of a tokenizer ready for download from Hugging Face,
-such as `hf-internal-testing/llama-tokenizer, bigcode/starcoder`, or any other tokenizer compatible
-with the Hugging Face AutoTokenizer library. Additionally, you can employ the `--tkn_tokenizer_args` parameter
-to include extra arguments specific to the chosen tokenizer.
-For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which necessitate an access token,
-you can specify `use_auth_token=` in `--tkn_tokenizer`.
-
-The tokenization transformer utilizes the specified tokenizer to tokenize each row,
-assuming each row represents a document, in the input table and save it to a corresponding row in the output table.
-The output table generally consists of four columns: `tokens, document_id, document_length`, and `token_count`.
-
-The `tokens` stores the sequence of token IDs generated by the tokenizer during the document tokenization process.
-The `document_id` (or the designated name specified in `--tkn_doc_id_column`) contains the document ID,
-while `document_length` and `token_count` respectively record the length of the document and the total count of generated tokens.
-During tokenization, the tokenizer will disregard empty documents (rows) in the input table,
-as well as documents that yield no tokens or encounter failure during tokenization.
-The count of such documents will be stored in the `num_empty_rows` field of the `metadata` file.
-
-
-In certain cases, the tokenization process of some tokenizers may be sluggish,
-particularly when handling lengthy documents containing millions of characters.
-To address this, you can employ the `--tkn_chunk_size` parameter to define the length of chunks to tokenize at a given time.
-For English text (`en`), it is recommended to set the chunk size to `20,000`, roughly equivalent to `15` pages of text.
-The tokenizer will then tokenize each chunk separately and combine their resulting token IDs.
-By default, the value of `--tkn_chunk_size` is `0`, indicating that each document is tokenized as a whole, regardless of its length.
-
-
-
-## Running
-
-### CLI Options
-The following command line arguments are available in addition to
-the options provided by the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md)
-and the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md).
-```
- --tkn_tokenizer TKN_TOKENIZER
- Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By defaut, `hf-internal-testing/llama-tokenizer` from HuggingFace is used
- --tkn_tokenizer_args TKN_TOKENIZER_ARGS
- Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace
- --tkn_doc_id_column TKN_DOC_ID_COLUMN
- Column contains document id which values should be unique across dataset
- --tkn_doc_content_column TKN_DOC_CONTENT_COLUMN
- Column contains document content
- --tkn_text_lang TKN_TEXT_LANG
- Specify language used in the text content for better text splitting if needed
- --tkn_chunk_size TKN_CHUNK_SIZE
- Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)
-```
-
-### Running the samples
-To run the samples, use the following `make` targets
-
-* `run-cli-sample` - runs src/tokenization_transform_python.py using command line args
-* `run-local-sample` - runs src/tokenization_local_python.py
-
-These targets will activate the virtual environment and set up any configuration needed.
-Use the `-n` option of `make` to see the detail of what is done to run the sample.
-
-For example,
-```shell
-make run-cli-sample
-...
-```
-Then
-```shell
-ls output
-```
-To see results of the transform.
-
-
-### Transforming data using the transform image
-
-To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
-substituting the name of this transform image and runtime as appropriate.
diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml
deleted file mode 100644
index fb9ee0c48..000000000
--- a/transforms/universal/tokenization/python/pyproject.toml
+++ /dev/null
@@ -1,53 +0,0 @@
-[project]
-name = "dpk_tokenization_transform_python"
-keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
-version = "0.2.3.dev2"
-requires-python = ">=3.10,<3.13"
-description = "Tokenization Transform for Python"
-license = {text = "Apache-2.0"}
-readme = {file = "README.md", content-type = "text/markdown"}
-authors = [
- { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"},
-]
-
-[project_urls]
-Repository = "https://github.com/IBM/data-prep-kit"
-Issues = "https://github.com/IBM/data-prep-kit/issues"
-Documentation = "https://ibm.github.io/data-prep-kit/"
-"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/tokenization"
-
-dynamic = ["dependencies"]
-
-[build-system]
-requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
-build-backend = "setuptools.build_meta"
-
-[tool.setuptools.dynamic]
-dependencies = {file = ["requirements.txt"]}
-
-[project.optional-dependencies]
-dev = [
- "twine",
- "pytest>=7.3.2",
- "pytest-dotenv>=0.5.2",
- "pytest-env>=1.0.0",
- "pre-commit>=3.3.2",
- "pytest-cov>=4.1.0",
- "pytest-mock>=3.10.0",
- "moto==5.0.5",
- "markupsafe==2.0.1",
-]
-
-[options]
-package_dir = ["src","test"]
-
-[options.packages.find]
-where = ["src/"]
-
-[tool.pytest.ini_options]
-# Currently we use low coverage since we have to run tests separately (see makefile)
-#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
-markers = ["unit: unit tests", "integration: integration tests"]
-
-[tool.coverage.run]
-include = ["src/*"]
diff --git a/transforms/universal/tokenization/ray/.dockerignore b/transforms/universal/tokenization/ray/.dockerignore
deleted file mode 100644
index f7275bbbd..000000000
--- a/transforms/universal/tokenization/ray/.dockerignore
+++ /dev/null
@@ -1 +0,0 @@
-venv/
diff --git a/transforms/universal/tokenization/ray/.gitignore b/transforms/universal/tokenization/ray/.gitignore
deleted file mode 100644
index 3ea7fd4ab..000000000
--- a/transforms/universal/tokenization/ray/.gitignore
+++ /dev/null
@@ -1,38 +0,0 @@
-test-data/output
-output/*
-/output/
-data-processing-lib/
-
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-
-# Distribution / packaging
-bin/
-build/
-develop-eggs/
-dist/
-eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-.tox/
-htmlcov
-.coverage
-.cache
-nosetests.xml
-coverage.xml
\ No newline at end of file
diff --git a/transforms/universal/tokenization/ray/Makefile b/transforms/universal/tokenization/ray/Makefile
deleted file mode 100644
index 0a4e3a370..000000000
--- a/transforms/universal/tokenization/ray/Makefile
+++ /dev/null
@@ -1,65 +0,0 @@
-# Define the root of the local git clone for the common rules to be able
-# know where they are running from.
-REPOROOT=../../../..
-
-# Set this, before including .make.defaults, to
-# 1 if requirements reference the latest code in the data processing library
-# in this repo (that is not yet published to pypi). This is the default setting.
-# 0 if the transforms DPK dependencies are on wheels published to
-# pypi (e.g. data-prep-toolkit=0.2.1)
-#USE_REPO_LIB_SRC=1
-
-# Include a library of common .transform.* targets which most
-# transforms should be able to reuse. However, feel free
-# to override/redefine the rules below.
-include $(REPOROOT)/transforms/.make.transforms
-
-# Include the common configuration for this transform
-include ../transform.config
-
-BASE_IMAGE=${RAY_BASE_IMAGE}
-venv:: .transforms.ray-venv
-
-test:: .transforms.ray-test
-
-clean:: .transforms.clean
-
-image:: .transforms.ray-image
-
-test-src:: .transforms.test-src
-
-setup:: .transforms.setup
-
-build:: build-dist image
-
-publish: publish-image
-
-publish-image:: .transforms.publish-image-ray
-
-setup:: .transforms.setup
-
-set-versions:
- $(MAKE) TRANSFORM_PYTHON_VERSION=$(TOKENIZATION_PYTHON_VERSION) TOML_VERSION=$(TOKENIZATION_RAY_VERSION) .transforms.set-versions
-
-build-dist:: .defaults.build-dist
-
-publish-dist:: .defaults.publish-dist
-
-test-image:: .transforms.ray-test-image
-
-run-cli-sample:
- $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \
- RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/ds01/input', 'output_folder' : '../output'}\" \
- " .transforms.run-src-file
-
-#run-local-sample: .transforms.run-local-sample
-
-run-s3-sample: .transforms.run-s3-ray-sample
-
-minio-start: .minio-start
-
-kind-load-image:: .transforms.kind-load-image
-
-docker-load-image: .defaults.docker-load-image
-
-docker-save-image: .defaults.docker-save-image
diff --git a/transforms/universal/tokenization/ray/README.md b/transforms/universal/tokenization/ray/README.md
deleted file mode 100644
index 1181d6878..000000000
--- a/transforms/universal/tokenization/ray/README.md
+++ /dev/null
@@ -1,49 +0,0 @@
-# Tokenization Transform for Ray
-Please see the set of
-[transform project conventions](../../../README.md#transform-project-conventions)
-for details on general project conventions, transform configuration,
-testing and IDE set up.
-
-## Summary
-This project wraps the [tokenization transform](../python) with a Ray runtime.
-
-## Configuration and command line Options
-
-Noop configuration and command line options are the same as for the base python transform.
-
-## Running
-
-### Launched Command Line Options
-In addition to those available to the transform as defined in [here](../python/README.md),
-the set of
-[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available.
-
-### Running the samples
-To run the samples, use the following `make` targets
-
-* `run-cli-sample` - runs src/tokenization_transform_ray.py using command line args
-* `run-local-sample` - runs src/tokenization_local_ray.py
-* `run-s3-sample` - runs src/filter_s3_ray.py
- * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html)
- and [here](https://min.io/docs/minio/linux/index.html)
- and invocation of `make minio-start` to load data into local minio for S3 access.
-
-These targets will activate the virtual environment and set up any configuration needed.
-Use the `-n` option of `make` to see the detail of what is done to run the sample.
-
-For example,
-```shell
-make run-cli-sample
-...
-```
-Then
-```shell
-ls output
-```
-To see results of the transform.
-
-### Transforming data using the transform image
-
-To use the transform image to transform your data, please refer to the
-[running images quickstart](../../../../doc/quick-start/run-transform-image.md),
-substituting the name of this transform image and runtime as appropriate.
diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml
deleted file mode 100644
index 0829e002c..000000000
--- a/transforms/universal/tokenization/ray/pyproject.toml
+++ /dev/null
@@ -1,45 +0,0 @@
-[project]
-name = "dpk_tokenization_transform_ray"
-version = "0.2.3.dev2"
-requires-python = ">=3.10,<3.13"
-description = "Tokenization Transform for Ray"
-license = {text = "Apache-2.0"}
-readme = {file = "README.md", content-type = "text/markdown"}
-authors = [
- { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"},
-]
-dependencies = [
- "dpk-tokenization-transform-python==0.2.3.dev2",
- "data-prep-toolkit[ray]>=0.2.3.dev2",
-]
-
-[build-system]
-requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"]
-build-backend = "setuptools.build_meta"
-
-[project.optional-dependencies]
-dev = [
- "twine",
- "pytest>=7.3.2",
- "pytest-dotenv>=0.5.2",
- "pytest-env>=1.0.0",
- "pre-commit>=3.3.2",
- "pytest-cov>=4.1.0",
- "pytest-mock>=3.10.0",
- "moto==5.0.5",
- "markupsafe==2.0.1",
-]
-
-[options]
-package_dir = ["src","test"]
-
-[options.packages.find]
-where = ["src/"]
-
-[tool.pytest.ini_options]
-# Currently we use low coverage since we have to run tests separately (see makefile)
-#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
-markers = ["unit: unit tests", "integration: integration tests"]
-
-[tool.coverage.run]
-include = ["src/*"]
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
deleted file mode 100644
index e452fbf9a..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet
deleted file mode 100644
index bff7a5ed9..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet
deleted file mode 100644
index fbdeaf64b..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json b/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json
deleted file mode 100644
index e6c190807..000000000
--- a/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json
+++ /dev/null
@@ -1,59 +0,0 @@
-{
- "pipeline": "pipeline_id",
- "job details": {
- "job category": "preprocessing",
- "job name": "Tokenization",
- "job type": "ray",
- "job id": "job_id",
- "start_time": "2024-03-29 13:30:56",
- "end_time": "2024-03-29 13:30:57",
- "status": "success"
- },
- "code": {
- "github": "github",
- "commit_hash": "12345",
- "path": "path"
- },
- "job_input_params": {
- "tokenizer": "hf-internal-testing/llama-tokenizer",
- "tokenizer_args": null,
- "doc_id_column": "document_id",
- "doc_content_column": "contents",
- "text_lang": "en",
- "chunk_size": 0,
- "checkpointing": false,
- "max_files": -1,
- "number of workers": 5,
- "worker options": {
- "num_cpus": 0.8
- },
- "actor creation delay": 0
- },
- "execution_stats": {
- "cpus": 10,
- "gpus": 0,
- "memory": 27.31659088190645,
- "object_store": 2.0
- },
- "job_output_stats": {
- "source_files": 5,
- "source_size": 450,
- "result_files": 3,
- "result_size": 842,
- "table_processing": 0.03880786895751953,
- "num_files": 3,
- "num_rows": 6,
- "num_tokenized_rows": 6,
- "num_tokens": 85,
- "num_chars": 384,
- "skipped empty tables": 2
- },
- "source": {
- "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds01/input",
- "type": "path"
- },
- "target": {
- "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds01",
- "type": "path"
- }
-}
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
deleted file mode 100644
index 83bdac0b4..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet
deleted file mode 100644
index 5a86a7b13..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet
deleted file mode 100644
index 5a86a7b13..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet
deleted file mode 100644
index 07fd2adfe..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet
deleted file mode 100644
index 879fcf0f0..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet b/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet
deleted file mode 100644
index e81983916..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json b/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json
deleted file mode 100644
index dc9813beb..000000000
--- a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
- "pipeline": "pipeline_id",
- "job details": {
- "job category": "preprocessing",
- "job name": "Tokenization",
- "job type": "ray",
- "job id": "job_id",
- "start_time": "2024-03-29 14:03:15",
- "end_time": "2024-03-29 14:03:32",
- "status": "success"
- },
- "code": {
- "github": "github",
- "commit_hash": "12345",
- "path": "path"
- },
- "job_input_params": {
- "tokenizer": "hf-internal-testing/llama-tokenizer",
- "tokenizer_args": null,
- "doc_id_column": "document_id",
- "doc_content_column": "contents",
- "text_lang": "en",
- "chunk_size": 20000,
- "checkpointing": false,
- "max_files": -1,
- "number of workers": 5,
- "worker options": {
- "num_cpus": 0.8
- },
- "actor creation delay": 0
- },
- "execution_stats": {
- "cpus": 10,
- "gpus": 0,
- "memory": 27.180484008975327,
- "object_store": 2.0
- },
- "job_output_stats": {
- "source_files": 1,
- "source_size": 16863266,
- "result_files": 1,
- "result_size": 37109764,
- "table_processing": 15.886597871780396,
- "num_files": 1,
- "num_rows": 1,
- "num_tokenized_rows": 1,
- "num_tokens": 4638717,
- "num_chars": 16836009
- },
- "source": {
- "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds02/input",
- "type": "path"
- },
- "target": {
- "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds02",
- "type": "path"
- }
-}
diff --git a/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet b/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet
deleted file mode 100644
index b7f3df71b..000000000
Binary files a/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet and /dev/null differ
diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/requirements.txt
similarity index 100%
rename from transforms/universal/tokenization/python/requirements.txt
rename to transforms/universal/tokenization/requirements.txt
diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq01.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq01.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq01.parquet
rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq01.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq02.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq02.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq02.parquet
rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq02.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/metadata.json b/transforms/universal/tokenization/test-data/ds01/expected/metadata.json
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/expected/metadata.json
rename to transforms/universal/tokenization/test-data/ds01/expected/metadata.json
diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet
rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet
rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq01.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq01.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq01.parquet
rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/pq01.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq02.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq02.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq02.parquet
rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/pq02.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds02/expected/df_17m.parquet b/transforms/universal/tokenization/test-data/ds02/expected/df_17m.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds02/expected/df_17m.parquet
rename to transforms/universal/tokenization/test-data/ds02/expected/df_17m.parquet
diff --git a/transforms/universal/tokenization/python/test-data/ds02/expected/metadata.json b/transforms/universal/tokenization/test-data/ds02/expected/metadata.json
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds02/expected/metadata.json
rename to transforms/universal/tokenization/test-data/ds02/expected/metadata.json
diff --git a/transforms/universal/tokenization/python/test-data/ds02/input/df_17m.parquet b/transforms/universal/tokenization/test-data/ds02/input/df_17m.parquet
similarity index 100%
rename from transforms/universal/tokenization/python/test-data/ds02/input/df_17m.parquet
rename to transforms/universal/tokenization/test-data/ds02/input/df_17m.parquet
diff --git a/transforms/universal/tokenization/python/test/test_tokenization.py b/transforms/universal/tokenization/test/test_tokenization.py
similarity index 97%
rename from transforms/universal/tokenization/python/test/test_tokenization.py
rename to transforms/universal/tokenization/test/test_tokenization.py
index e4f13fd13..3cb53a047 100644
--- a/transforms/universal/tokenization/python/test/test_tokenization.py
+++ b/transforms/universal/tokenization/test/test_tokenization.py
@@ -16,7 +16,7 @@
from data_processing.test_support.transform.table_transform_test import (
AbstractTableTransformTest,
)
-from tokenization_transform import TokenizationTransform
+from dpk_tokenization.transform import TokenizationTransform
"""
diff --git a/transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py b/transforms/universal/tokenization/test/test_tokenization_long_doc_python.py
similarity index 95%
rename from transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py
rename to transforms/universal/tokenization/test/test_tokenization_long_doc_python.py
index ef68ce549..5fe98689e 100644
--- a/transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py
+++ b/transforms/universal/tokenization/test/test_tokenization_long_doc_python.py
@@ -16,7 +16,7 @@
from data_processing.test_support.launch.transform_test import (
AbstractTransformLauncherTest,
)
-from tokenization_transform_python import TokenizationPythonConfiguration
+from dpk_tokenization.transform_python import TokenizationPythonConfiguration
tkn_params = {
diff --git a/transforms/universal/tokenization/python/test/test_tokenization_python.py b/transforms/universal/tokenization/test/test_tokenization_python.py
similarity index 95%
rename from transforms/universal/tokenization/python/test/test_tokenization_python.py
rename to transforms/universal/tokenization/test/test_tokenization_python.py
index c198d561a..00a3a6f58 100644
--- a/transforms/universal/tokenization/python/test/test_tokenization_python.py
+++ b/transforms/universal/tokenization/test/test_tokenization_python.py
@@ -16,7 +16,7 @@
from data_processing.test_support.launch.transform_test import (
AbstractTransformLauncherTest,
)
-from tokenization_transform_python import TokenizationPythonConfiguration
+from dpk_tokenization.transform_python import TokenizationPythonConfiguration
tkn_params = {
diff --git a/transforms/universal/tokenization/ray/test/test_tokenization_ray.py b/transforms/universal/tokenization/test/test_tokenization_ray.py
similarity index 95%
rename from transforms/universal/tokenization/ray/test/test_tokenization_ray.py
rename to transforms/universal/tokenization/test/test_tokenization_ray.py
index 17791e370..6ef31a914 100644
--- a/transforms/universal/tokenization/ray/test/test_tokenization_ray.py
+++ b/transforms/universal/tokenization/test/test_tokenization_ray.py
@@ -16,7 +16,7 @@
AbstractTransformLauncherTest,
)
from data_processing_ray.runtime.ray import RayTransformLauncher
-from tokenization_transform_ray import TokenizationRayConfiguration
+from dpk_tokenization.ray.transform import TokenizationRayConfiguration
tkn_params = {
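
For context on the import-path changes above: with the sources consolidated under `dpk_tokenization/`, a launcher-style run of the Python transform would look roughly like the sketch below. This is a minimal illustration assuming the standard data-prep-kit `PythonTransformLauncher` and `ParamsUtils` helpers; only the `dpk_tokenization` import path comes from this change set.

```python
# Minimal sketch of a launcher-style run after the package rename.
# Assumes the standard data-prep-kit launcher and ParamsUtils helpers;
# only the dpk_tokenization import path is taken from this diff.
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from dpk_tokenization.transform_python import TokenizationPythonConfiguration

local_conf = {
    "input_folder": "test-data/ds02/input",  # sample data shipped with the transform
    "output_folder": "output",
}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "tkn_tokenizer": "hf-internal-testing/llama-tokenizer",
    "tkn_chunk_size": 20_000,
}

# Build argv for the launcher and run the tokenization transform
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = PythonTransformLauncher(runtime_config=TokenizationPythonConfiguration())
launcher.launch()
```

The Ray variant is analogous, pairing `RayTransformLauncher` with `TokenizationRayConfiguration` from `dpk_tokenization.ray.transform`, as the updated test above imports.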
diff --git a/transforms/universal/tokenization/tokenization.ipynb b/transforms/universal/tokenization/tokenization.ipynb
new file mode 100644
index 000000000..8b59e6885
--- /dev/null
+++ b/transforms/universal/tokenization/tokenization.ipynb
@@ -0,0 +1,185 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "afd55886-5f5b-4794-838e-ef8179fb0394",
+ "metadata": {},
+ "source": [
+ "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n",
+ "```\n",
+ "make venv \n",
+ "source venv/bin/activate \n",
+ "pip install jupyterlab\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%capture\n",
+ "## This is here as a reference only\n",
+ "# Users and application developers must use the right tag for the latest from pypi\n",
+ "!pip install data-prep-toolkit\n",
+ "!pip install data-prep-toolkit-transforms[tokenization]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3",
+ "metadata": {
+ "jp-MarkdownHeadingCollapsed": true
+ },
+ "source": [
+ "##### **** Configure the transform parameters. The set of dictionary keys holding DocIDTransform configuration for values are as follows: \n",
+ "| Name | Description|\n",
+ "| -----|------------|\n",
+ "|tkn_tokenizer | Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By defaut, `hf-internal-testing/llama-tokenizer` from HuggingFace is used |\n",
+ "|tkn_tokenizer_args |Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace|\n",
+ "|tkn_doc_id_column|Column contains document id which values should be unique across dataset|\n",
+ "|tkn_doc_content_column|Column contains document content|\n",
+ "|tkn_text_lang|Specify language used in the text content for better text splitting if needed|\n",
+ "|tkn_chunk_size|Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)|\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ebf1f782-0e61-485c-8670-81066beb734c",
+ "metadata": {},
+ "source": [
+ "##### ***** Import required classes and modules"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n"
+ ]
+ }
+ ],
+ "source": [
+ "from dpk_tokenization.transform_python import Tokenization"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7234563c-2924-4150-8a31-4aec98c1bf33",
+ "metadata": {},
+ "source": [
+ "##### ***** Setup runtime parameters for this transform"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "16:52:19 INFO - pipeline id pipeline_id\n",
+ "16:52:19 INFO - code location None\n",
+ "16:52:19 INFO - data factory data_ is using local data access: input_folder - test-data/ds02/input output_folder - output\n",
+ "16:52:19 INFO - data factory data_ max_files -1, n_sample -1\n",
+ "16:52:19 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n",
+ "16:52:19 INFO - orchestrator Tokenization started at 2024-12-11 16:52:19\n",
+ "16:52:19 INFO - Number of files is 1, source profile {'max_file_size': 6.856490135192871, 'min_file_size': 6.856490135192871, 'total_file_size': 6.856490135192871}\n",
+ "Token indices sequence length is longer than the specified maximum sequence length for this model (5256 > 2048). Running this sequence through the model will result in indexing errors\n",
+ "16:52:33 INFO - Completed 1 files (100.0%) in 0.228 min\n",
+ "16:52:33 INFO - Done processing 1 files, waiting for flush() completion.\n",
+ "16:52:33 INFO - done flushing in 0.0 sec\n",
+ "16:52:33 INFO - Completed execution in 0.235 min, execution result 0\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Tokenization(input_folder= \"test-data/ds02/input\",\n",
+ " output_folder= \"output\",\n",
+ " tkn_tokenizer= \"hf-internal-testing/llama-tokenizer\",\n",
+ " tkn_chunk_size= 20_000).transform()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c3df5adf-4717-4a03-864d-9151cd3f134b",
+ "metadata": {},
+ "source": [
+ "##### **** The specified folder will include the transformed parquet files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "7276fe84-6512-4605-ab65-747351e13a7c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['output/lang=en', 'output/metadata.json', 'output/df_17m.parquet']"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import glob\n",
+ "glob.glob(\"output/*\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
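
The notebook above only exercises `tkn_tokenizer` and `tkn_chunk_size`. Assuming the remaining `tkn_*` keys from the parameter table are accepted as keyword arguments in the same way (an assumption, not demonstrated in this diff), a fuller invocation might look like the sketch below; the column names are placeholders.

```python
# Hypothetical variation of the notebook's invocation, exercising more of the
# tkn_* parameters documented in the table. Column names are placeholders and
# the tokenizer/tokenizer-args values mirror the examples given in the table.
from dpk_tokenization.transform_python import Tokenization

Tokenization(
    input_folder="test-data/ds02/input",
    output_folder="output",
    tkn_tokenizer="bigcode/starcoder",       # any HF tokenizer name or local path
    tkn_tokenizer_args="cache_dir=/tmp/hf",  # comma-separated key=value pairs
    tkn_doc_id_column="document_id",         # placeholder: column with unique ids
    tkn_doc_content_column="contents",       # placeholder: column with the text
    tkn_text_lang="en",
    tkn_chunk_size=20_000,
).transform()
```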
diff --git a/transforms/universal/tokenization/transform.config b/transforms/universal/tokenization/transform.config
deleted file mode 100644
index 04f517d42..000000000
--- a/transforms/universal/tokenization/transform.config
+++ /dev/null
@@ -1,20 +0,0 @@
-#
-# This is intended to be included across the Makefiles provided within
-# a given transform's directory tree, so must use compatible syntax.
-#
-################################################################################
-# This defines the name of the transform and is used to match against
-# expected files and is used to define the transform's image name.
-TRANSFORM_NAME=tokenization
-
-################################################################################
-# This defines the transforms' version number as would be used
-# when publishing the wheel. In general, only the micro version
-# number should be advanced relative to the DPK_VERSION.
-#
-# If you change the versions numbers, be sure to run "make set-versions" to
-# update version numbers across the transform (e.g., pyproject.toml).
-TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION)
-TOKENIZATION_RAY_VERSION=$(TOKENIZATION_PYTHON_VERSION)
-TOKENIZATION_SPARK_VERSION=$(TOKENIZATION_PYTHON_VERSION)
-