
Commit

merge tokenization
Signed-off-by: matouma <[email protected]>
matouma committed Dec 15, 2024
2 parents 352e267 + 0c3ae86 commit b7c74f9
Showing 56 changed files with 418 additions and 716 deletions.
16 changes: 16 additions & 0 deletions transforms/Makefile.transform.template
@@ -0,0 +1,16 @@
REPOROOT=../../..
# Use `make help` to see the available rules
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform; it is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

################################################################################


@@ -18,21 +18,10 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root dpk_tokenization/ dpk_tokenization/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/tokenization_transform_python.py .

# copy some of the samples in
COPY src/tokenization_local_python.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk
@@ -13,24 +13,11 @@ ARG DPK_WHEEL_FILE_NAME
COPY --chown=ray:users data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

## Copy the python version of the transform
COPY --chown=ray:users python-transform/ python-transform
RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e .

COPY --chown=ray:users dpk_tokenization/ dpk_tokenization/
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/tokenization_transform_ray.py .

# copy some of the samples in
COPY src/tokenization_local_ray.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/ray
88 changes: 15 additions & 73 deletions transforms/universal/tokenization/Makefile
@@ -1,79 +1,21 @@
REPOROOT=../../..
# Use `make help` to see the available rules
include $(REPOROOT)/.make.defaults
include $(REPOROOT)/transforms/.make.cicd.targets

setup::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform; it is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

clean::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
################################################################################

build::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse
venv::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse

image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

set-versions:
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-src::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

kind-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-save-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-venv; \
fi

.PHONY: workflow-test
workflow-test:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-test; \
fi

.PHONY: workflow-upload
workflow-upload:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-upload; \
fi

.PHONY: workflow-build
workflow-build:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-build; \
fi

run-cli-sample:
make venv
source venv/bin/activate && \
$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
--data_local_config "{ 'input_folder' : 'test-data/ds01/input', 'output_folder' : 'output'}"
133 changes: 121 additions & 12 deletions transforms/universal/tokenization/README.md
@@ -1,13 +1,122 @@
# Tokenization Transform

<p align="Left"> Distributed tokenization module for data sets using any Hugging Face compatible tokenizer.
<br>
</p>

The tokenization transform annotates pyarrow tables and parquet files
to add a column containing tokens for the document column.

## Contributors

- Xuan-Hong Dang ([email protected])

# Data Tokenization
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

The following runtimes are available:

* [python](python/README.md) - provides the core python-based transformation
  implementation.
* [ray](ray/README.md) - enables running the python-based transformation
  in a Ray runtime.
* [kfp](kfp_ray/README.md) - enables running the Ray docker image of
  the transform in a Kubernetes cluster using a generated `yaml` file.

## Summary
The data tokenization transform operates by converting a (non-empty) input table into an output table
using a pre-trained tokenizer. The input table is required to have a minimum of two columns,
named `document_id` and `contents` by default. However, alternate column names can be specified using
`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents.
The values in the `document_id` column must be unique across the dataset,
while the `contents` column stores the corresponding document text. To execute the example demonstrations within this directory,
a machine with `64GiB` of RAM is recommended.
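
For illustration, a minimal input table with the default column names could be built as in the following sketch (the ids and contents are made up for the example):
```python
import pyarrow as pa

# Minimal sketch of an input table using the default column names.
# The ids and contents below are purely illustrative.
input_table = pa.table(
    {
        "document_id": ["doc-001", "doc-002"],  # must be unique across the dataset
        "contents": ["First document text.", "Second document text."],
    }
)
print(input_table.schema)
```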

To specify a pre-trained tokenizer, utilize the `--tkn_tokenizer` parameter.
This parameter accepts the name of a tokenizer ready for download from Hugging Face,
such as `hf-internal-testing/llama-tokenizer, bigcode/starcoder`, or any other tokenizer compatible
with the Hugging Face AutoTokenizer library. Additionally, you can employ the `--tkn_tokenizer_args` parameter
to include extra arguments specific to the chosen tokenizer.
For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which requires an access token,
you can specify `use_auth_token=<your token>` in `--tkn_tokenizer_args`.
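
As a rough sketch of passing these options programmatically (mirroring the local sample scripts included in this commit; the folder paths and tokenizer arguments below are placeholders):
```python
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from dpk_tokenization.transform_python import TokenizationPythonConfiguration

# Placeholder folders: point these at your own data.
local_conf = {"input_folder": "test-data/ds01/input", "output_folder": "output"}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "tkn_tokenizer": "hf-internal-testing/llama-tokenizer",
    # Extra tokenizer arguments, e.g. a cache dir or an auth token for gated models.
    "tkn_tokenizer_args": "cache_dir=/tmp/hf",
}
sys.argv = ParamsUtils.dict_to_req(d=params)
PythonTransformLauncher(runtime_config=TokenizationPythonConfiguration()).launch()
```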

The tokenization transform uses the specified tokenizer to tokenize each row
(assuming each row represents a document) of the input table and saves the result to the corresponding row of the output table.
The output table generally consists of four columns: `tokens`, `document_id`, `document_length`, and `token_count`.

The `tokens` column stores the sequence of token IDs generated by the tokenizer during the document tokenization process.
The `document_id` (or the designated name specified in `--tkn_doc_id_column`) contains the document ID,
while `document_length` and `token_count` respectively record the length of the document and the total count of generated tokens.
During tokenization, the tokenizer will disregard empty documents (rows) in the input table,
as well as documents that yield no tokens or encounter failure during tokenization.
The count of such documents will be stored in the `num_empty_rows` field of the `metadata` file.
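
As a quick sanity check of the output schema after a run, one could read back one of the produced parquet files; the file name below is hypothetical:
```python
import pyarrow.parquet as pq

# Read one of the produced parquet files (hypothetical file name) and
# list its columns, which should include the four described above.
table = pq.read_table("output/sample.parquet")
print(table.column_names)  # e.g. ['tokens', 'document_id', 'document_length', 'token_count']
```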


In certain cases, the tokenization process of some tokenizers may be sluggish,
particularly when handling lengthy documents containing millions of characters.
To address this, you can employ the `--tkn_chunk_size` parameter to define the length of chunks to tokenize at a given time.
For English text (`en`), it is recommended to set the chunk size to `20,000`, roughly equivalent to `15` pages of text.
The tokenizer will then tokenize each chunk separately and combine their resulting token IDs.
By default, the value of `--tkn_chunk_size` is `0`, indicating that each document is tokenized as a whole, regardless of its length.
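
The chunking idea itself can be illustrated with a standalone sketch (this is not the transform's internal code, and unlike the transform it cuts at fixed character offsets rather than rounding chunks to word boundaries):
```python
from transformers import AutoTokenizer


def tokenize_in_chunks(text: str, tokenizer, chunk_size: int = 20_000) -> list[int]:
    """Tokenize long text in fixed-size character chunks and concatenate the token IDs."""
    if chunk_size <= 0:
        return tokenizer(text)["input_ids"]
    token_ids: list[int] = []
    for start in range(0, len(text), chunk_size):
        token_ids.extend(tokenizer(text[start : start + chunk_size])["input_ids"])
    return token_ids


tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
print(len(tokenize_in_chunks("A long document. " * 10_000, tokenizer)))
```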



## Running

### CLI Options
The following command line arguments are available in addition to
the options provided by the [python launcher](../../../data-processing-lib/doc/python-launcher-options.md).
```
--tkn_tokenizer TKN_TOKENIZER
Tokenizer used for tokenization. It can also be a path to a pre-trained tokenizer. By default, `hf-internal-testing/llama-tokenizer` from HuggingFace is used
--tkn_tokenizer_args TKN_TOKENIZER_ARGS
Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace
--tkn_doc_id_column TKN_DOC_ID_COLUMN
Column that contains the document id; its values should be unique across the dataset
--tkn_doc_content_column TKN_DOC_CONTENT_COLUMN
Column that contains the document content
--tkn_text_lang TKN_TEXT_LANG
Specify language used in the text content for better text splitting if needed
--tkn_chunk_size TKN_CHUNK_SIZE
Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)
```

### Running the samples
To run the samples, use the following `make` target:

* `run-cli-sample` - runs dpk_tokenization/transform_python.py using command line args

This target will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the details of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
to see the results of the transform.

### Code example
Here is a sample [notebook](tokenization.ipynb)



### Transforming data using the transform image

To use the transform image to transform your data, please refer to the
[running images quickstart](../../../doc/quick-start/run-transform-image.md),
substituting the name of this transform image and runtime as appropriate.

# Tokenization Transform for Ray
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary
This project wraps the tokenization transform with a Ray runtime.

## Configuration and Command Line Options

Configuration and command line options are the same as for the base python transform.

### Launched Command Line Options
In addition to those available to the transform as defined here,
the set of
[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is also available.
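
A minimal sketch of launching the Ray runtime programmatically, mirroring the imports used by the Ray sample in this commit (folder paths are placeholders, and the `run_locally` flag is assumed to start a local Ray cluster):
```python
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from dpk_tokenization.ray.transform import TokenizationRayConfiguration

local_conf = {"input_folder": "test-data/ds01/input", "output_folder": "output"}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "run_locally": True,  # assumed launcher flag to start a local Ray cluster
}
sys.argv = ParamsUtils.dict_to_req(d=params)
RayTransformLauncher(TokenizationRayConfiguration()).launch()
```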

@@ -15,12 +15,12 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from tokenization_transform_python import TokenizationPythonConfiguration
from dpk_tokenization.transform_python import TokenizationPythonConfiguration


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
@@ -15,12 +15,12 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from tokenization_transform_python import TokenizationPythonConfiguration
from dpk_tokenization.transform_python import TokenizationPythonConfiguration


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds02", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds02"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds02", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds02"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
@@ -15,12 +15,12 @@

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from tokenization_transform_ray import TokenizationRayConfiguration
from dpk_tokenization.ray.transform import TokenizationRayConfiguration


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
@@ -15,7 +15,7 @@

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from tokenization_transform_ray import TokenizationRayConfiguration
from dpk_tokenization.ray.transform import TokenizationRayConfiguration


print(os.environ)
@@ -15,7 +15,7 @@
from data_processing_ray.runtime.ray.runtime_configuration import (
RayTransformRuntimeConfiguration,
)
from tokenization_transform import TokenizationTransformConfiguration
from dpk_tokenization.transform import TokenizationTransformConfiguration


logger = get_logger(__name__)
@@ -14,7 +14,7 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from tokenization_transform_python import TokenizationPythonConfiguration
from dpk_tokenization.transform_python import TokenizationPythonConfiguration


# create parameters
@@ -21,7 +21,7 @@

import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from tokenization_utils import is_valid_argument_string, load_tokenizer, split_text
from dpk_tokenization.utils import is_valid_argument_string, load_tokenizer, split_text


CHUNK_CHECKPOINT_INTERVAL = 100