diff --git a/transforms/Makefile.transform.template b/transforms/Makefile.transform.template new file mode 100644 index 000000000..72feb87cf --- /dev/null +++ b/transforms/Makefile.transform.template @@ -0,0 +1,16 @@ +REPOROOT=../../.. +# Use make help, to see the available rules +include $(REPOROOT)/transforms/.make.cicd.targets + +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) + +################################################################################ + + diff --git a/transforms/universal/tokenization/python/Dockerfile b/transforms/universal/tokenization/Dockerfile.python similarity index 72% rename from transforms/universal/tokenization/python/Dockerfile rename to transforms/universal/tokenization/Dockerfile.python index 9f2c9dc38..35552e198 100644 --- a/transforms/universal/tokenization/python/Dockerfile +++ b/transforms/universal/tokenization/Dockerfile.python @@ -18,21 +18,10 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME} # END OF STEPS destined for a data-prep-kit base image -COPY --chown=dpk:root src/ src/ -COPY --chown=dpk:root pyproject.toml pyproject.toml +COPY --chown=dpk:root dpk_tokenization/ dpk_tokenization/ COPY --chown=dpk:root requirements.txt requirements.txt RUN pip install --no-cache-dir -r requirements.txt -RUN pip install --no-cache-dir -e . -# copy the main() entry point to the image -COPY ./src/tokenization_transform_python.py . - -# copy some of the samples in -COPY src/tokenization_local_python.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ # Set environment ENV PYTHONPATH /home/dpk diff --git a/transforms/universal/tokenization/ray/Dockerfile b/transforms/universal/tokenization/Dockerfile.ray similarity index 56% rename from transforms/universal/tokenization/ray/Dockerfile rename to transforms/universal/tokenization/Dockerfile.ray index 223b0c483..5462e48e8 100644 --- a/transforms/universal/tokenization/ray/Dockerfile +++ b/transforms/universal/tokenization/Dockerfile.ray @@ -13,24 +13,11 @@ ARG DPK_WHEEL_FILE_NAME COPY --chown=ray:users data-processing-dist data-processing-dist RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray] -## Copy the python version of the tansform -COPY --chown=ray:users python-transform/ python-transform -RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e . +COPY --chown=ray:users dpk_tokenization/ dpk_tokenization/ +COPY --chown=ray:users requirements.txt requirements.txt +RUN pip install --no-cache-dir -r requirements.txt -COPY --chown=ray:users src/ src/ -COPY --chown=ray:users pyproject.toml pyproject.toml -RUN pip install --no-cache-dir -e . - -# copy the main() entry point to the image -COPY ./src/tokenization_transform_ray.py . 
- -# copy some of the samples in -COPY src/tokenization_local_ray.py local/ - -# copy test -COPY test/ test/ -COPY test-data/ test-data/ # Set environment ENV PYTHONPATH /home/ray diff --git a/transforms/universal/tokenization/Makefile b/transforms/universal/tokenization/Makefile index bca6f7e85..09feb6f02 100644 --- a/transforms/universal/tokenization/Makefile +++ b/transforms/universal/tokenization/Makefile @@ -1,79 +1,21 @@ REPOROOT=../../.. # Use make help, to see the available rules -include $(REPOROOT)/.make.defaults +include $(REPOROOT)/transforms/.make.cicd.targets -setup:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +# +# This is intended to be included across the Makefiles provided within +# a given transform's directory tree, so must use compatible syntax. +# +################################################################################ +# This defines the name of the transform and is used to match against +# expected files and is used to define the transform's image name. +TRANSFORM_NAME=$(shell basename `pwd`) -clean:: - @# Help: Recursively make $@ all subdirs - $(MAKE) RULE=$@ .recurse +################################################################################ -build:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse -venv:: - @# Help: Recursively make $@ in subdirs - $(MAKE) RULE=$@ .recurse - -image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -set-versions: - @# Help: Recursively $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -publish:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-image:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test:: - @# Help: Recursively make $@ in all subdirs - @$(MAKE) RULE=$@ .recurse - -test-src:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -kind-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-load-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -docker-save-image:: - @# Help: Recursively make $@ in all subdirs - $(MAKE) RULE=$@ .recurse - -.PHONY: workflow-venv -workflow-venv: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-venv; \ - fi - -.PHONY: workflow-test -workflow-test: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-test; \ - fi - -.PHONY: workflow-upload -workflow-upload: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-upload; \ - fi - -.PHONY: workflow-build -workflow-build: - if [ -e kfp_ray ]; then \ - $(MAKE) -C kfp_ray workflow-build; \ - fi +run-cli-sample: + make venv + source venv/bin/activate && \ + $(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \ + --data_local_config "{ 'input_folder' : 'test-data/ds01/input', 'output_folder' : 'output'}" diff --git a/transforms/universal/tokenization/README.md b/transforms/universal/tokenization/README.md index 3fd4571ff..b80ed2c99 100644 --- a/transforms/universal/tokenization/README.md +++ b/transforms/universal/tokenization/README.md @@ -1,13 +1,122 @@ -# Tokenization Transform -The tokenization transform annotates pyarrow tables and parquet files -to add a column containing tokens for the document column. -Per the set of +

Distributed tokenization module for data sets using any Hugging Face compatible tokenizer. +
+

+ + +## Contributors + +- Xuan-Hong Dang (xuan-hong.dang@ibm.com) + +# Data Tokenization +Please see the set of [transform project conventions](../../README.md#transform-project-conventions) -the following runtimes are available: - -* [python](python/README.md) - provides the core python-based transformation -implementation. -* [ray](ray/README.md) - enables the running of the python-based transformation -in a Ray runtime -* [kfp](kfp_ray/README.md) - enables running the ray docker image -the transform in a kubernetes cluster using a generated `yaml` file. +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary +The data tokenization transform operates by converting a (non-empty) input table into an output table +using a pre-trained tokenizer. The input table is required to have a minimum of two columns, +named `document_id` and `contents` by default. However, alternate column names can be specified using +`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents. +It is essential for the values within the `document_id` column to be unique across the dataset, +while the `contents` column stores the respective document content. To execute the example demonstrations within this directory, +a machine with `64GiB` of RAM is recommended. + +To specify a pre-trained tokenizer, use the `--tkn_tokenizer` parameter. +This parameter accepts the name of a tokenizer ready for download from Hugging Face, +such as `hf-internal-testing/llama-tokenizer`, `bigcode/starcoder`, or any other tokenizer compatible +with the Hugging Face AutoTokenizer library. Additionally, you can use the `--tkn_tokenizer_args` parameter +to pass extra arguments specific to the chosen tokenizer. +For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which necessitates an access token, +you can specify `use_auth_token=` in `--tkn_tokenizer_args`. + +The tokenization transform applies the specified tokenizer to each row of the input table, +assuming each row represents a document, and saves the resulting token IDs to a corresponding row in the output table. +The output table generally consists of four columns: `tokens`, `document_id`, `document_length`, and `token_count`. + +The `tokens` column stores the sequence of token IDs generated by the tokenizer for the document. +The `document_id` column (or the designated name specified in `--tkn_doc_id_column`) contains the document ID, +while `document_length` and `token_count` record the length of the document and the total count of generated tokens, respectively. +During tokenization, the tokenizer will disregard empty documents (rows) in the input table, +as well as documents that yield no tokens or fail during tokenization. +The count of such documents is stored in the `num_empty_rows` field of the `metadata` file. + + +In certain cases, the tokenization process of some tokenizers may be slow, +particularly when handling lengthy documents containing millions of characters. +To address this, you can use the `--tkn_chunk_size` parameter to define the length of the chunks to tokenize at a given time. +For English text (`en`), a chunk size of `20,000` characters (roughly `15` pages of text) is recommended. +The tokenizer will then tokenize each chunk separately and combine the resulting token IDs. +By default, `--tkn_chunk_size` is `0`, meaning each document is tokenized as a whole, regardless of its length.
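The same parameters can also be supplied from Python through the `Tokenization` wrapper class added in `dpk_tokenization/transform_python.py`. Below is a minimal sketch mirroring the sample notebook in this change; the folder paths are illustrative and assume local parquet input with `document_id` and `contents` columns.

```python
# Minimal sketch: run the tokenization transform from Python via the
# Tokenization wrapper class (dpk_tokenization/transform_python.py).
# Folder paths are illustrative.
from dpk_tokenization.transform_python import Tokenization

Tokenization(
    input_folder="test-data/ds01/input",                  # parquet files to tokenize
    output_folder="output",                               # where tokenized parquet files are written
    tkn_tokenizer="hf-internal-testing/llama-tokenizer",  # any AutoTokenizer-compatible name or path
    tkn_chunk_size=20_000,                                # tokenize long documents in 20k-character chunks
).transform()
```

The `transform()` call returns the launcher's exit code (`0` on success), and the output folder will contain the tokenized parquet files along with a `metadata.json` summary.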
+ + + +## Running + +### CLI Options +The following command line arguments are available in addition to +the options provided by the [python launcher](../../../data-processing-lib/doc/python-launcher-options.md). +``` + --tkn_tokenizer TKN_TOKENIZER + Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By default, `hf-internal-testing/llama-tokenizer` from HuggingFace is used + --tkn_tokenizer_args TKN_TOKENIZER_ARGS + Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace + --tkn_doc_id_column TKN_DOC_ID_COLUMN + Column contains document id which values should be unique across dataset + --tkn_doc_content_column TKN_DOC_CONTENT_COLUMN + Column contains document content + --tkn_text_lang TKN_TEXT_LANG + Specify language used in the text content for better text splitting if needed + --tkn_chunk_size TKN_CHUNK_SIZE + Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words) +``` + +### Running the samples +To run the samples, use the following `make` target: + +* `run-cli-sample` - runs dpk_tokenization/transform_python.py using command line args + + +This target will activate the virtual environment and set up any configuration needed. +Use the `-n` option of `make` to see the detail of what is done to run the sample. + +For example, +```shell +make run-cli-sample +... +``` +Then +```shell +ls output +``` +to see the results of the transform. + +### Code example +Here is a sample [notebook](tokenization.ipynb). + + + +### Transforming data using the transform image + +To use the transform image to transform your data, please refer to the +[running images quickstart](../../../doc/quick-start/run-transform-image.md), +substituting the name of this transform image and runtime as appropriate. + +# Tokenization Transform for Ray +Please see the set of +[transform project conventions](../../README.md#transform-project-conventions) +for details on general project conventions, transform configuration, +testing and IDE set up. + +## Summary +This project wraps the tokenization transform with a Ray runtime. + +## Configuration and command line Options + +Configuration and command line options are the same as for the base python transform. + +### Launched Command Line Options +In addition to those available to the transform as defined here, +the set of +[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is available.
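For the Ray runtime, the launch pattern is analogous but uses the Ray launcher and configuration. Below is a minimal sketch patterned after `dpk_tokenization/ray/local.py` from this change; the folder paths are illustrative, and `run_locally` is assumed (as in the previous Makefile sample) to start an ephemeral local Ray cluster for the run.

```python
# Minimal sketch: run the tokenization transform on a local Ray cluster,
# patterned after dpk_tokenization/ray/local.py. Folder paths are illustrative.
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from dpk_tokenization.ray.transform import TokenizationRayConfiguration

local_conf = {
    "input_folder": "test-data/ds01/input",
    "output_folder": "output",
}
params = {
    "run_locally": True,  # assumed flag: run on an ephemeral local Ray cluster
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "tkn_tokenizer": "hf-internal-testing/llama-tokenizer",
}
sys.argv = ParamsUtils.dict_to_req(d=params)
launcher = RayTransformLauncher(TokenizationRayConfiguration())
launcher.launch()
```

The same module can also be run from the command line as `python -m dpk_tokenization.ray.transform`, which is how the KFP workflow in `kfp_ray/tokenization_wf.py` invokes it.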
+ diff --git a/transforms/universal/tokenization/python/src/tokenization_local_python.py b/transforms/universal/tokenization/dpk_tokenization/local.py similarity index 91% rename from transforms/universal/tokenization/python/src/tokenization_local_python.py rename to transforms/universal/tokenization/dpk_tokenization/local.py index eb4766d60..7978e4dee 100644 --- a/transforms/universal/tokenization/python/src/tokenization_local_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/local.py @@ -15,12 +15,12 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py b/transforms/universal/tokenization/dpk_tokenization/local_long_doc.py similarity index 92% rename from transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py rename to transforms/universal/tokenization/dpk_tokenization/local_long_doc.py index 788ec0d08..f657d946d 100644 --- a/transforms/universal/tokenization/python/src/tokenization_local_long_doc_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/local_long_doc.py @@ -15,12 +15,12 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds02", "input")) -output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds02")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds02", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds02")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/tokenization/ray/src/tokenization_local_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/local.py similarity index 92% rename from transforms/universal/tokenization/ray/src/tokenization_local_ray.py rename to transforms/universal/tokenization/dpk_tokenization/ray/local.py index bd92415a3..45a32880a 100644 --- a/transforms/universal/tokenization/ray/src/tokenization_local_ray.py +++ b/transforms/universal/tokenization/dpk_tokenization/ray/local.py @@ -15,12 +15,12 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from tokenization_transform_ray import TokenizationRayConfiguration +from dpk_tokenization.ray.transform import TokenizationRayConfiguration # create parameters -input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input")) 
-output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01")) +input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input")) +output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01")) local_conf = { "input_folder": input_folder, "output_folder": output_folder, diff --git a/transforms/universal/tokenization/ray/src/tokenization_s3_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/s3.py similarity index 96% rename from transforms/universal/tokenization/ray/src/tokenization_s3_ray.py rename to transforms/universal/tokenization/dpk_tokenization/ray/s3.py index 4ad450912..8777e00e9 100644 --- a/transforms/universal/tokenization/ray/src/tokenization_s3_ray.py +++ b/transforms/universal/tokenization/dpk_tokenization/ray/s3.py @@ -15,7 +15,7 @@ from data_processing.utils import ParamsUtils from data_processing_ray.runtime.ray import RayTransformLauncher -from tokenization_transform_ray import TokenizationRayConfiguration +from dpk_tokenization.ray.transform import TokenizationRayConfiguration print(os.environ) diff --git a/transforms/universal/tokenization/ray/src/tokenization_transform_ray.py b/transforms/universal/tokenization/dpk_tokenization/ray/transform.py similarity index 94% rename from transforms/universal/tokenization/ray/src/tokenization_transform_ray.py rename to transforms/universal/tokenization/dpk_tokenization/ray/transform.py index c7d210417..b95d2d30b 100644 --- a/transforms/universal/tokenization/ray/src/tokenization_transform_ray.py +++ b/transforms/universal/tokenization/dpk_tokenization/ray/transform.py @@ -15,7 +15,7 @@ from data_processing_ray.runtime.ray.runtime_configuration import ( RayTransformRuntimeConfiguration, ) -from tokenization_transform import TokenizationTransformConfiguration +from dpk_tokenization.transform import TokenizationTransformConfiguration logger = get_logger(__name__) diff --git a/transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py b/transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py similarity index 96% rename from transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py rename to transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py index 90e3cc29e..fffb2bbb4 100644 --- a/transforms/universal/tokenization/python/src/tokenization_s3_long_doc_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/s3_long_doc.py @@ -14,7 +14,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration # create parameters diff --git a/transforms/universal/tokenization/python/src/tokenization_transform.py b/transforms/universal/tokenization/dpk_tokenization/transform.py similarity index 99% rename from transforms/universal/tokenization/python/src/tokenization_transform.py rename to transforms/universal/tokenization/dpk_tokenization/transform.py index aedb5ca08..277c333fa 100644 --- a/transforms/universal/tokenization/python/src/tokenization_transform.py +++ b/transforms/universal/tokenization/dpk_tokenization/transform.py @@ -21,7 +21,7 @@ import pyarrow as pa from data_processing.transform import AbstractTableTransform, TransformConfiguration -from tokenization_utils import is_valid_argument_string, load_tokenizer, split_text +from 
dpk_tokenization.utils import is_valid_argument_string, load_tokenizer, split_text CHUNK_CHECKPOINT_INTERVAL = 100 diff --git a/transforms/universal/tokenization/python/src/tokenization_transform_python.py b/transforms/universal/tokenization/dpk_tokenization/transform_python.py similarity index 52% rename from transforms/universal/tokenization/python/src/tokenization_transform_python.py rename to transforms/universal/tokenization/dpk_tokenization/transform_python.py index 2d22a52cb..8efed547d 100644 --- a/transforms/universal/tokenization/python/src/tokenization_transform_python.py +++ b/transforms/universal/tokenization/dpk_tokenization/transform_python.py @@ -10,11 +10,14 @@ # limitations under the License. ################################################################################ +import sys + from data_processing.runtime.pure_python import ( PythonTransformLauncher, PythonTransformRuntimeConfiguration, ) -from tokenization_transform import TokenizationTransformConfiguration +from data_processing.utils import ParamsUtils +from dpk_tokenization.transform import TokenizationTransformConfiguration class TokenizationPythonConfiguration(PythonTransformRuntimeConfiguration): @@ -22,6 +25,29 @@ def __init__(self): super().__init__(transform_config=TokenizationTransformConfiguration()) +class Tokenization: + def __init__(self, **kwargs): + self.params = {} + for key in kwargs: + self.params[key] = kwargs[key] + # if input_folder and output_folder are specified, then assume it is represent data_local_config + try: + local_conf = {k: self.params[k] for k in ("input_folder", "output_folder")} + self.params["data_local_config"] = ParamsUtils.convert_to_ast(local_conf) + del self.params["input_folder"] + del self.params["output_folder"] + except: + pass + + def transform(self): + sys.argv = ParamsUtils.dict_to_req(d=(self.params)) + # create launcher + launcher = PythonTransformLauncher(TokenizationPythonConfiguration()) + # launch + return_code = launcher.launch() + return return_code + + if __name__ == "__main__": launcher = PythonTransformLauncher(TokenizationPythonConfiguration()) launcher.launch() diff --git a/transforms/universal/tokenization/python/src/tokenization_utils.py b/transforms/universal/tokenization/dpk_tokenization/utils.py similarity index 100% rename from transforms/universal/tokenization/python/src/tokenization_utils.py rename to transforms/universal/tokenization/dpk_tokenization/utils.py diff --git a/transforms/universal/tokenization/kfp_ray/Makefile b/transforms/universal/tokenization/kfp_ray/Makefile index c43105ff1..858db1b0a 100644 --- a/transforms/universal/tokenization/kfp_ray/Makefile +++ b/transforms/universal/tokenization/kfp_ray/Makefile @@ -2,10 +2,15 @@ REPOROOT=${CURDIR}/../../../../ WORKFLOW_VENV_ACTIVATE=${REPOROOT}/transforms/venv/bin/activate include $(REPOROOT)/transforms/.make.workflows -# Include the common configuration for this transform -include ../transform.config +SRC_DIR=${CURDIR}/../ +# Use the docker image that is built for ray runtime +TRANSFORM_RUNTIME=ray +## override settings in .make.default as they assume old structure with ray being the current folder +DOCKER_IMAGE_NAME=$(TRANSFORM_NAME)-$(TRANSFORM_RUNTIME) +DOCKER_LOCAL_IMAGE=$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) -SRC_DIR=${CURDIR}/../ray/ +# Only build the image with -f Dockerfile.ray +BUILD_SPECIFIC_RUNTIME=ray PYTHON_WF := $(shell find ./ -name '*_wf.py') YAML_WF := $(patsubst %.py, %.yaml, ${PYTHON_WF}) @@ -15,29 +20,8 @@ workflow-venv: .check_python_version 
${WORKFLOW_VENV_ACTIVATE} .PHONY: clean clean: @# Help: Clean up the virtual environment. - rm -rf ${REPOROOT}/transforms/venv + rm -rf ${REPOROOT}/transforms/venv -venv:: - -build:: - -setup:: - -test:: - -test-src:: - -test-image:: - -publish:: - -image:: - -kind-load-image:: - -docker-load-image:: - -docker-save-image:: .PHONY: workflow-build workflow-build: workflow-venv @@ -45,10 +29,19 @@ workflow-build: workflow-venv .PHONY: workflow-test workflow-test: workflow-build - $(MAKE) .workflows.test-pipeline TRANSFORM_SRC=${SRC_DIR} PIPELINE_FILE=tokenization_wf.yaml + $(MAKE) TRANSFORM_SRC=${SRC_DIR} \ + TRANSFORM_RUNTIME=$(TRANSFORM_RUNTIME) \ + TRANSFORM_NAME=$(TRANSFORM_NAME) \ + BUILD_SPECIFIC_RUNTIME=$(BUILD_SPECIFIC_RUNTIME) \ + DOCKER_REMOTE_IMAGE=$(DOCKER_REGISTRY_ENDPOINT)/$(DOCKER_IMAGE_NAME):$(DOCKER_IMAGE_VERSION) \ + PIPELINE_FILE=$(TRANSFORM_NAME)_wf.yaml .workflows.test-pipeline .PHONY: workflow-upload workflow-upload: workflow-build @for file in $(YAML_WF); do \ $(MAKE) .workflows.upload-pipeline PIPELINE_FILE=$$file; \ done + + + + diff --git a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py index c131d11ea..c9fb6f2e9 100644 --- a/transforms/universal/tokenization/kfp_ray/tokenization_wf.py +++ b/transforms/universal/tokenization/kfp_ray/tokenization_wf.py @@ -18,7 +18,7 @@ # the name of the job script -EXEC_SCRIPT_NAME: str = "tokenization_transform_ray.py" +EXEC_SCRIPT_NAME: str = "-m dpk_tokenization.ray.transform" task_image = "quay.io/dataprep1/data-prep-kit/tokenization-ray:latest" @@ -112,7 +112,14 @@ def tokenization( ray_name: str = "tkn-kfp-ray", # name of Ray cluster # Add image_pull_secret and image_pull_policy to ray workers if needed ray_head_options: dict = {"cpu": 1, "memory": 4, "image": task_image}, - ray_worker_options: dict = {"replicas": 2, "max_replicas": 2, "min_replicas": 2, "cpu": 2, "memory": 4, "image": task_image}, + ray_worker_options: dict = { + "replicas": 2, + "max_replicas": 2, + "min_replicas": 2, + "cpu": 2, + "memory": 4, + "image": task_image, + }, server_url: str = "http://kuberay-apiserver-service.kuberay.svc.cluster.local:8888", # data access data_s3_config: str = "{'input_folder': 'test/tokenization/ds01/input/', 'output_folder': 'test/tokenization/ds01/output/'}", @@ -120,9 +127,9 @@ def tokenization( data_max_files: int = -1, data_num_samples: int = -1, # orchestrator - runtime_actor_options: dict = {'num_cpus': 0.8}, + runtime_actor_options: dict = {"num_cpus": 0.8}, runtime_pipeline_id: str = "pipeline_id", - runtime_code_location: dict = {'github': 'github', 'commit_hash': '12345', 'path': 'path'}, + runtime_code_location: dict = {"github": "github", "commit_hash": "12345", "path": "path"}, # tokenizer parameters tkn_tokenizer: str = "hf-internal-testing/llama-tokenizer", tkn_doc_id_column: str = "document_id", @@ -175,7 +182,9 @@ def tokenization( :return: None """ # create clean_up task - clean_up_task = cleanup_ray_op(ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params) + clean_up_task = cleanup_ray_op( + ray_name=ray_name, run_id=run_id, server_url=server_url, additional_params=additional_params + ) ComponentUtils.add_settings_to_component(clean_up_task, ONE_HOUR_SEC * 2) # pipeline definition with dsl.ExitHandler(clean_up_task): diff --git a/transforms/universal/tokenization/python/.dockerignore b/transforms/universal/tokenization/python/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- 
a/transforms/universal/tokenization/python/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/tokenization/python/Makefile b/transforms/universal/tokenization/python/Makefile deleted file mode 100644 index 8f4f7fbf5..000000000 --- a/transforms/universal/tokenization/python/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. -include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -venv:: .transforms.python-venv - -test:: .transforms.python-test - -clean:: .transforms.clean - -image:: .transforms.python-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-python - -setup:: .transforms.setup - -# distribution versions is the same as image version. -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(TOKENIZATION_PYTHON_VERSION) TOML_VERSION=$(TOKENIZATION_PYTHON_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.python-test-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \ - RUN_ARGS="--data_local_config \"{ 'input_folder' : '../test-data/ds01/input', 'output_folder' : '../output'}\" \ - " .transforms.run-src-file - -run-local-sample: .transforms.run-local-python-sample - -#run-s3-sample: .transforms.run-s3-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/tokenization/python/README.md b/transforms/universal/tokenization/python/README.md deleted file mode 100644 index 0c470bb73..000000000 --- a/transforms/universal/tokenization/python/README.md +++ /dev/null @@ -1,100 +0,0 @@ -

Distributed tokenization module for data sets using any Hugging Face compatible tokenizer. -
-

- -## 📝 Table of Contents -- [Summary](#Summary) -- [Running](#Running) -- [CLI Options](#cli_options) - -# Data Tokenization -Please see the set of -[transform project conventions](../../../README.md) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary -The data tokenization transform operates by converting a (non-empty) input table into an output table -using a pre-trained tokenizer. The input table is required to have a minimum of two columns, -named `document_id` and `contents` by default. However, alternate column names can be specified using -`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents. -It is essential for the values within the `document_id` column to be unique across the dataset, -while the `contents` column stores their respective document content. To execute example demonstrations within this directory, -a machine with `64GiB` of RAM is recommended. - -To specify a pre-trained tokenizer, utilize the `--tkn_tokenizer` parameter. -This parameter accepts the name of a tokenizer ready for download from Hugging Face, -such as `hf-internal-testing/llama-tokenizer, bigcode/starcoder`, or any other tokenizer compatible -with the Hugging Face AutoTokenizer library. Additionally, you can employ the `--tkn_tokenizer_args` parameter -to include extra arguments specific to the chosen tokenizer. -For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which necessitate an access token, -you can specify `use_auth_token=` in `--tkn_tokenizer`. - -The tokenization transformer utilizes the specified tokenizer to tokenize each row, -assuming each row represents a document, in the input table and save it to a corresponding row in the output table. -The output table generally consists of four columns: `tokens, document_id, document_length`, and `token_count`. - -The `tokens` stores the sequence of token IDs generated by the tokenizer during the document tokenization process. -The `document_id` (or the designated name specified in `--tkn_doc_id_column`) contains the document ID, -while `document_length` and `token_count` respectively record the length of the document and the total count of generated tokens. -During tokenization, the tokenizer will disregard empty documents (rows) in the input table, -as well as documents that yield no tokens or encounter failure during tokenization. -The count of such documents will be stored in the `num_empty_rows` field of the `metadata` file. - - -In certain cases, the tokenization process of some tokenizers may be sluggish, -particularly when handling lengthy documents containing millions of characters. -To address this, you can employ the `--tkn_chunk_size` parameter to define the length of chunks to tokenize at a given time. -For English text (`en`), it is recommended to set the chunk size to `20,000`, roughly equivalent to `15` pages of text. -The tokenizer will then tokenize each chunk separately and combine their resulting token IDs. -By default, the value of `--tkn_chunk_size` is `0`, indicating that each document is tokenized as a whole, regardless of its length. - - - -## Running - -### CLI Options -The following command line arguments are available in addition to -the options provided by the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md) -and the [python launcher](../../../../data-processing-lib/doc/python-launcher-options.md). 
-``` - --tkn_tokenizer TKN_TOKENIZER - Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By defaut, `hf-internal-testing/llama-tokenizer` from HuggingFace is used - --tkn_tokenizer_args TKN_TOKENIZER_ARGS - Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace - --tkn_doc_id_column TKN_DOC_ID_COLUMN - Column contains document id which values should be unique across dataset - --tkn_doc_content_column TKN_DOC_CONTENT_COLUMN - Column contains document content - --tkn_text_lang TKN_TEXT_LANG - Specify language used in the text content for better text splitting if needed - --tkn_chunk_size TKN_CHUNK_SIZE - Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words) -``` - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/tokenization_transform_python.py using command line args -* `run-local-sample` - runs src/tokenization_local_python.py - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. diff --git a/transforms/universal/tokenization/python/pyproject.toml b/transforms/universal/tokenization/python/pyproject.toml deleted file mode 100644 index fb9ee0c48..000000000 --- a/transforms/universal/tokenization/python/pyproject.toml +++ /dev/null @@ -1,53 +0,0 @@ -[project] -name = "dpk_tokenization_transform_python" -keywords = ["tokenizer", "data", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ] -version = "0.2.3.dev2" -requires-python = ">=3.10,<3.13" -description = "Tokenization Transform for Python" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, -] - -[project_urls] -Repository = "https://github.com/IBM/data-prep-kit" -Issues = "https://github.com/IBM/data-prep-kit/issues" -Documentation = "https://ibm.github.io/data-prep-kit/" -"Transform project" = "https://github.com/IBM/data-prep-kit/tree/dev/transforms/universal/tokenization" - -dynamic = ["dependencies"] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools.dynamic] -dependencies = {file = ["requirements.txt"]} - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] 
diff --git a/transforms/universal/tokenization/ray/.dockerignore b/transforms/universal/tokenization/ray/.dockerignore deleted file mode 100644 index f7275bbbd..000000000 --- a/transforms/universal/tokenization/ray/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -venv/ diff --git a/transforms/universal/tokenization/ray/.gitignore b/transforms/universal/tokenization/ray/.gitignore deleted file mode 100644 index 3ea7fd4ab..000000000 --- a/transforms/universal/tokenization/ray/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -test-data/output -output/* -/output/ -data-processing-lib/ - - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - - -# Distribution / packaging -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -.tox/ -htmlcov -.coverage -.cache -nosetests.xml -coverage.xml \ No newline at end of file diff --git a/transforms/universal/tokenization/ray/Makefile b/transforms/universal/tokenization/ray/Makefile deleted file mode 100644 index 0a4e3a370..000000000 --- a/transforms/universal/tokenization/ray/Makefile +++ /dev/null @@ -1,65 +0,0 @@ -# Define the root of the local git clone for the common rules to be able -# know where they are running from. -REPOROOT=../../../.. - -# Set this, before including .make.defaults, to -# 1 if requirements reference the latest code in the data processing library -# in this repo (that is not yet published to pypi). This is the default setting. -# 0 if the transforms DPK dependencies are on wheels published to -# pypi (e.g. data-prep-toolkit=0.2.1) -#USE_REPO_LIB_SRC=1 - -# Include a library of common .transform.* targets which most -# transforms should be able to reuse. However, feel free -# to override/redefine the rules below. 
-include $(REPOROOT)/transforms/.make.transforms - -# Include the common configuration for this transform -include ../transform.config - -BASE_IMAGE=${RAY_BASE_IMAGE} -venv:: .transforms.ray-venv - -test:: .transforms.ray-test - -clean:: .transforms.clean - -image:: .transforms.ray-image - -test-src:: .transforms.test-src - -setup:: .transforms.setup - -build:: build-dist image - -publish: publish-image - -publish-image:: .transforms.publish-image-ray - -setup:: .transforms.setup - -set-versions: - $(MAKE) TRANSFORM_PYTHON_VERSION=$(TOKENIZATION_PYTHON_VERSION) TOML_VERSION=$(TOKENIZATION_RAY_VERSION) .transforms.set-versions - -build-dist:: .defaults.build-dist - -publish-dist:: .defaults.publish-dist - -test-image:: .transforms.ray-test-image - -run-cli-sample: - $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_ray.py \ - RUN_ARGS="--run_locally True --data_local_config \"{ 'input_folder' : '../test-data/ds01/input', 'output_folder' : '../output'}\" \ - " .transforms.run-src-file - -#run-local-sample: .transforms.run-local-sample - -run-s3-sample: .transforms.run-s3-ray-sample - -minio-start: .minio-start - -kind-load-image:: .transforms.kind-load-image - -docker-load-image: .defaults.docker-load-image - -docker-save-image: .defaults.docker-save-image diff --git a/transforms/universal/tokenization/ray/README.md b/transforms/universal/tokenization/ray/README.md deleted file mode 100644 index 1181d6878..000000000 --- a/transforms/universal/tokenization/ray/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Tokenization Transform for Ray -Please see the set of -[transform project conventions](../../../README.md#transform-project-conventions) -for details on general project conventions, transform configuration, -testing and IDE set up. - -## Summary -This project wraps the [tokenization transform](../python) with a Ray runtime. - -## Configuration and command line Options - -Noop configuration and command line options are the same as for the base python transform. - -## Running - -### Launched Command Line Options -In addition to those available to the transform as defined in [here](../python/README.md), -the set of -[ray launcher](../../../../data-processing-lib/doc/ray-launcher-options.md) are available. - -### Running the samples -To run the samples, use the following `make` targets - -* `run-cli-sample` - runs src/tokenization_transform_ray.py using command line args -* `run-local-sample` - runs src/tokenization_local_ray.py -* `run-s3-sample` - runs src/filter_s3_ray.py - * Requires prior installation of minio, depending on your platform (e.g., from [here](https://min.io/docs/minio/macos/index.html) - and [here](https://min.io/docs/minio/linux/index.html) - and invocation of `make minio-start` to load data into local minio for S3 access. - -These targets will activate the virtual environment and set up any configuration needed. -Use the `-n` option of `make` to see the detail of what is done to run the sample. - -For example, -```shell -make run-cli-sample -... -``` -Then -```shell -ls output -``` -To see results of the transform. - -### Transforming data using the transform image - -To use the transform image to transform your data, please refer to the -[running images quickstart](../../../../doc/quick-start/run-transform-image.md), -substituting the name of this transform image and runtime as appropriate. 
diff --git a/transforms/universal/tokenization/ray/pyproject.toml b/transforms/universal/tokenization/ray/pyproject.toml deleted file mode 100644 index 0829e002c..000000000 --- a/transforms/universal/tokenization/ray/pyproject.toml +++ /dev/null @@ -1,45 +0,0 @@ -[project] -name = "dpk_tokenization_transform_ray" -version = "0.2.3.dev2" -requires-python = ">=3.10,<3.13" -description = "Tokenization Transform for Ray" -license = {text = "Apache-2.0"} -readme = {file = "README.md", content-type = "text/markdown"} -authors = [ - { name = "Xuan-Hong Dang", email = "xuan-hong.dang@ibm.com"}, -] -dependencies = [ - "dpk-tokenization-transform-python==0.2.3.dev2", - "data-prep-toolkit[ray]>=0.2.3.dev2", -] - -[build-system] -requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] -build-backend = "setuptools.build_meta" - -[project.optional-dependencies] -dev = [ - "twine", - "pytest>=7.3.2", - "pytest-dotenv>=0.5.2", - "pytest-env>=1.0.0", - "pre-commit>=3.3.2", - "pytest-cov>=4.1.0", - "pytest-mock>=3.10.0", - "moto==5.0.5", - "markupsafe==2.0.1", -] - -[options] -package_dir = ["src","test"] - -[options.packages.find] -where = ["src/"] - -[tool.pytest.ini_options] -# Currently we use low coverage since we have to run tests separately (see makefile) -#addopts = "--cov --cov-report term-missing --cov-fail-under 25" -markers = ["unit: unit tests", "integration: integration tests"] - -[tool.coverage.run] -include = ["src/*"] diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet deleted file mode 100644 index e452fbf9a..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet deleted file mode 100644 index bff7a5ed9..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq01.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet b/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet deleted file mode 100644 index fbdeaf64b..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/expected/lang=en/pq02.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json b/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json deleted file mode 100644 index e6c190807..000000000 --- a/transforms/universal/tokenization/ray/test-data/ds01/expected/metadata.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "Tokenization", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-29 13:30:56", - "end_time": "2024-03-29 13:30:57", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "tokenizer": "hf-internal-testing/llama-tokenizer", - "tokenizer_args": null, - "doc_id_column": "document_id", - "doc_content_column": "contents", - "text_lang": "en", - "chunk_size": 0, - 
"checkpointing": false, - "max_files": -1, - "number of workers": 5, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 10, - "gpus": 0, - "memory": 27.31659088190645, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 5, - "source_size": 450, - "result_files": 3, - "result_size": 842, - "table_processing": 0.03880786895751953, - "num_files": 3, - "num_rows": 6, - "num_tokenized_rows": 6, - "num_tokens": 85, - "num_chars": 384, - "skipped empty tables": 2 - }, - "source": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds01/input", - "type": "path" - }, - "target": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds01", - "type": "path" - } -} diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet deleted file mode 100644 index 83bdac0b4..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet deleted file mode 100644 index 5a86a7b13..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet deleted file mode 100644 index 5a86a7b13..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet deleted file mode 100644 index 07fd2adfe..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq01.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet b/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet deleted file mode 100644 index 879fcf0f0..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds01/input/lang=en/pq02.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet b/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet deleted file mode 100644 index e81983916..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds02/expected/df_17m.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json b/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json deleted file mode 100644 index dc9813beb..000000000 --- a/transforms/universal/tokenization/ray/test-data/ds02/expected/metadata.json +++ /dev/null @@ -1,58 +0,0 @@ -{ - 
"pipeline": "pipeline_id", - "job details": { - "job category": "preprocessing", - "job name": "Tokenization", - "job type": "ray", - "job id": "job_id", - "start_time": "2024-03-29 14:03:15", - "end_time": "2024-03-29 14:03:32", - "status": "success" - }, - "code": { - "github": "github", - "commit_hash": "12345", - "path": "path" - }, - "job_input_params": { - "tokenizer": "hf-internal-testing/llama-tokenizer", - "tokenizer_args": null, - "doc_id_column": "document_id", - "doc_content_column": "contents", - "text_lang": "en", - "chunk_size": 20000, - "checkpointing": false, - "max_files": -1, - "number of workers": 5, - "worker options": { - "num_cpus": 0.8 - }, - "actor creation delay": 0 - }, - "execution_stats": { - "cpus": 10, - "gpus": 0, - "memory": 27.180484008975327, - "object_store": 2.0 - }, - "job_output_stats": { - "source_files": 1, - "source_size": 16863266, - "result_files": 1, - "result_size": 37109764, - "table_processing": 15.886597871780396, - "num_files": 1, - "num_rows": 1, - "num_tokenized_rows": 1, - "num_tokens": 4638717, - "num_chars": 16836009 - }, - "source": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/test-data/ds02/input", - "type": "path" - }, - "target": { - "name": "/Users/xdang/00proj/04-FM/01_code/fm-data-engineering/transforms/universal/tokenization/output/ds02", - "type": "path" - } -} diff --git a/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet b/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet deleted file mode 100644 index b7f3df71b..000000000 Binary files a/transforms/universal/tokenization/ray/test-data/ds02/input/df_17m.parquet and /dev/null differ diff --git a/transforms/universal/tokenization/python/requirements.txt b/transforms/universal/tokenization/requirements.txt similarity index 100% rename from transforms/universal/tokenization/python/requirements.txt rename to transforms/universal/tokenization/requirements.txt diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq01.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq01.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq01.parquet rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq01.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq02.parquet b/transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq02.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/expected/lang=en/pq02.parquet rename to transforms/universal/tokenization/test-data/ds01/expected/lang=en/pq02.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/expected/metadata.json b/transforms/universal/tokenization/test-data/ds01/expected/metadata.json similarity index 100% 
rename from transforms/universal/tokenization/python/test-data/ds01/expected/metadata.json rename to transforms/universal/tokenization/test-data/ds01/expected/metadata.json diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=cybersecurity_v2.0/version=2.3.2/pq03.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc01.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/dataset=empty/dpv08_cc02.snappy.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq01.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq01.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq01.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/pq01.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq02.parquet b/transforms/universal/tokenization/test-data/ds01/input/lang=en/pq02.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds01/input/lang=en/pq02.parquet rename to transforms/universal/tokenization/test-data/ds01/input/lang=en/pq02.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds02/expected/df_17m.parquet b/transforms/universal/tokenization/test-data/ds02/expected/df_17m.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds02/expected/df_17m.parquet rename to transforms/universal/tokenization/test-data/ds02/expected/df_17m.parquet diff --git a/transforms/universal/tokenization/python/test-data/ds02/expected/metadata.json b/transforms/universal/tokenization/test-data/ds02/expected/metadata.json similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds02/expected/metadata.json rename to transforms/universal/tokenization/test-data/ds02/expected/metadata.json diff --git a/transforms/universal/tokenization/python/test-data/ds02/input/df_17m.parquet b/transforms/universal/tokenization/test-data/ds02/input/df_17m.parquet similarity index 100% rename from transforms/universal/tokenization/python/test-data/ds02/input/df_17m.parquet rename to transforms/universal/tokenization/test-data/ds02/input/df_17m.parquet diff 
--git a/transforms/universal/tokenization/python/test/test_tokenization.py b/transforms/universal/tokenization/test/test_tokenization.py similarity index 97% rename from transforms/universal/tokenization/python/test/test_tokenization.py rename to transforms/universal/tokenization/test/test_tokenization.py index e4f13fd13..3cb53a047 100644 --- a/transforms/universal/tokenization/python/test/test_tokenization.py +++ b/transforms/universal/tokenization/test/test_tokenization.py @@ -16,7 +16,7 @@ from data_processing.test_support.transform.table_transform_test import ( AbstractTableTransformTest, ) -from tokenization_transform import TokenizationTransform +from dpk_tokenization.transform import TokenizationTransform """ diff --git a/transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py b/transforms/universal/tokenization/test/test_tokenization_long_doc_python.py similarity index 95% rename from transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py rename to transforms/universal/tokenization/test/test_tokenization_long_doc_python.py index ef68ce549..5fe98689e 100644 --- a/transforms/universal/tokenization/python/test/test_tokenization_long_doc_python.py +++ b/transforms/universal/tokenization/test/test_tokenization_long_doc_python.py @@ -16,7 +16,7 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration tkn_params = { diff --git a/transforms/universal/tokenization/python/test/test_tokenization_python.py b/transforms/universal/tokenization/test/test_tokenization_python.py similarity index 95% rename from transforms/universal/tokenization/python/test/test_tokenization_python.py rename to transforms/universal/tokenization/test/test_tokenization_python.py index c198d561a..00a3a6f58 100644 --- a/transforms/universal/tokenization/python/test/test_tokenization_python.py +++ b/transforms/universal/tokenization/test/test_tokenization_python.py @@ -16,7 +16,7 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from tokenization_transform_python import TokenizationPythonConfiguration +from dpk_tokenization.transform_python import TokenizationPythonConfiguration tkn_params = { diff --git a/transforms/universal/tokenization/ray/test/test_tokenization_ray.py b/transforms/universal/tokenization/test/test_tokenization_ray.py similarity index 95% rename from transforms/universal/tokenization/ray/test/test_tokenization_ray.py rename to transforms/universal/tokenization/test/test_tokenization_ray.py index 17791e370..6ef31a914 100644 --- a/transforms/universal/tokenization/ray/test/test_tokenization_ray.py +++ b/transforms/universal/tokenization/test/test_tokenization_ray.py @@ -16,7 +16,7 @@ AbstractTransformLauncherTest, ) from data_processing_ray.runtime.ray import RayTransformLauncher -from tokenization_transform_ray import TokenizationRayConfiguration +from dpk_tokenization.ray.transform import TokenizationRayConfiguration tkn_params = { diff --git a/transforms/universal/tokenization/tokenization.ipynb b/transforms/universal/tokenization/tokenization.ipynb new file mode 100644 index 000000000..8b59e6885 --- /dev/null +++ b/transforms/universal/tokenization/tokenization.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "afd55886-5f5b-4794-838e-ef8179fb0394", + "metadata": {}, + 
"source": [ + "##### **** These pip installs need to be adapted to use the appropriate release level. Alternatively, The venv running the jupyter lab could be pre-configured with a requirement file that includes the right release. Example for transform developers working from git clone:\n", + "```\n", + "make venv \n", + "source venv/bin/activate \n", + "pip install jupyterlab\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c45c3c6-e4d7-4e61-8de6-32d61f2ce695", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "## This is here as a reference only\n", + "# Users and application developers must use the right tag for the latest from pypi\n", + "!pip install data-prep-toolkit\n", + "!pip install data-prep-toolkit-transforms[tokenization]" + ] + }, + { + "cell_type": "markdown", + "id": "407fd4e4-265d-4ec7-bbc9-b43158f5f1f3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "##### **** Configure the transform parameters. The set of dictionary keys holding DocIDTransform configuration for values are as follows: \n", + "| Name | Description|\n", + "| -----|------------|\n", + "|tkn_tokenizer | Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By defaut, `hf-internal-testing/llama-tokenizer` from HuggingFace is used |\n", + "|tkn_tokenizer_args |Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace|\n", + "|tkn_doc_id_column|Column contains document id which values should be unique across dataset|\n", + "|tkn_doc_content_column|Column contains document content|\n", + "|tkn_text_lang|Specify language used in the text content for better text splitting if needed|\n", + "|tkn_chunk_size|Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)|\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "ebf1f782-0e61-485c-8670-81066beb734c", + "metadata": {}, + "source": [ + "##### ***** Import required classes and modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9669273a-8fcc-4b40-9b20-8df658e2ab58", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. 
Models won't be available and only tokenizers, configuration and file/data utilities can be used.\n" + ] + } + ], + "source": [ + "from dpk_tokenization.transform_python import Tokenization" + ] + }, + { + "cell_type": "markdown", + "id": "7234563c-2924-4150-8a31-4aec98c1bf33", + "metadata": {}, + "source": [ + "##### ***** Setup runtime parameters for this transform" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "badafb96-64d2-4bb8-9f3e-b23713fd5c3f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "16:52:19 INFO - pipeline id pipeline_id\n", + "16:52:19 INFO - code location None\n", + "16:52:19 INFO - data factory data_ is using local data access: input_folder - test-data/ds02/input output_folder - output\n", + "16:52:19 INFO - data factory data_ max_files -1, n_sample -1\n", + "16:52:19 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.parquet'], files to checkpoint ['.parquet']\n", + "16:52:19 INFO - orchestrator Tokenization started at 2024-12-11 16:52:19\n", + "16:52:19 INFO - Number of files is 1, source profile {'max_file_size': 6.856490135192871, 'min_file_size': 6.856490135192871, 'total_file_size': 6.856490135192871}\n", + "Token indices sequence length is longer than the specified maximum sequence length for this model (5256 > 2048). Running this sequence through the model will result in indexing errors\n", + "16:52:33 INFO - Completed 1 files (100.0%) in 0.228 min\n", + "16:52:33 INFO - Done processing 1 files, waiting for flush() completion.\n", + "16:52:33 INFO - done flushing in 0.0 sec\n", + "16:52:33 INFO - Completed execution in 0.235 min, execution result 0\n" + ] + }, + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Tokenization(input_folder= \"test-data/ds02/input\",\n", + " output_folder= \"output\",\n", + " tkn_tokenizer= \"hf-internal-testing/llama-tokenizer\",\n", + " tkn_chunk_size= 20_000).transform()" + ] + }, + { + "cell_type": "markdown", + "id": "c3df5adf-4717-4a03-864d-9151cd3f134b", + "metadata": {}, + "source": [ + "##### **** The specified folder will include the transformed parquet files." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7276fe84-6512-4605-ab65-747351e13a7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['output/lang=en', 'output/metadata.json', 'output/df_17m.parquet']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import glob\n", + "glob.glob(\"output/*\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "845a75cf-f4a9-467d-87fa-ccbac1c9beb8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transforms/universal/tokenization/transform.config b/transforms/universal/tokenization/transform.config deleted file mode 100644 index 04f517d42..000000000 --- a/transforms/universal/tokenization/transform.config +++ /dev/null @@ -1,20 +0,0 @@ -# -# This is intended to be included across the Makefiles provided within -# a given transform's directory tree, so must use compatible syntax. -# -################################################################################ -# This defines the name of the transform and is used to match against -# expected files and is used to define the transform's image name. -TRANSFORM_NAME=tokenization - -################################################################################ -# This defines the transforms' version number as would be used -# when publishing the wheel. In general, only the micro version -# number should be advanced relative to the DPK_VERSION. -# -# If you change the versions numbers, be sure to run "make set-versions" to -# update version numbers across the transform (e.g., pyproject.toml). -TOKENIZATION_PYTHON_VERSION=$(DPK_VERSION) -TOKENIZATION_RAY_VERSION=$(TOKENIZATION_PYTHON_VERSION) -TOKENIZATION_SPARK_VERSION=$(TOKENIZATION_PYTHON_VERSION) -
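For reference (not part of the diff above), here is a minimal sketch of how the `tkn_*` parameters documented in the notebook's table could be passed to the Python API shown in the notebook. It assumes the `Tokenization` class accepts the `tkn_*` keys as keyword arguments; the notebook itself only exercises `tkn_tokenizer` and `tkn_chunk_size`, and the column names `document_id` and `contents` below are illustrative assumptions, not values taken from this PR.

```python
# Hypothetical end-to-end sketch: run the tokenization transform with explicit
# tkn_* parameters, then list the output folder. Column names are assumptions;
# adjust them to match the schema of your input parquet files.
from dpk_tokenization.transform_python import Tokenization

result = Tokenization(
    input_folder="test-data/ds02/input",                  # local parquet input, as in the notebook
    output_folder="output",                               # tokenized parquet + metadata.json land here
    tkn_tokenizer="hf-internal-testing/llama-tokenizer",  # default HF tokenizer per the table above
    tkn_doc_id_column="document_id",                      # assumed id column name
    tkn_doc_content_column="contents",                    # assumed content column name
    tkn_chunk_size=20_000,                                # chunk long documents, as in the notebook
).transform()
print("exit code:", result)  # 0 on success, matching the notebook output

import glob
print(glob.glob("output/*"))  # transformed parquet files plus metadata.json
```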