
Commit

merge tokenization
Signed-off-by: matouma <[email protected]>
matouma committed Dec 15, 2024
2 parents 352e267 + 0c3ae86 commit b7c74f9
Showing 56 changed files with 418 additions and 716 deletions.
16 changes: 16 additions & 0 deletions transforms/Makefile.transform.template
@@ -0,0 +1,16 @@
REPOROOT=../../..
# Use `make help` to see the available rules
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform; it is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

################################################################################


@@ -18,21 +18,10 @@ RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}

# END OF STEPS destined for a data-prep-kit base image

COPY --chown=dpk:root src/ src/
COPY --chown=dpk:root pyproject.toml pyproject.toml
COPY --chown=dpk:root dpk_tokenization/ dpk_tokenization/
COPY --chown=dpk:root requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/tokenization_transform_python.py .

# copy some of the samples in
COPY src/tokenization_local_python.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/dpk
@@ -13,24 +13,11 @@ ARG DPK_WHEEL_FILE_NAME
COPY --chown=ray:users data-processing-dist data-processing-dist
RUN pip install data-processing-dist/${DPK_WHEEL_FILE_NAME}[ray]

## Copy the python version of the transform
COPY --chown=ray:users python-transform/ python-transform
RUN cd python-transform && pip install --no-cache-dir -r requirements.txt && pip install --no-cache-dir -e .

COPY --chown=ray:users dpk_tokenization/ dpk_tokenization/
COPY --chown=ray:users requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=ray:users src/ src/
COPY --chown=ray:users pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy the main() entry point to the image
COPY ./src/tokenization_transform_ray.py .

# copy some of the samples in
COPY src/tokenization_local_ray.py local/

# copy test
COPY test/ test/
COPY test-data/ test-data/

# Set environment
ENV PYTHONPATH /home/ray
88 changes: 15 additions & 73 deletions transforms/universal/tokenization/Makefile
@@ -1,79 +1,21 @@
REPOROOT=../../..
# Use `make help` to see the available rules
include $(REPOROOT)/.make.defaults
include $(REPOROOT)/transforms/.make.cicd.targets

setup::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform; it is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

clean::
@# Help: Recursively make $@ all subdirs
$(MAKE) RULE=$@ .recurse
################################################################################

build::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse
venv::
@# Help: Recursively make $@ in subdirs
$(MAKE) RULE=$@ .recurse

image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

set-versions:
@# Help: Recursively $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

publish::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-image::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test::
@# Help: Recursively make $@ in all subdirs
@$(MAKE) RULE=$@ .recurse

test-src::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

kind-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-load-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

docker-save-image::
@# Help: Recursively make $@ in all subdirs
$(MAKE) RULE=$@ .recurse

.PHONY: workflow-venv
workflow-venv:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-venv; \
fi

.PHONY: workflow-test
workflow-test:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-test; \
fi

.PHONY: workflow-upload
workflow-upload:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-upload; \
fi

.PHONY: workflow-build
workflow-build:
if [ -e kfp_ray ]; then \
$(MAKE) -C kfp_ray workflow-build; \
fi

run-cli-sample:
make venv
source venv/bin/activate && \
$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
--data_local_config "{ 'input_folder' : 'test-data/ds01/input', 'output_folder' : 'output'}"
133 changes: 121 additions & 12 deletions transforms/universal/tokenization/README.md
@@ -1,13 +1,122 @@
# Tokenization Transform

<p align="Left"> Distributed tokenization module for data sets using any Hugging Face compatible tokenizer.
<br>
</p>

The tokenization transform annotates pyarrow tables and parquet files
to add a column containing tokens for the document column.

## Contributors

- Xuan-Hong Dang ([email protected])

# Data Tokenization
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

The following runtimes are available:

* [python](python/README.md) - provides the core python-based transformation
  implementation.
* [ray](ray/README.md) - enables running the python-based transformation
  in a Ray runtime.
* [kfp](kfp_ray/README.md) - enables running the Ray docker image of
  the transform in a Kubernetes cluster using a generated `yaml` file.

## Summary
The data tokenization transform operates by converting a (non-empty) input table into an output table
using a pre-trained tokenizer. The input table is required to have a minimum of two columns,
named `document_id` and `contents` by default. However, alternate column names can be specified using
`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents.
The values in the `document_id` column must be unique across the dataset,
while the `contents` column stores the corresponding document text. To execute the example demonstrations within this directory,
a machine with `64GiB` of RAM is recommended.
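
For illustration, a minimal input table with the default column names could be built as in the following sketch (the ids and contents are made up for the example):
```python
import pyarrow as pa

# Minimal sketch of an input table using the default column names.
# The ids and contents below are purely illustrative.
input_table = pa.table(
    {
        "document_id": ["doc-001", "doc-002"],  # must be unique across the dataset
        "contents": ["First document text.", "Second document text."],
    }
)
print(input_table.schema)
```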

To specify a pre-trained tokenizer, utilize the `--tkn_tokenizer` parameter.
This parameter accepts the name of a tokenizer ready for download from Hugging Face,
such as `hf-internal-testing/llama-tokenizer, bigcode/starcoder`, or any other tokenizer compatible
with the Hugging Face AutoTokenizer library. Additionally, you can employ the `--tkn_tokenizer_args` parameter
to include extra arguments specific to the chosen tokenizer.
For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which requires an access token,
you can specify `use_auth_token=<your token>` in `--tkn_tokenizer_args`.
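
As a rough sketch of passing these options programmatically (mirroring the local sample scripts included in this commit; the folder paths and tokenizer arguments below are placeholders):
```python
import sys

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from dpk_tokenization.transform_python import TokenizationPythonConfiguration

# Placeholder folders: point these at your own data.
local_conf = {"input_folder": "test-data/ds01/input", "output_folder": "output"}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "tkn_tokenizer": "hf-internal-testing/llama-tokenizer",
    # Extra tokenizer arguments, e.g. a cache dir or an auth token for gated models.
    "tkn_tokenizer_args": "cache_dir=/tmp/hf",
}
sys.argv = ParamsUtils.dict_to_req(d=params)
PythonTransformLauncher(runtime_config=TokenizationPythonConfiguration()).launch()
```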

The tokenization transform uses the specified tokenizer to tokenize each row
(assuming each row represents a document) of the input table and saves the result to the corresponding row of the output table.
The output table generally consists of four columns: `tokens`, `document_id`, `document_length`, and `token_count`.

The `tokens` column stores the sequence of token IDs generated by the tokenizer during the document tokenization process.
The `document_id` (or the designated name specified in `--tkn_doc_id_column`) contains the document ID,
while `document_length` and `token_count` respectively record the length of the document and the total count of generated tokens.
During tokenization, the tokenizer will disregard empty documents (rows) in the input table,
as well as documents that yield no tokens or encounter failure during tokenization.
The count of such documents will be stored in the `num_empty_rows` field of the `metadata` file.
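
As a quick sanity check of the output schema after a run, one could read back one of the produced parquet files; the file name below is hypothetical:
```python
import pyarrow.parquet as pq

# Read one of the produced parquet files (hypothetical file name) and
# list its columns, which should include the four described above.
table = pq.read_table("output/sample.parquet")
print(table.column_names)  # e.g. ['tokens', 'document_id', 'document_length', 'token_count']
```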


In certain cases, the tokenization process of some tokenizers may be sluggish,
particularly when handling lengthy documents containing millions of characters.
To address this, you can employ the `--tkn_chunk_size` parameter to define the length of chunks to tokenize at a given time.
For English text (`en`), it is recommended to set the chunk size to `20,000`, roughly equivalent to `15` pages of text.
The tokenizer will then tokenize each chunk separately and combine their resulting token IDs.
By default, the value of `--tkn_chunk_size` is `0`, indicating that each document is tokenized as a whole, regardless of its length.
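
The chunking idea itself can be illustrated with a standalone sketch (this is not the transform's internal code, and unlike the transform it cuts at fixed character offsets rather than rounding chunks to word boundaries):
```python
from transformers import AutoTokenizer


def tokenize_in_chunks(text: str, tokenizer, chunk_size: int = 20_000) -> list[int]:
    """Tokenize long text in fixed-size character chunks and concatenate the token IDs."""
    if chunk_size <= 0:
        return tokenizer(text)["input_ids"]
    token_ids: list[int] = []
    for start in range(0, len(text), chunk_size):
        token_ids.extend(tokenizer(text[start : start + chunk_size])["input_ids"])
    return token_ids


tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
print(len(tokenize_in_chunks("A long document. " * 10_000, tokenizer)))
```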



## Running

### CLI Options
The following command line arguments are available in addition to
the options provided by the [python launcher](../../../data-processing-lib/doc/python-launcher-options.md).
```
--tkn_tokenizer TKN_TOKENIZER
Tokenizer used for tokenization. It can also be a path to a pre-trained tokenizer. By default, `hf-internal-testing/llama-tokenizer` from HuggingFace is used
--tkn_tokenizer_args TKN_TOKENIZER_ARGS
Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace
--tkn_doc_id_column TKN_DOC_ID_COLUMN
Column that contains the document id; its values should be unique across the dataset
--tkn_doc_content_column TKN_DOC_CONTENT_COLUMN
Column that contains the document content
--tkn_text_lang TKN_TEXT_LANG
Specify language used in the text content for better text splitting if needed
--tkn_chunk_size TKN_CHUNK_SIZE
Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)
```

### Running the samples
To run the samples, use the following `make` target:

* `run-cli-sample` - runs dpk_tokenization/transform_python.py using command line args

This target will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the details of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
to see the results of the transform.

### Code example
Here is a sample [notebook](tokenization.ipynb)



### Transforming data using the transform image

To use the transform image to transform your data, please refer to the
[running images quickstart](../../../doc/quick-start/run-transform-image.md),
substituting the name of this transform image and runtime as appropriate.

# Tokenization Transform for Ray
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary
This project wraps the tokenization transform with a Ray runtime.

## Configuration and Command Line Options

Configuration and command line options are the same as for the base python transform.

### Launched Command Line Options
In addition to those available to the transform as defined here,
the set of
[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is also available.
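
A minimal sketch of launching the Ray runtime programmatically, mirroring the imports used by the Ray sample in this commit (folder paths are placeholders, and the `run_locally` flag is assumed to start a local Ray cluster):
```python
import sys

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from dpk_tokenization.ray.transform import TokenizationRayConfiguration

local_conf = {"input_folder": "test-data/ds01/input", "output_folder": "output"}
params = {
    "data_local_config": ParamsUtils.convert_to_ast(local_conf),
    "run_locally": True,  # assumed launcher flag to start a local Ray cluster
}
sys.argv = ParamsUtils.dict_to_req(d=params)
RayTransformLauncher(TokenizationRayConfiguration()).launch()
```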

@@ -15,12 +15,12 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from tokenization_transform_python import TokenizationPythonConfiguration
from dpk_tokenization.transform_python import TokenizationPythonConfiguration


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
@@ -15,12 +15,12 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from tokenization_transform_python import TokenizationPythonConfiguration
from dpk_tokenization.transform_python import TokenizationPythonConfiguration


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds02", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds02"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds02", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds02"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
@@ -15,12 +15,12 @@

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from tokenization_transform_ray import TokenizationRayConfiguration
from dpk_tokenization.ray.transform import TokenizationRayConfiguration


# create parameters
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "output", "ds01"))
input_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "test-data", "ds01", "input"))
output_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "output", "ds01"))
local_conf = {
"input_folder": input_folder,
"output_folder": output_folder,
@@ -15,7 +15,7 @@

from data_processing.utils import ParamsUtils
from data_processing_ray.runtime.ray import RayTransformLauncher
from tokenization_transform_ray import TokenizationRayConfiguration
from dpk_tokenization.ray.transform import TokenizationRayConfiguration


print(os.environ)
@@ -15,7 +15,7 @@
from data_processing_ray.runtime.ray.runtime_configuration import (
RayTransformRuntimeConfiguration,
)
from tokenization_transform import TokenizationTransformConfiguration
from dpk_tokenization.transform import TokenizationTransformConfiguration


logger = get_logger(__name__)
@@ -14,7 +14,7 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from tokenization_transform_python import TokenizationPythonConfiguration
from dpk_tokenization.transform_python import TokenizationPythonConfiguration


# create parameters
@@ -21,7 +21,7 @@

import pyarrow as pa
from data_processing.transform import AbstractTableTransform, TransformConfiguration
from tokenization_utils import is_valid_argument_string, load_tokenizer, split_text
from dpk_tokenization.utils import is_valid_argument_string, load_tokenizer, split_text


CHUNK_CHECKPOINT_INTERVAL = 100