Signed-off-by: matouma <[email protected]>
Showing 56 changed files with 418 additions and 716 deletions.
@@ -0,0 +1,16 @@
REPOROOT=../../..
# Use make help to see the available rules
include $(REPOROOT)/transforms/.make.cicd.targets

#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform; it is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

################################################################################
@@ -1,79 +1,21 @@
REPOROOT=../../..
# Use make help to see the available rules
include $(REPOROOT)/.make.defaults
include $(REPOROOT)/transforms/.make.cicd.targets

setup::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse
#
# This is intended to be included across the Makefiles provided within
# a given transform's directory tree, so must use compatible syntax.
#
################################################################################
# This defines the name of the transform; it is used to match against
# expected files and to define the transform's image name.
TRANSFORM_NAME=$(shell basename `pwd`)

clean::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse
################################################################################

build::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse
venv::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse

image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

set-versions:
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

publish::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-src::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

kind-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-save-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse
.PHONY: workflow-venv
workflow-venv:
	if [ -e kfp_ray ]; then \
		$(MAKE) -C kfp_ray workflow-venv; \
	fi

.PHONY: workflow-test
workflow-test:
	if [ -e kfp_ray ]; then \
		$(MAKE) -C kfp_ray workflow-test; \
	fi

.PHONY: workflow-upload
workflow-upload:
	if [ -e kfp_ray ]; then \
		$(MAKE) -C kfp_ray workflow-upload; \
	fi

.PHONY: workflow-build
workflow-build:
	if [ -e kfp_ray ]; then \
		$(MAKE) -C kfp_ray workflow-build; \
	fi

run-cli-sample:
	make venv
	source venv/bin/activate && \
	$(PYTHON) -m dpk_$(TRANSFORM_NAME).transform_python \
		--data_local_config "{ 'input_folder' : 'test-data/ds01/input', 'output_folder' : 'output'}"
@@ -1,13 +1,122 @@
# Tokenization Transform
The tokenization transform annotates pyarrow tables and parquet files
to add a column containing tokens for the document column.
Per the set of
<p align="left"> Distributed tokenization module for data sets using any Hugging Face compatible tokenizer.
<br>
</p>

## Contributors

- Xuan-Hong Dang ([email protected])

# Data Tokenization
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
the following runtimes are available:

* [python](python/README.md) - provides the core python-based transformation
implementation.
* [ray](ray/README.md) - enables the running of the python-based transformation
in a Ray runtime
* [kfp](kfp_ray/README.md) - enables running the transform's ray docker image
in a kubernetes cluster using a generated `yaml` file.
for details on general project conventions, transform configuration,
testing and IDE set up.
## Summary
The data tokenization transform operates by converting a (non-empty) input table into an output table
using a pre-trained tokenizer. The input table is required to have a minimum of two columns,
named `document_id` and `contents` by default. However, alternate column names can be specified using
`--tkn_doc_id_column` for the document id and `--tkn_doc_content_column` for the document contents.
It is essential for the values within the `document_id` column to be unique across the dataset,
while the `contents` column stores their respective document content. To execute example demonstrations within this directory,
a machine with `64GiB` of RAM is recommended.
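
To make the expected input shape concrete, here is a minimal sketch (not taken from this repository; the file path is illustrative) of writing a parquet file with the default `document_id` and `contents` columns:

```python
# Minimal illustration of the expected input shape: one row per document,
# unique values in `document_id`, raw text in `contents`.
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({
    "document_id": ["doc-001", "doc-002"],  # must be unique across the dataset
    "contents": ["First document text.", "Second document text."],
})
pq.write_table(table, "test-data/ds01/input/sample.parquet")  # illustrative path
```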

To specify a pre-trained tokenizer, utilize the `--tkn_tokenizer` parameter.
This parameter accepts the name of a tokenizer ready for download from Hugging Face,
such as `hf-internal-testing/llama-tokenizer`, `bigcode/starcoder`, or any other tokenizer compatible
with the Hugging Face AutoTokenizer library. Additionally, you can employ the `--tkn_tokenizer_args` parameter
to include extra arguments specific to the chosen tokenizer.
For instance, when loading a Hugging Face tokenizer like `bigcode/starcoder`, which necessitates an access token,
you can specify `use_auth_token=<your token>` in `--tkn_tokenizer_args`.
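
As a rough illustration (an assumption about the mapping, not code from this transform), the two parameters correspond to what gets passed to Hugging Face's `AutoTokenizer`:

```python
# Sketch of what --tkn_tokenizer and --tkn_tokenizer_args roughly configure;
# the transform's own loading code may differ.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "bigcode/starcoder",            # value of --tkn_tokenizer
    cache_dir="/tmp/hf",            # extra args as given via --tkn_tokenizer_args
    use_auth_token="<your token>",  # placeholder; required for gated models
)
print(tokenizer("Hello world")["input_ids"])
```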

The tokenization transform utilizes the specified tokenizer to tokenize each row of the input table
(assuming each row represents a document) and saves the result to the corresponding row of the output table.
The output table generally consists of four columns: `tokens`, `document_id`, `document_length`, and `token_count`.

The `tokens` column stores the sequence of token IDs generated by the tokenizer during the document tokenization process.
The `document_id` column (or the designated name specified in `--tkn_doc_id_column`) contains the document ID,
while `document_length` and `token_count` respectively record the length of the document and the total count of generated tokens.
During tokenization, the tokenizer will disregard empty documents (rows) in the input table,
as well as documents that yield no tokens or encounter failure during tokenization.
The count of such documents will be stored in the `num_empty_rows` field of the `metadata` file.
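
For example, an output file can be inspected with pyarrow (a hedged sketch; the output file name is illustrative):

```python
# Read one output parquet file and look at the columns described above.
import pyarrow.parquet as pq

out = pq.read_table("output/sample.parquet")  # illustrative output path
print(out.column_names)  # e.g. ['tokens', 'document_id', 'document_length', 'token_count']
print(out.column("token_count").to_pylist())
```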

In certain cases, the tokenization process of some tokenizers may be sluggish,
particularly when handling lengthy documents containing millions of characters.
To address this, you can employ the `--tkn_chunk_size` parameter to define the length of chunks to tokenize at a given time.
For English text (`en`), it is recommended to set the chunk size to `20,000`, roughly equivalent to `15` pages of text.
The tokenizer will then tokenize each chunk separately and combine their resulting token IDs.
By default, the value of `--tkn_chunk_size` is `0`, indicating that each document is tokenized as a whole, regardless of its length.
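
The idea behind chunked tokenization can be sketched as follows (a simplified illustration that splits on raw character counts and ignores the word-boundary rounding mentioned below; it is not the transform's actual implementation):

```python
# Tokenize a long document in fixed-size character chunks and concatenate the ids.
def tokenize_in_chunks(tokenizer, text: str, chunk_size: int = 20_000) -> list[int]:
    if chunk_size <= 0:
        # chunk_size 0 means: tokenize the whole document at once
        return tokenizer(text, add_special_tokens=False)["input_ids"]
    token_ids: list[int] = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        token_ids.extend(tokenizer(chunk, add_special_tokens=False)["input_ids"])
    return token_ids
```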

## Running

### CLI Options
The following command line arguments are available in addition to
the options provided by the [python launcher](../../../data-processing-lib/doc/python-launcher-options.md).
```
--tkn_tokenizer TKN_TOKENIZER
                      Tokenizer used for tokenization. It also can be a path to a pre-trained tokenizer. By default, `hf-internal-testing/llama-tokenizer` from HuggingFace is used
--tkn_tokenizer_args TKN_TOKENIZER_ARGS
                      Arguments for tokenizer. For example, `cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token` could be arguments for tokenizer `bigcode/starcoder` from HuggingFace
--tkn_doc_id_column TKN_DOC_ID_COLUMN
                      Column contains document id which values should be unique across dataset
--tkn_doc_content_column TKN_DOC_CONTENT_COLUMN
                      Column contains document content
--tkn_text_lang TKN_TEXT_LANG
                      Specify language used in the text content for better text splitting if needed
--tkn_chunk_size TKN_CHUNK_SIZE
                      Specify >0 value to tokenize each row/doc in chunks of characters (rounded in words)
```
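
Note that `--tkn_tokenizer_args` takes a single comma-separated string of `key=value` pairs, as in the example above. A simple sketch of turning such a string into keyword arguments (the transform's own parsing may differ) looks like this:

```python
# Split "cache_dir=/tmp/hf,use_auth_token=TOKEN" into {'cache_dir': '/tmp/hf', ...}.
def parse_tokenizer_args(arg_string: str) -> dict:
    if not arg_string:
        return {}
    return dict(pair.split("=", 1) for pair in arg_string.split(","))

print(parse_tokenizer_args("cache_dir=/tmp/hf,use_auth_token=Your_HF_authentication_token"))
```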

### Running the samples
To run the samples, use the following `make` target:

* `run-cli-sample` - runs dpk_tokenization/transform_python.py using command line args

This target will activate the virtual environment and set up any configuration needed.
Use the `-n` option of `make` to see the detail of what is done to run the sample.

For example,
```shell
make run-cli-sample
...
```
Then
```shell
ls output
```
to see the results of the transform.
### Code example
Here is a sample [notebook](tokenization.ipynb).

### Transforming data using the transform image

To use the transform image to transform your data, please refer to the
[running images quickstart](../../../doc/quick-start/run-transform-image.md),
substituting the name of this transform image and runtime as appropriate.
# Tokenization Transform for Ray
Please see the set of
[transform project conventions](../../README.md#transform-project-conventions)
for details on general project conventions, transform configuration,
testing and IDE set up.

## Summary
This project wraps the tokenization transform with a Ray runtime.

## Configuration and command line Options

Configuration and command line options are the same as for the base python transform.

### Launched Command Line Options
In addition to those available to the transform as defined here,
the set of
[ray launcher options](../../../data-processing-lib/doc/ray-launcher-options.md) is available.