-
Notifications
You must be signed in to change notification settings - Fork 163
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Build transforms wheel #493
Changes from 5 commits
ddedccc
134c2c2
e227b9d
d155a24
fe0d380
b1e2707
d96477a
3c57682
35c7e60
d66df27
a4f7e0a
0af03cc
65f4ac4
703ebe0
d54708a
e1309e5
109ea29
a874358
440975d
10c9159
db60963
9bb36c5
ee63628
346b82e
071836e
07b827f
0369842
aa297e0
32578d5
3b52ecf
eb499de
8d69b71
d27a1c2
e155d2c
33b8853
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
**/src | ||
**/dist | ||
**/*.egg-info | ||
**/build | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
# Define the root of the local git clone so that the common rules | ||
# know where they are running from. | ||
REPOROOT=../../.. | ||
# Include a library of common .transform.* targets which most | ||
# transforms should be able to reuse. However, feel free | ||
# to override/redefine the rules below. | ||
|
||
# $(REPOROOT)/.make.versions file contains the versions | ||
|
||
#TRANSFORM_NAME=doc_quality | ||
|
||
include $(REPOROOT)/transforms/.make.transforms | ||
|
||
TRANSFORMS_NAMES = code/code_quality \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should build this list automatically by looking for the python directories in transforms// |
||
code/code2parquet \ | ||
code/header_cleanser \ | ||
code/code_quality \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. code2parquet, malware There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. malware transforms need some additional work to be included as a pip install and was intentionally excluded from this initial release... It seem to require a docker container and does not fit in the current use case for running these transforms in a notebook. |
||
code/proglang_select \ | ||
language/doc_chunk \ | ||
language/doc_quality \ | ||
language/lang_id \ | ||
language/pdf2parquet \ | ||
language/text_encoder \ | ||
universal/ededup \ | ||
universal/filter \ | ||
universal/resize \ | ||
universal/tokenization | ||
|
||
# Create the local virtual environment through the shared
# .defaults.create-venv target (presumably supplied by the included makefile),
# then install this package into it.
# `.` is the POSIX spelling of `source`, so the recipe does not depend on SHELL
# being bash, and `&&` stops pip from running outside the venv when activation
# fails.
venv:
	$(MAKE) .defaults.create-venv
	. venv/bin/activate && \
	$(PYTHON) -m pip install .
|
||
|
||
# Full local test cycle: assemble src/, create the venv, run the unit tests.
test:: setup venv test-src

# Remove everything this Makefile generates: the assembled src/ tree and the
# packaging by-products (*.egg-info, dist, build). The leading `-` keeps
# `make clean` going even if one removal fails.
clean:: .transforms.clean
	-rm -fr src
	-rm -fr *.egg-info
	-rm -fr dist
	-rm -fr build

image:: .transforms.python-image
|
||
# Run the unit tests of every transform listed in TRANSFORMS_NAMES inside the
# local venv. All suites are run; the recipe exits non-zero if any of them
# failed (a plain `for ...; done` would only report the status of the last
# pytest invocation).
test-src::
	. venv/bin/activate || exit 1; \
	rc=0; \
	for T in $(TRANSFORMS_NAMES); do \
		echo running unit test on: $$T ; \
		$(PYTEST) $(REPOROOT)/transforms/$$T/python/test || rc=1; \
	done; \
	exit $$rc
|
||
# Validate the published wheel: create a fresh venv, install the released
# package from the package index instead of the local sources, then run the
# same unit tests. `&&` makes sure pip only runs once the venv is active.
test-with-pypi:
	$(MAKE) .defaults.create-venv
	. venv/bin/activate && \
	$(PYTHON) -m pip install data_prep_toolkit_transforms==0.2.1.dev0
	$(MAKE) test-src
|
||
|
||
setup: .transforms.setup
	$(MAKE) src

# Assemble a single src/ tree from the python/src directory of every transform
# in TRANSFORMS_NAMES; this merged tree is what gets packaged into the wheel.
#  - `mkdir -p` guarantees the destination exists before the first copy.
#  - Copying `src/.` merges the directory *contents* with both GNU and BSD cp
#    (copying `src/` into an existing directory nests it as src/src with GNU cp).
#  - A failed copy aborts the recipe instead of producing an incomplete tree.
#  - Stale packaging by-products are removed once, after the loop.
# NOTE: `src` is a real directory target, so once it exists make treats it as
# up to date; run `make clean` first to refresh it.
src:
	mkdir -p src
	for T in $(TRANSFORMS_NAMES); do \
		echo copy src from $$T ; \
		cp -R $(REPOROOT)/transforms/$$T/python/src/. src/ || exit 1; \
	done
	-rm -fr *.egg-info
	-rm -fr dist
	-rm -fr build
|
||
|
||
# Packaging entry points. `build` and `publish` are thin aliases; the actual
# work is delegated to the shared .defaults.* targets.
build-dist:: .defaults.build-dist
build:: build-dist

publish-dist:: .defaults.publish-dist
publish:: publish-dist
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# DPK Python Transforms | ||
|
||
## installation | ||
|
||
The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed with pip: | ||
|
||
`python -m pip install data-prep-toolkit-transforms` | ||
|
||
installing the python transforms will also install `data-prep-toolkit` | ||
|
||
## List of Transforms in current package | ||
|
||
* code | ||
* [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/python/README.md) | ||
* header_cleanser (Not available on MacOS) | ||
* code_quality | ||
* proglang_select | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. malware There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. malware transforms need some additional work to be included as a pip install and was intentionally excluded from this initial release... It seem to require a docker container and does not fit in the current use case for running these transforms in a notebook. |
||
* language | ||
* doc_chunk | ||
* doc_quality | ||
* lang_id | ||
* pdf2parquet | ||
* text_encoder | ||
* universal | ||
* ededup | ||
* filter | ||
* resize | ||
* tokenization | ||
|
||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
[project]
name = "data_prep_toolkit_transforms"
version = "0.2.1.dev0"
requires-python = ">=3.10,<3.12"
keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
description = "Data Preparation Toolkit Transforms"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
    { name = "Maroun Touma", email = "[email protected]" },
]

# Union of the runtime requirements of the transforms bundled in this wheel.
# `argparse` is intentionally not listed: it is part of the standard library
# for every interpreter allowed by `requires-python`, and the PyPI package of
# that name is only a backport for very old Python versions.
dependencies = [
    "data-prep-toolkit==0.2.1.dev0",
    "boto3==1.34.69",
    "bs4==0.0.2",
    "clamd==1.0.2",
    "docling[ocr]==1.1.2",
    "duckdb==0.10.1",
    "fasttext==0.9.2",
    "filetype >=1.2.0, <2.0.0",
    "huggingface-hub >= 0.21.4, <1.0.0",
    "langcodes==3.3.0",
    "mmh3==4.1.0",
    "numpy==1.26.4",
    "pandas",
    "parameterized",
    "pyarrow==16.1.0",
    "python-dateutil>=2.8.2",
    "pytz>=2020.1",
    "quackling==0.1.0",
    # scancode-toolkit is skipped on macOS (see the header_cleanser note in the README).
    "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'",
    "sentence-transformers==3.0.1",
    "transformers==4.38.2",
    "tzdata>=2022.7",
    "xxhash==3.4.1",
]
|
||
[build-system] | ||
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
|
||
# NOTE(review): `[options]` and `[options.packages.find]` are setup.cfg section
# names. setuptools reads its pyproject.toml settings from `[tool.setuptools]`,
# so these two tables are presumably ignored and the src/ layout is picked up
# by automatic discovery - TODO confirm before relying on them.
[options] | ||
package_dir = ["src"] | ||
|
||
[options.packages.find] | ||
where = ["src/"] | ||
|
||
[tool.pytest.ini_options] | ||
# Currently we use low coverage since we have to run tests separately (see makefile) | ||
#addopts = "--cov --cov-report term-missing --cov-fail-under 25" | ||
markers = ["unit: unit tests", "integration: integration tests"] | ||
|
||
[tool.coverage.run] | ||
include = ["src/*"] | ||
|
||
|
||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
# Define the root of the local git clone so that the common rules | ||
# know where they are running from. | ||
REPOROOT=../../.. | ||
# Include a library of common .transform.* targets which most | ||
# transforms should be able to reuse. However, feel free | ||
# to override/redefine the rules below. | ||
|
||
# $(REPOROOT)/.make.versions file contains the versions | ||
|
||
include $(REPOROOT)/transforms/.make.transforms | ||
|
||
|
||
# Transforms whose ray/src and ray/test directories are consumed by the `src`
# and `test-src` recipes; entries are paths relative to $(REPOROOT)/transforms.
# Presumably enumerated with: find . -name src | grep ray/src
TRANSFORMS_NAMES = \
    code/proglang_select \
    code/header_cleanser \
    code/code_quality \
    code/repo_level_ordering \
    code/code2parquet \
    language/doc_quality \
    language/doc_chunk \
    language/lang_id \
    language/text_encoder \
    language/pdf2parquet \
    universal/fdedup \
    universal/tokenization \
    universal/ededup \
    universal/profiler \
    universal/doc_id \
    universal/filter \
    universal/resize
|
||
|
||
# Create the local virtual environment through the shared
# .defaults.create-venv target (presumably supplied by the included makefile),
# then install this package into it.
# `.` is the POSIX spelling of `source`, so the recipe does not depend on SHELL
# being bash, and `&&` stops pip from running outside the venv when activation
# fails.
venv:
	$(MAKE) .defaults.create-venv
	. venv/bin/activate && \
	$(PYTHON) -m pip install .
|
||
|
||
# Full local test cycle: assemble src/, create the venv, run the unit tests.
test:: setup venv test-src

# Delete the assembled sources and all packaging by-products in one go;
# the leading `-` makes failures non-fatal.
clean:: .transforms.clean
	-rm -fr src *.egg-info dist build

# NOTE(review): this Ray makefile delegates to .transforms.python-image -
# confirm that a Ray-specific image target is not intended here.
image:: .transforms.python-image
|
||
# Run the Ray unit tests of every transform listed in TRANSFORMS_NAMES inside
# the local venv. All suites are run; the recipe exits non-zero if any of them
# failed (a plain `for ...; done` would only report the status of the last
# pytest invocation).
test-src::
	. venv/bin/activate || exit 1; \
	rc=0; \
	for T in $(TRANSFORMS_NAMES); do \
		echo running unit test on: $$T/ray/test ; \
		$(PYTEST) $(REPOROOT)/transforms/$$T/ray/test || rc=1; \
	done; \
	exit $$rc
|
||
# Validate the published wheel: start from a clean tree, create a fresh venv,
# install the released package from the package index instead of the local
# sources, then run the same unit tests. `&&` makes sure pip only runs once
# the venv is active.
test-with-pypi:
	$(MAKE) clean
	$(MAKE) .defaults.create-venv
	. venv/bin/activate && \
	$(PYTHON) -m pip install data_prep_toolkit_transforms_ray==0.2.1.dev0
	$(MAKE) test-src
|
||
|
||
setup: .transforms.setup
	$(MAKE) src

# Assemble a single src/ tree from the ray/src directory of every transform in
# TRANSFORMS_NAMES; this merged tree is what gets packaged into the wheel.
#  - `mkdir -p` guarantees the destination exists before the first copy.
#  - Copying `src/.` merges the directory *contents* with both GNU and BSD cp
#    (copying `src/` into an existing directory nests it as src/src with GNU cp).
#  - A failed copy aborts the recipe instead of producing an incomplete tree.
# NOTE: `src` is a real directory target, so once it exists make treats it as
# up to date; run `make clean` first to refresh it.
src:
	mkdir -p src
	for T in $(TRANSFORMS_NAMES); do \
		echo copy src from $$T/ray/src ; \
		cp -R $(REPOROOT)/transforms/$$T/ray/src/. src/ || exit 1; \
	done
	-rm -fr *.egg-info
	-rm -fr dist
	-rm -fr build
|
||
|
||
# Packaging entry points. `build` and `publish` are thin aliases; the actual
# work is delegated to the shared .defaults.* targets.
build-dist:: .defaults.build-dist
build:: build-dist

publish-dist:: .defaults.publish-dist
publish:: publish-dist
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# DPK Ray Transforms | ||
|
||
## installation | ||
|
||
The [transforms](https://github.com/IBM/data-prep-kit/blob/dev/transforms/README.md) are delivered as a standard Python library available on PyPI and can be installed with pip: | ||
|
||
`python -m pip install data-prep-toolkit-transforms-ray` | ||
|
||
installing the Ray transforms will also install `data_prep_toolkit_transforms` and `data-prep-toolkit-ray` | ||
|
||
## List of Ray Transforms available in the current package | ||
|
||
* code | ||
* [code2parquet](https://github.com/IBM/data-prep-kit/blob/dev/transforms/code/code2parquet/ray/README.md) | ||
* proglang_select | ||
* header_cleanser (Not available on MacOS) | ||
* code_quality | ||
* repo_level_ordering | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. malware There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. malware transforms need some additional work to be included as a pip install and was intentionally excluded from this initial release... It seem to require a docker container and does not fit in the current use case for running these transforms in a notebook. |
||
* language | ||
* doc_quality | ||
* doc_chunk | ||
* lang_id | ||
* text_encoder | ||
* pdf2parquet | ||
* universal | ||
* fdedup | ||
* tokenization | ||
* ededup | ||
* profiler | ||
* doc_id | ||
* filter | ||
* resize | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. profiler, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @blublinsky what is meant by this comment? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. missing transform |
||
|
||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
[project]
name = "data_prep_toolkit_transforms_ray"
version = "0.2.1.dev0"
requires-python = ">=3.10,<3.12"
keywords = ["transforms", "data preprocessing", "data preparation", "llm", "generative", "ai", "fine-tuning", "llmapps" ]
description = "Data Preparation Toolkit Transforms using Ray"
license = {text = "Apache-2.0"}
readme = {file = "README.md", content-type = "text/markdown"}
authors = [
    { name = "Maroun Touma", email = "[email protected]" },
]

# Runtime requirements of the Ray transforms. The pure-Python transforms and
# the Ray runtime library are pulled in through the first two entries.
# Each requirement is listed exactly once (the original repeated tqdm).
dependencies = [
    "data_prep_toolkit_transforms==0.2.1.dev0",
    "data-prep-toolkit-ray==0.2.1.dev0",
    # scancode-toolkit is skipped on macOS (see the header_cleanser note in the README).
    "scancode-toolkit==32.1.0 ; platform_system != 'Darwin'",
    "parameterized",
    "tqdm==4.66.3",
    "mmh3==4.1.0",
    "xxhash==3.4.1",
    "scipy==1.12.0",
    "networkx==3.3",
    "colorlog==6.8.2",
    "func-timeout==4.3.5",
    "pandas==2.2.2",
    "emerge-viz==2.0.0",
]
|
||
[build-system] | ||
requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] | ||
build-backend = "setuptools.build_meta" | ||
|
||
|
||
# NOTE(review): `[options]` and `[options.packages.find]` are setup.cfg section
# names. setuptools reads its pyproject.toml settings from `[tool.setuptools]`,
# so these two tables are presumably ignored and the src/ layout is picked up
# by automatic discovery - TODO confirm before relying on them.
[options] | ||
package_dir = ["src"] | ||
|
||
[options.packages.find] | ||
where = ["src/"] | ||
|
||
[tool.pytest.ini_options] | ||
# Currently we use low coverage since we have to run tests separately (see makefile) | ||
#addopts = "--cov --cov-report term-missing --cov-fail-under 25" | ||
markers = ["unit: unit tests", "integration: integration tests"] | ||
|
||
[tool.coverage.run] | ||
include = ["src/*"] | ||
|
||
|
||
|
||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we combine it (except src) with the .gitignore in the root directory?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That would work too. I think there may be situations in the future where we want to add the wheel for specific transforms to git so folks can download it from git. This is still under discussion but yes, once we have consensus, we should move the gitignore to a higher level as appropriate.