# Packaging configuration for the data_prep_toolkit_transforms distribution.
# Core distribution metadata.
[project]
name = "data_prep_toolkit_transforms"
version = "0.2.3.dev1"
description = "Data Preparation Toolkit Transforms using Ray"
readme = { file = "README-list.md", content-type = "text/markdown" }
requires-python = ">=3.10,<3.13"
license = { text = "Apache-2.0" }
keywords = [
    "transforms",
    "data preprocessing",
    "data preparation",
    "llm",
    "generative",
    "ai",
    "fine-tuning",
    "llmapps",
]
# NOTE(review): the email value below looks like an obfuscation placeholder rather
# than a real address - confirm and restore the actual address.
authors = [
    { name = "Maroun Touma", email = "[email protected]" },
]
# These two fields are not listed here; they are declared dynamic and configured
# under [tool.setuptools.dynamic].
dynamic = ["dependencies", "optional-dependencies"]
# Build backend and the packages that must be installed to run it.
[build-system]
build-backend = "setuptools.build_meta"
requires = [
    "setuptools>=68.0.0",
    "wheel",
    "setuptools_scm[toml]>=7.1.0",
]
# The dynamic `dependencies` field is filled from requirements.txt.
[tool.setuptools.dynamic]
dependencies = { file = ["requirements.txt"] }
# Optional dependency groups (extras). Each `<extra>.file` lists the requirement
# file(s) that the extra is populated from.
[tool.setuptools.dynamic.optional-dependencies]
dev.file = ["requirements-dev.txt"]
ray.file = ["requirements-ray.txt"]

# Aggregate extra: combines the per-transform requirement files, except the two
# entries that are commented out inside the list.
all.file = [
    "code/proglang_select/python/requirements.txt",
    "code/header_cleanser/python/requirements.txt",
    "code/license_select/python/requirements.txt",
    "code/code_quality/python/requirements.txt",
    "code/code2parquet/python/requirements.txt",
    "language/doc_quality/python/requirements.txt",
    "language/doc_chunk/python/requirements.txt",
    # html2parquet cannot be included until this lxml conflict is solved:
    #   docling-ibm-models 1.1.7 depends on lxml<5.0.0 and >=4.9.1
    #   trafilatura 1.12.0 depends on lxml>=5.2.2; platform_system != "Darwin" or python_version > "3.8"
    # "language/html2parquet/python/requirements.txt",
    # pii_redactor is left out because its unit tests seem to be failing:
    # "language/pii_redactor/python/requirements.txt",
    "language/lang_id/python/requirements.txt",
    "language/text_encoder/python/requirements.txt",
    "language/pdf2parquet/python/requirements.txt",
    "universal/hap/python/requirements.txt",
    "universal/tokenization/python/requirements.txt",
    "universal/ededup/python/requirements.txt",
    "universal/fdedup/python/requirements.txt",
    "universal/profiler/python/requirements.txt",
    "universal/doc_id/python/requirements.txt",
    "universal/filter/python/requirements.txt",
    "universal/resize/python/requirements.txt",
    "universal/web2parquet/requirements.txt",
]

# One extra per transform.
# This pyproject.toml must sit in a parent directory of the requirement files, not
# in a sibling directory: a path such as '../code/proglang_select/python/..'
# cannot be accessed.
proglang_select.file = ["code/proglang_select/python/requirements.txt"]
header_cleanser.file = ["code/header_cleanser/python/requirements.txt"]
license_select.file = ["code/license_select/python/requirements.txt"]
code_quality.file = ["code/code_quality/python/requirements.txt"]
code2parquet.file = ["code/code2parquet/python/requirements.txt"]
doc_quality.file = ["language/doc_quality/python/requirements.txt"]
doc_chunk.file = ["language/doc_chunk/python/requirements.txt"]
html2parquet.file = ["language/html2parquet/python/requirements.txt"]
pii_redactor.file = ["language/pii_redactor/python/requirements.txt"]
lang_id.file = ["language/lang_id/python/requirements.txt"]
text_encoder.file = ["language/text_encoder/python/requirements.txt"]
pdf2parquet.file = ["language/pdf2parquet/python/requirements.txt"]
hap.file = ["universal/hap/python/requirements.txt"]
tokenization.file = ["universal/tokenization/python/requirements.txt"]
ededup.file = ["universal/ededup/python/requirements.txt"]
fdedup.file = ["universal/fdedup/python/requirements.txt"]
profiler.file = ["universal/profiler/python/requirements.txt"]
doc_id.file = ["universal/doc_id/python/requirements.txt"]
filter.file = ["universal/filter/python/requirements.txt"]
resize.file = ["universal/resize/python/requirements.txt"]
web2parquet.file = ["universal/web2parquet/requirements.txt"]
# The explicit setuptools package configuration below does not seem to work for our
# custom layout. Instead, all files are copied into a single src directory so that
# automatic package discovery can find them.
#[tool.setuptools.package-data]
#"*" = ["*.txt"]
#[tool.setuptools.packages.find]
#where = ["src"]
#[tool.setuptools.package-dir]
#dpk_web2parquet = "universal/web2parquet/dpk_web2parquet"
# NOTE(review): `options` and `options.packages.find` are setup.cfg section names.
# setuptools documents its pyproject.toml settings under [tool.setuptools], so these
# keys are presumably not read by the build - confirm before relying on them.
[options]
package_dir = ["src", "test"]
packages.find.where = ["src"]
[tool.pytest.ini_options]
# The coverage threshold is kept low because the tests have to be run separately
# (see makefile); the coverage options are currently commented out.
#addopts = "--cov --cov-report term-missing --cov-fail-under 25"
markers = [
    "unit: unit tests",
    "integration: integration tests",
]