Skip to content

Commit 843b4e9

Browse files
Ruff - Entire Repo (#650)
1 parent 81983ad commit 843b4e9

317 files changed

Lines changed: 10676 additions & 12804 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

.github/workflows/ruff.yml

Lines changed: 11 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,11 @@
1+
name: Ruff Linter
2+
on: [push, pull_request]
3+
jobs:
4+
ruff:
5+
runs-on: ubuntu-latest
6+
steps:
7+
- uses: actions/checkout@v3
8+
- uses: astral-sh/ruff-action@v3
9+
with:
10+
version: 0.11.4
11+
args: "check --output-format=github"

.pre-commit-config.yaml

Lines changed: 9 additions & 15 deletions
Original file line number · Diff line number · Diff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ default_install_hook_types: [pre-commit, commit-msg]
2424

2525
repos:
2626
- repo: https://github.com/pre-commit/pre-commit-hooks
27-
rev: v4.6.0
27+
rev: v5.0.0
2828
hooks:
2929
- id: check-added-large-files
3030
args: ['--maxkb=1000']
@@ -35,20 +35,14 @@ repos:
3535
exclude: docs/
3636
- id: requirements-txt-fixer
3737
- id: trailing-whitespace
38-
exclude: nemo_curator/utils/aegis_utils.py
39-
40-
- repo: https://github.com/psf/black
41-
rev: 24.4.2
42-
hooks:
43-
- id: black
44-
name: Format code
45-
46-
- repo: https://github.com/PyCQA/isort
47-
rev: 5.13.2
38+
- repo: https://github.com/astral-sh/ruff-pre-commit
39+
rev: v0.11.4
4840
hooks:
49-
- id: isort
50-
name: Format imports
51-
exclude: docs/|nemo_curator/modules/__init__.py
41+
# Run the linter
42+
- id: ruff
43+
args: [--fix] # Enables auto-fixing lint errors
44+
# Run the formatter
45+
- id: ruff-format
5246

5347
- repo: local
5448
hooks:

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -45,7 +45,7 @@ The documentation should also cover potential pitfalls and performance considera
4545
The existing examples and documentation should serve as a good reference to what is expected.
4646

4747
## Python style
48-
We use ``black`` as our style guide. To fix your format run `pip install pre-commit && pre-commit install && pre-commit run --all`.
48+
We use ``ruff`` as our style guide. To fix your format run `pre-commit install && pre-commit run --all`.
4949

5050
1. Include docstrings for every class and method exposed to the user.
5151
1. Avoid wild import: ``from X import *`` unless in ``X.py``, ``__all__`` is defined.

README.md

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -110,6 +110,7 @@ These extras are available for all installation methods provided.
110110

111111
```bash
112112
pip install nemo-curator # Installs CPU-only text curation modules
113+
pip install nemo-curator[dev] # Installs libraries required for development
113114
pip install --extra-index-url https://pypi.nvidia.com nemo-curator[cuda12x] # Installs CPU + GPU text curation modules
114115
pip install --extra-index-url https://pypi.nvidia.com nemo-curator[image] # Installs CPU + GPU text and image curation modules
115116
pip install --extra-index-url https://pypi.nvidia.com nemo-curator[all] # Installs all of the above

conftest.py

Lines changed: 6 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -1,3 +1,5 @@
1+
from collections.abc import Generator
2+
13
import pytest
24
from dask.distributed import Client
35

@@ -8,13 +10,11 @@
810
LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster")
911

1012

11-
def pytest_addoption(parser):
12-
parser.addoption(
13-
"--cpu", action="store_true", default=False, help="Run tests without gpu marker"
14-
)
13+
def pytest_addoption(parser: pytest.Parser) -> None:
14+
parser.addoption("--cpu", action="store_true", default=False, help="Run tests without gpu marker")
1515

1616

17-
def pytest_collection_modifyitems(config, items):
17+
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
1818
if config.getoption("--cpu"):
1919
skip_gpu = pytest.mark.skip(reason="Skipping GPU tests")
2020
for item in items:
@@ -23,7 +23,7 @@ def pytest_collection_modifyitems(config, items):
2323

2424

2525
@pytest.fixture(autouse=True, scope="session")
26-
def gpu_client(request):
26+
def gpu_client(request: pytest.FixtureRequest) -> Generator[Client, None, None]:
2727
if not request.config.getoption("--cpu"):
2828
with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client:
2929
request.session.client = client

examples/async_llm_pii_redaction.py

Lines changed: 2 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -12,7 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import dask.dataframe
1615
import pandas as pd
1716

1817
from nemo_curator.datasets import DocumentDataset
@@ -21,8 +20,8 @@
2120
from nemo_curator.utils.distributed_utils import get_client
2221

2322

24-
def console_script():
25-
_ = get_client()
23+
def console_script() -> None:
24+
client = get_client() # noqa: F841
2625

2726
dataframe = pd.DataFrame(
2827
{

examples/blend_and_shuffle.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,15 @@
2020
from nemo_curator.utils.script_utils import ArgumentHelper
2121

2222

23-
def main(args):
23+
def main(args: argparse.Namespace) -> None:
2424
# Params
2525
dataset_paths = ["/path/to/first", "/path/to/second", "/path/to/third"]
2626
dataset_weights = [5.0, 2.0, 1.0]
2727
target_size = 1000
2828
output_path = "/path/to/output"
2929

3030
# Set up Dask client
31-
client = get_client(**ArgumentHelper.parse_client_args(args))
31+
client = get_client(**ArgumentHelper.parse_client_args(args)) # noqa: F841
3232

3333
# Blend the datasets
3434
datasets = [DocumentDataset.read_json(path) for path in dataset_paths]
@@ -42,12 +42,10 @@ def main(args):
4242

4343

4444
def attach_args(
45-
parser=argparse.ArgumentParser(
46-
formatter_class=argparse.ArgumentDefaultsHelpFormatter
47-
),
48-
):
45+
parser: argparse.ArgumentParser,
46+
) -> argparse.ArgumentParser:
4947
return ArgumentHelper(parser).add_distributed_args()
5048

5149

5250
if __name__ == "__main__":
53-
main(attach_args().parse_args())
51+
main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())

examples/classifier_filtering.py

Lines changed: 11 additions & 21 deletions
Original file line number · Diff line number · Diff line change
@@ -26,27 +26,23 @@
2626
from nemo_curator.utils.script_utils import ArgumentHelper
2727

2828

29-
def load_dataset(input_data_dir):
29+
def load_dataset(input_data_dir: str) -> DocumentDataset:
3030
files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
3131
raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
32-
dataset = DocumentDataset(raw_data)
32+
return DocumentDataset(raw_data)
3333

34-
return dataset
3534

36-
37-
def create_samples(data_path, label, num_samples):
35+
def create_samples(data_path: str, label: str, num_samples: int) -> list[str]:
3836
raw_dataset = load_dataset(data_path)
3937
label_quality = nc.Modify(FastTextLabelModifier(label))
4038

4139
labeled_dataset = label_quality(raw_dataset)
42-
labeled_samples = labeled_dataset.df.sample(
43-
frac=num_samples / len(labeled_dataset.df)
44-
)
40+
labeled_samples = labeled_dataset.df.sample(frac=num_samples / len(labeled_dataset.df))
4541

4642
return labeled_samples["text"].compute().values.tolist()
4743

4844

49-
def main(args):
45+
def main(args: argparse.Namespace) -> None:
5046
# Params
5147
low_quality_data_path = "/path/to/low_quality"
5248
high_quality_data_path = "/path/to/high_quality"
@@ -55,13 +51,9 @@ def main(args):
5551
filtered_output = "/path/to/output"
5652

5753
# Prepare samples for the classifier
58-
client = get_client(**ArgumentHelper.parse_client_args(args))
59-
low_quality_samples = create_samples(
60-
low_quality_data_path, "__label__lq", num_low_quality_samples
61-
)
62-
high_quality_samples = create_samples(
63-
high_quality_data_path, "__label__hq", num_high_quality_samples
64-
)
54+
client = get_client(**ArgumentHelper.parse_client_args(args)) # noqa: F841
55+
low_quality_samples = create_samples(low_quality_data_path, "__label__lq", num_low_quality_samples)
56+
high_quality_samples = create_samples(high_quality_data_path, "__label__hq", num_high_quality_samples)
6557

6658
train_samples = low_quality_samples + high_quality_samples
6759
random.shuffle(train_samples)
@@ -96,12 +88,10 @@ def main(args):
9688

9789

9890
def attach_args(
99-
parser=argparse.ArgumentParser(
100-
formatter_class=argparse.ArgumentDefaultsHelpFormatter
101-
),
102-
):
91+
parser: argparse.ArgumentParser,
92+
) -> argparse.ArgumentParser:
10393
return ArgumentHelper(parser).add_distributed_args()
10494

10595

10696
if __name__ == "__main__":
107-
main(attach_args().parse_args())
97+
main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())

examples/classifiers/aegis_example.py

Lines changed: 10 additions & 14 deletions
Original file line number · Diff line number · Diff line change
@@ -21,21 +21,19 @@
2121
from nemo_curator.utils.script_utils import ArgumentHelper
2222

2323

24-
def main(args):
24+
def main(args: argparse.Namespace) -> None:
2525
global_st = time.time()
2626

2727
# Input can be a string or list
2828
input_file_path = "/path/to/data"
2929
output_file_path = "./"
30-
huggingface_token = "hf_1234" # Replace with a HuggingFace user access token
30+
huggingface_token = "hf_1234" # Replace with a HuggingFace user access token # noqa: S105
3131

3232
client_args = ArgumentHelper.parse_client_args(args)
3333
client_args["cluster_type"] = "gpu"
3434
client = get_client(**client_args)
3535

36-
input_dataset = DocumentDataset.read_json(
37-
input_file_path, backend="cudf", add_filename=True
38-
)
36+
input_dataset = DocumentDataset.read_json(input_file_path, backend="cudf", add_filename=True)
3937

4038
safety_classifier = AegisClassifier(
4139
aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0",
@@ -48,23 +46,21 @@ def main(args):
4846

4947
global_et = time.time()
5048
print(
51-
f"Total time taken for AEGIS classifier inference: {global_et-global_st} s",
49+
f"Total time taken for AEGIS classifier inference: {global_et - global_st} s",
5250
flush=True,
5351
)
5452

5553
client.close()
5654

5755

5856
def attach_args(
59-
parser=argparse.ArgumentParser(
60-
formatter_class=argparse.ArgumentDefaultsHelpFormatter
61-
),
62-
):
63-
argumentHelper = ArgumentHelper(parser)
64-
argumentHelper.add_distributed_classifier_cluster_args()
57+
parser: argparse.ArgumentParser,
58+
) -> argparse.ArgumentParser:
59+
arg_helper = ArgumentHelper(parser)
60+
arg_helper.add_distributed_classifier_cluster_args()
6561

66-
return argumentHelper.parser
62+
return arg_helper.parser
6763

6864

6965
if __name__ == "__main__":
70-
main(attach_args().parse_args())
66+
main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())

examples/classifiers/content_type_example.py

Lines changed: 9 additions & 13 deletions
Original file line number · Diff line number · Diff line change
@@ -21,7 +21,7 @@
2121
from nemo_curator.utils.script_utils import ArgumentHelper
2222

2323

24-
def main(args):
24+
def main(args: argparse.Namespace) -> None:
2525
global_st = time.time()
2626

2727
# Input can be a string or list
@@ -32,9 +32,7 @@ def main(args):
3232
client_args["cluster_type"] = "gpu"
3333
client = get_client(**client_args)
3434

35-
input_dataset = DocumentDataset.read_json(
36-
input_file_path, backend="cudf", add_filename=True
37-
)
35+
input_dataset = DocumentDataset.read_json(input_file_path, backend="cudf", add_filename=True)
3836

3937
content_type_classifier = ContentTypeClassifier(filter_by=["Blogs", "News"])
4038
result_dataset = content_type_classifier(dataset=input_dataset)
@@ -43,23 +41,21 @@ def main(args):
4341

4442
global_et = time.time()
4543
print(
46-
f"Total time taken for content type classifier inference: {global_et-global_st} s",
44+
f"Total time taken for content type classifier inference: {global_et - global_st} s",
4745
flush=True,
4846
)
4947

5048
client.close()
5149

5250

5351
def attach_args(
54-
parser=argparse.ArgumentParser(
55-
formatter_class=argparse.ArgumentDefaultsHelpFormatter
56-
),
57-
):
58-
argumentHelper = ArgumentHelper(parser)
59-
argumentHelper.add_distributed_classifier_cluster_args()
52+
parser: argparse.ArgumentParser,
53+
) -> argparse.ArgumentParser:
54+
arg_helper = ArgumentHelper(parser)
55+
arg_helper.add_distributed_classifier_cluster_args()
6056

61-
return argumentHelper.parser
57+
return arg_helper.parser
6258

6359

6460
if __name__ == "__main__":
65-
main(attach_args().parse_args())
61+
main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())

0 commit comments

Comments (0)