NVIDIA-NeMo
diff --git a/‎.github/workflows/ruff.yml‎
Lines changed: 11 additions & 0 deletions b/‎.github/workflows/ruff.yml‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 9 additions & 15 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 9 additions & 15 deletions
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 1 addition & 1 deletion b/‎CONTRIBUTING.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md‎
Lines changed: 1 addition & 0 deletions b/‎README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎conftest.py‎
Lines changed: 6 additions & 6 deletions b/‎conftest.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎examples/async_llm_pii_redaction.py‎
Lines changed: 2 additions & 3 deletions b/‎examples/async_llm_pii_redaction.py‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎examples/blend_and_shuffle.py‎
Lines changed: 5 additions & 7 deletions b/‎examples/blend_and_shuffle.py‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎examples/classifier_filtering.py‎
Lines changed: 11 additions & 21 deletions b/‎examples/classifier_filtering.py‎
Lines changed: 11 additions & 21 deletions
diff --git a/‎examples/classifiers/aegis_example.py‎
Lines changed: 10 additions & 14 deletions b/‎examples/classifiers/aegis_example.py‎
Lines changed: 10 additions & 14 deletions
diff --git a/‎examples/classifiers/content_type_example.py‎
Lines changed: 9 additions & 13 deletions b/‎examples/classifiers/content_type_example.py‎
Lines changed: 9 additions & 13 deletions
@@ -0,0 +1,11 @@
+name: Ruff Linter
+on: [push, pull_request]
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: astral-sh/ruff-action@v3
+        with:
+          version: 0.11.4
+          args: "check --output-format=github"
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@ default_install_hook_types: [pre-commit, commit-msg]
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
     hooks:
       - id: check-added-large-files
         args: ['--maxkb=1000']
@@ -35,20 +35,14 @@ repos:
         exclude: docs/
       - id: requirements-txt-fixer
       - id: trailing-whitespace
-        exclude: nemo_curator/utils/aegis_utils.py
-
-  - repo: https://github.com/psf/black
-    rev: 24.4.2
-    hooks:
-      - id: black
-        name: Format code
-
-  - repo: https://github.com/PyCQA/isort
-    rev: 5.13.2
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.4
     hooks:
-      - id: isort
-        name: Format imports
-        exclude: docs/|nemo_curator/modules/__init__.py
+      # Run the linter
+      - id: ruff
+        args: [--fix]  # Enables auto-fixing lint errors
+      # Run the formatter
+      - id: ruff-format
 
   - repo: local
     hooks:
 
@@ -45,7 +45,7 @@ The documentation should also cover potential pitfalls and performance considera
 The existing examples and documentation should serve as a good reference to what is expected.
 
 ## Python style
-We use ``black`` as our style guide. To fix your format run `pip install pre-commit && pre-commit install && pre-commit run --all`.
+We use ``ruff`` for linting and formatting. To fix formatting issues, run `pre-commit install && pre-commit run --all-files`.
 
 1. Include docstrings for every class and method exposed to the user.
 1. Avoid wild import: ``from X import *`` unless in ``X.py``, ``__all__`` is defined.
 
@@ -110,6 +110,7 @@ These extras are available for all installation methods provided.
 
 ```bash
 pip install nemo-curator # Installs CPU-only text curation modules
+pip install nemo-curator[dev] # Installs libraries required for development
 pip install --extra-index-url https://pypi.nvidia.com nemo-curator[cuda12x] # Installs CPU + GPU text curation modules
 pip install --extra-index-url https://pypi.nvidia.com nemo-curator[image] # Installs CPU + GPU text and image curation modules
 pip install --extra-index-url https://pypi.nvidia.com nemo-curator[all] # Installs all of the above
 
@@ -1,3 +1,5 @@
+from collections.abc import Generator
+
 import pytest
 from dask.distributed import Client
 
@@ -8,13 +10,11 @@
 LocalCUDACluster = gpu_only_import_from("dask_cuda", "LocalCUDACluster")
 
 
-def pytest_addoption(parser):
-    parser.addoption(
-        "--cpu", action="store_true", default=False, help="Run tests without gpu marker"
-    )
+def pytest_addoption(parser: pytest.Parser) -> None:
+    parser.addoption("--cpu", action="store_true", default=False, help="Run tests without gpu marker")
 
 
-def pytest_collection_modifyitems(config, items):
+def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:
     if config.getoption("--cpu"):
         skip_gpu = pytest.mark.skip(reason="Skipping GPU tests")
         for item in items:
@@ -23,7 +23,7 @@ def pytest_collection_modifyitems(config, items):
 
 
 @pytest.fixture(autouse=True, scope="session")
-def gpu_client(request):
+def gpu_client(request: pytest.FixtureRequest) -> Generator[Client, None, None]:
     if not request.config.getoption("--cpu"):
         with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client:
             request.session.client = client
 
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import dask.dataframe
 import pandas as pd
 
 from nemo_curator.datasets import DocumentDataset
@@ -21,8 +20,8 @@
 from nemo_curator.utils.distributed_utils import get_client
 
 
-def console_script():
-    _ = get_client()
+def console_script() -> None:
+    client = get_client()  # noqa: F841
 
     dataframe = pd.DataFrame(
         {
 
@@ -20,15 +20,15 @@
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
-def main(args):
+def main(args: argparse.Namespace) -> None:
     # Params
     dataset_paths = ["/path/to/first", "/path/to/second", "/path/to/third"]
     dataset_weights = [5.0, 2.0, 1.0]
     target_size = 1000
     output_path = "/path/to/output"
 
     # Set up Dask client
-    client = get_client(**ArgumentHelper.parse_client_args(args))
+    client = get_client(**ArgumentHelper.parse_client_args(args))  # noqa: F841
 
     # Blend the datasets
     datasets = [DocumentDataset.read_json(path) for path in dataset_paths]
@@ -42,12 +42,10 @@ def main(args):
 
 
 def attach_args(
-    parser=argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    ),
-):
+    parser: argparse.ArgumentParser,
+) -> argparse.ArgumentParser:
     return ArgumentHelper(parser).add_distributed_args()
 
 
 if __name__ == "__main__":
-    main(attach_args().parse_args())
+    main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())
@@ -26,27 +26,23 @@
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
-def load_dataset(input_data_dir):
+def load_dataset(input_data_dir: str) -> DocumentDataset:
     files = list(get_all_files_paths_under(input_data_dir, keep_extensions="jsonl"))
     raw_data = read_data(files, file_type="jsonl", backend="pandas", add_filename=True)
-    dataset = DocumentDataset(raw_data)
+    return DocumentDataset(raw_data)
 
-    return dataset
 
-
-def create_samples(data_path, label, num_samples):
+def create_samples(data_path: str, label: str, num_samples: int) -> list[str]:
     raw_dataset = load_dataset(data_path)
     label_quality = nc.Modify(FastTextLabelModifier(label))
 
     labeled_dataset = label_quality(raw_dataset)
-    labeled_samples = labeled_dataset.df.sample(
-        frac=num_samples / len(labeled_dataset.df)
-    )
+    labeled_samples = labeled_dataset.df.sample(frac=num_samples / len(labeled_dataset.df))
 
     return labeled_samples["text"].compute().values.tolist()
 
 
-def main(args):
+def main(args: argparse.Namespace) -> None:
     # Params
     low_quality_data_path = "/path/to/low_quality"
     high_quality_data_path = "/path/to/high_quality"
@@ -55,13 +51,9 @@ def main(args):
     filtered_output = "/path/to/output"
 
     # Prepare samples for the classifier
-    client = get_client(**ArgumentHelper.parse_client_args(args))
-    low_quality_samples = create_samples(
-        low_quality_data_path, "__label__lq", num_low_quality_samples
-    )
-    high_quality_samples = create_samples(
-        high_quality_data_path, "__label__hq", num_high_quality_samples
-    )
+    client = get_client(**ArgumentHelper.parse_client_args(args))  # noqa: F841
+    low_quality_samples = create_samples(low_quality_data_path, "__label__lq", num_low_quality_samples)
+    high_quality_samples = create_samples(high_quality_data_path, "__label__hq", num_high_quality_samples)
 
     train_samples = low_quality_samples + high_quality_samples
     random.shuffle(train_samples)
@@ -96,12 +88,10 @@ def main(args):
 
 
 def attach_args(
-    parser=argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    ),
-):
+    parser: argparse.ArgumentParser,
+) -> argparse.ArgumentParser:
     return ArgumentHelper(parser).add_distributed_args()
 
 
 if __name__ == "__main__":
-    main(attach_args().parse_args())
+    main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())
@@ -21,21 +21,19 @@
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
-def main(args):
+def main(args: argparse.Namespace) -> None:
     global_st = time.time()
 
     # Input can be a string or list
     input_file_path = "/path/to/data"
     output_file_path = "./"
-    huggingface_token = "hf_1234"  # Replace with a HuggingFace user access token
+    huggingface_token = "hf_1234"  # Replace with a HuggingFace user access token  # noqa: S105
 
     client_args = ArgumentHelper.parse_client_args(args)
     client_args["cluster_type"] = "gpu"
     client = get_client(**client_args)
 
-    input_dataset = DocumentDataset.read_json(
-        input_file_path, backend="cudf", add_filename=True
-    )
+    input_dataset = DocumentDataset.read_json(input_file_path, backend="cudf", add_filename=True)
 
     safety_classifier = AegisClassifier(
         aegis_variant="nvidia/Aegis-AI-Content-Safety-LlamaGuard-Permissive-1.0",
@@ -48,23 +46,21 @@ def main(args):
 
     global_et = time.time()
     print(
-        f"Total time taken for AEGIS classifier inference: {global_et-global_st} s",
+        f"Total time taken for AEGIS classifier inference: {global_et - global_st} s",
         flush=True,
     )
 
     client.close()
 
 
 def attach_args(
-    parser=argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    ),
-):
-    argumentHelper = ArgumentHelper(parser)
-    argumentHelper.add_distributed_classifier_cluster_args()
+    parser: argparse.ArgumentParser,
+) -> argparse.ArgumentParser:
+    arg_helper = ArgumentHelper(parser)
+    arg_helper.add_distributed_classifier_cluster_args()
 
-    return argumentHelper.parser
+    return arg_helper.parser
 
 
 if __name__ == "__main__":
-    main(attach_args().parse_args())
+    main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())
@@ -21,7 +21,7 @@
 from nemo_curator.utils.script_utils import ArgumentHelper
 
 
-def main(args):
+def main(args: argparse.Namespace) -> None:
     global_st = time.time()
 
     # Input can be a string or list
@@ -32,9 +32,7 @@ def main(args):
     client_args["cluster_type"] = "gpu"
     client = get_client(**client_args)
 
-    input_dataset = DocumentDataset.read_json(
-        input_file_path, backend="cudf", add_filename=True
-    )
+    input_dataset = DocumentDataset.read_json(input_file_path, backend="cudf", add_filename=True)
 
     content_type_classifier = ContentTypeClassifier(filter_by=["Blogs", "News"])
     result_dataset = content_type_classifier(dataset=input_dataset)
@@ -43,23 +41,21 @@ def main(args):
 
     global_et = time.time()
     print(
-        f"Total time taken for content type classifier inference: {global_et-global_st} s",
+        f"Total time taken for content type classifier inference: {global_et - global_st} s",
         flush=True,
     )
 
     client.close()
 
 
 def attach_args(
-    parser=argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    ),
-):
-    argumentHelper = ArgumentHelper(parser)
-    argumentHelper.add_distributed_classifier_cluster_args()
+    parser: argparse.ArgumentParser,
+) -> argparse.ArgumentParser:
+    arg_helper = ArgumentHelper(parser)
+    arg_helper.add_distributed_classifier_cluster_args()
 
-    return argumentHelper.parser
+    return arg_helper.parser
 
 
 if __name__ == "__main__":
-    main(attach_args().parse_args())
+    main(attach_args(argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)).parse_args())