Commit c081542

[data] Split out long running scaling test (#54045)
## Why are these changes needed?

Test `test_arrow_block` has become flaky, often failing because it times out on `test_arrow_batch_gt_2gb`, which is a scaling test that checks whether the Arrow code works with a single 2 GB batch. This splits that test out into its own suite to see if that reduces the likelihood of a timeout (the limit should be 180s). If this does not work, the next step will be to try running it on a larger worker.

## Related issue number

<!-- For example: "Closes #1234" -->

## Checks

- [ ] I've signed off every commit (by using the -s flag, i.e., `git commit -s`) in this PR.
- [ ] I've run `scripts/format.sh` to lint the changes in this PR.
- [ ] I've included any doc changes needed for https://docs.ray.io/en/master/.
- [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file.
- [ ] I've made sure the tests are passing. Note that there might be a few flaky tests; see the recent failures at https://flakey-tests.ray.io/
- Testing Strategy
  - [ ] Unit tests
  - [ ] Release tests
  - [ ] This PR is not tested :(

---------

Signed-off-by: Matthew Owen <[email protected]>
1 parent 980c81d commit c081542
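The new test file ends with a `__main__` guard that delegates to pytest, so the split-out suite can also be invoked directly. A minimal sketch, assuming the repository-relative path listed in the new BUILD entry below:

    # Minimal sketch: run only the split-out scaling test via pytest.
    # The path is assumed from the srcs entry in python/ray/data/BUILD;
    # adjust it if running from a different working directory.
    import sys

    import pytest

    if __name__ == "__main__":
        sys.exit(pytest.main(["-v", "python/ray/data/tests/test_arrow_block_scaling.py"]))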

3 files changed (+123 additions, -94 deletions)

python/ray/data/BUILD

Lines changed: 15 additions & 0 deletions
@@ -184,6 +184,21 @@ py_test(
     ],
 )
 
+py_test(
+    name = "test_arrow_block_scaling",
+    size = "large",
+    srcs = ["tests/test_arrow_block_scaling.py"],
+    tags = [
+        "data_non_parallel",
+        "exclusive",
+        "team:data",
+    ],
+    deps = [
+        ":conftest",
+        "//:ray_lib",
+    ],
+)
+
 py_test(
     name = "test_auto_parallelism",
     size = "medium",

python/ray/data/tests/test_arrow_block.py

Lines changed: 0 additions & 94 deletions
@@ -1,4 +1,3 @@
-import gc
 import os
 import sys
 import types
@@ -9,7 +8,6 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
-from pyarrow import parquet as pq
 
 import ray
 from ray._private.test_utils import run_string_as_driver
@@ -147,39 +145,6 @@ def test_to_pylist(self, arr, as_py):
         assert accessor.to_pylist() == arr.to_pylist()
 
 
-@pytest.fixture(scope="module")
-def parquet_dataset_single_column_gt_2gb():
-    chunk_size = 256 * MiB
-    num_chunks = 10
-
-    total_column_size = chunk_size * 10  # ~2.5 GiB
-
-    with TemporaryDirectory() as tmp_dir:
-        dataset_path = f"{tmp_dir}/large_parquet_chunk_{chunk_size}"
-
-        # Create directory
-        os.mkdir(dataset_path)
-
-        for i in range(num_chunks):
-            chunk = b"a" * chunk_size
-
-            d = {"id": [i], "bin": [chunk]}
-            t = pa.Table.from_pydict(d)
-
-            print(f">>> Table schema: {t.schema} (size={sys.getsizeof(t)})")
-
-            filepath = f"{dataset_path}/chunk_{i}.parquet"
-            pq.write_table(t, filepath)
-
-            print(f">>> Created a chunk #{i}")
-
-        print(f">>> Created dataset at {dataset_path}")
-
-        yield dataset_path, num_chunks, total_column_size
-
-        print(f">>> Cleaning up dataset at {dataset_path}")
-
-
 @pytest.fixture(scope="module")
 def binary_dataset_single_file_gt_2gb():
     total_size = int(2.1 * GiB)
@@ -243,65 +208,6 @@ def _id(row):
     assert total == 1
 
 
-@pytest.mark.parametrize(
-    "op",
-    [
-        "map",
-        "map_batches",
-    ],
-)
-def test_arrow_batch_gt_2gb(
-    ray_start_regular,
-    parquet_dataset_single_column_gt_2gb,
-    restore_data_context,
-    op,
-):
-    # Disable (automatic) fallback to `ArrowPythonObjectType` extension type
-    DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False
-
-    dataset_path, num_rows, total_column_size = parquet_dataset_single_column_gt_2gb
-
-    def _id(x):
-        return x
-
-    ds = ray.data.read_parquet(dataset_path)
-
-    if op == "map":
-        ds = ds.map(_id)
-    elif op == "map_batches":
-        # Combine all rows into a single batch using `map_batches` coercing to
-        # numpy format
-        ds = ds.map_batches(
-            _id,
-            batch_format="numpy",
-            batch_size=num_rows,
-            zero_copy_batch=False,
-        )
-
-    batch = ds.take_batch()
-
-    total_binary_column_size = sum([len(b) for b in batch["bin"]])
-
-    print(
-        f">>> Batch:\n"
-        f"------\n"
-        "Column: 'id'\n"
-        f"Values: {batch['id']}\n"
-        f"------\n"
-        "Column: 'bin'\n"
-        f"Total: {total_binary_column_size / GiB} GiB\n"
-        f"Values: {[str(v)[:3] + ' x ' + str(len(v)) for v in batch['bin']]}\n"
-    )
-
-    assert total_binary_column_size == total_column_size
-
-    # Clean up refs
-    del batch
-    del ds
-    # Force GC to free up object store memory
-    gc.collect()
-
-
 @pytest.mark.parametrize(
     "input_,expected_output",
     [

python/ray/data/tests/test_arrow_block_scaling.py

Lines changed: 108 additions & 0 deletions

@@ -0,0 +1,108 @@
+import gc
+import os
+import sys
+from tempfile import TemporaryDirectory
+
+import pyarrow as pa
+import pytest
+from pyarrow import parquet as pq
+
+import ray
+from ray.data import DataContext
+from ray.data._internal.util import GiB, MiB
+
+
+@pytest.fixture(scope="module")
+def parquet_dataset_single_column_gt_2gb():
+    chunk_size = 256 * MiB
+    num_chunks = 10
+
+    total_column_size = chunk_size * 10  # ~2.5 GiB
+
+    with TemporaryDirectory() as tmp_dir:
+        dataset_path = f"{tmp_dir}/large_parquet_chunk_{chunk_size}"
+
+        # Create directory
+        os.mkdir(dataset_path)
+
+        for i in range(num_chunks):
+            chunk = b"a" * chunk_size
+
+            d = {"id": [i], "bin": [chunk]}
+            t = pa.Table.from_pydict(d)
+
+            print(f">>> Table schema: {t.schema} (size={sys.getsizeof(t)})")
+
+            filepath = f"{dataset_path}/chunk_{i}.parquet"
+            pq.write_table(t, filepath)
+
+            print(f">>> Created a chunk #{i}")
+
+        print(f">>> Created dataset at {dataset_path}")
+
+        yield dataset_path, num_chunks, total_column_size
+
+        print(f">>> Cleaning up dataset at {dataset_path}")
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "map",
+        "map_batches",
+    ],
+)
+def test_arrow_batch_gt_2gb(
+    ray_start_regular,
+    parquet_dataset_single_column_gt_2gb,
+    restore_data_context,
+    op,
+):
+    # Disable (automatic) fallback to `ArrowPythonObjectType` extension type
+    DataContext.get_current().enable_fallback_to_arrow_object_ext_type = False
+
+    dataset_path, num_rows, total_column_size = parquet_dataset_single_column_gt_2gb
+
+    def _id(x):
+        return x
+
+    ds = ray.data.read_parquet(dataset_path)
+
+    if op == "map":
+        ds = ds.map(_id)
+    elif op == "map_batches":
+        # Combine all rows into a single batch using `map_batches` coercing to
+        # numpy format
+        ds = ds.map_batches(
+            _id,
+            batch_format="numpy",
+            batch_size=num_rows,
+            zero_copy_batch=False,
+        )
+
+    batch = ds.take_batch()
+
+    total_binary_column_size = sum([len(b) for b in batch["bin"]])
+
+    print(
+        f">>> Batch:\n"
+        f"------\n"
+        "Column: 'id'\n"
+        f"Values: {batch['id']}\n"
+        f"------\n"
+        "Column: 'bin'\n"
+        f"Total: {total_binary_column_size / GiB} GiB\n"
+        f"Values: {[str(v)[:3] + ' x ' + str(len(v)) for v in batch['bin']]}\n"
+    )
+
+    assert total_binary_column_size == total_column_size
+
+    # Clean up refs
+    del batch
+    del ds
+    # Force GC to free up object store memory
+    gc.collect()
+
+
+if __name__ == "__main__":
+    sys.exit(pytest.main(["-v", __file__]))

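As a quick sanity check of the fixture's sizing above (a hedged sketch; `MiB` and `GiB` are assumed to be the standard binary units, as in `ray.data._internal.util`), ten 256 MiB chunks add up to 2.5 GiB of binary data, comfortably above the 2 GiB single-batch threshold the test targets:

    # Hedged arithmetic check of the fixture sizing shown in the diff above.
    MiB = 1024 * 1024
    GiB = 1024 * MiB

    chunk_size = 256 * MiB
    num_chunks = 10
    total_column_size = chunk_size * num_chunks

    assert total_column_size == 2.5 * GiB  # ~2.5 GiB in the "bin" column
    assert total_column_size > 2 * GiB     # exceeds the 2 GiB single-batch threshold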