Skip to content

Commit 003bf92

Browse files
authored
Make threads the default executor (#621)
* Make `threads` the default executor
* Fix chunking in store tests and run on multiple executors
* Use single-threaded executor in test_default_spec_config_override to avoid memory error
* Don't measure peak mem with threads executor
1 parent 4d79a26 commit 003bf92

File tree

6 files changed

+60
-23
lines changed

6 files changed

+60
-23
lines changed

cubed/core/array.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -274,9 +274,9 @@ def compute(
274274
if executor is None:
275275
executor = arrays[0].spec.executor
276276
if executor is None:
277-
from cubed.runtime.executors.local import SingleThreadedExecutor
277+
from cubed.runtime.executors.local import ThreadsExecutor
278278

279-
executor = SingleThreadedExecutor()
279+
executor = ThreadsExecutor()
280280

281281
_return_in_memory_array = kwargs.pop("_return_in_memory_array", True)
282282
plan.execute(

cubed/runtime/executors/local.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from cubed.runtime.types import Callback, CubedPipeline, DagExecutor, TaskEndEvent
1919
from cubed.runtime.utils import (
2020
execution_stats,
21+
execution_timing,
2122
handle_callbacks,
2223
handle_operation_start_callbacks,
2324
profile_memray,
@@ -61,9 +62,14 @@ def execute_dag(
6162
[callback.on_task_end(event) for callback in callbacks]
6263

6364

65+
@execution_timing
66+
def run_func_threads(input, func=None, config=None, name=None, compute_id=None):
67+
return func(input, config=config)
68+
69+
6470
@profile_memray
6571
@execution_stats
66-
def run_func(input, func=None, config=None, name=None, compute_id=None):
72+
def run_func_processes(input, func=None, config=None, name=None, compute_id=None):
6773
return func(input, config=config)
6874

6975

@@ -142,7 +148,11 @@ def create_futures_func_multiprocessing(input, **kwargs):
142148

143149

144150
def pipeline_to_stream(
145-
concurrent_executor: Executor, name: str, pipeline: CubedPipeline, **kwargs
151+
concurrent_executor: Executor,
152+
run_func: Callable,
153+
name: str,
154+
pipeline: CubedPipeline,
155+
**kwargs,
146156
) -> Stream:
147157
return stream.iterate(
148158
map_unordered(
@@ -200,15 +210,17 @@ async def async_execute_dag(
200210
mp_context=context,
201211
max_tasks_per_child=max_tasks_per_child,
202212
)
213+
run_func = run_func_processes
203214
else:
204215
concurrent_executor = ThreadPoolExecutor(max_workers=max_workers)
216+
run_func = run_func_threads
205217
try:
206218
if not compute_arrays_in_parallel:
207219
# run one pipeline at a time
208220
for name, node in visit_nodes(dag, resume=resume):
209221
handle_operation_start_callbacks(callbacks, name)
210222
st = pipeline_to_stream(
211-
concurrent_executor, name, node["pipeline"], **kwargs
223+
concurrent_executor, run_func, name, node["pipeline"], **kwargs
212224
)
213225
async with st.stream() as streamer:
214226
async for _, stats in streamer:
@@ -218,7 +230,7 @@ async def async_execute_dag(
218230
# run pipelines in the same topological generation in parallel by merging their streams
219231
streams = [
220232
pipeline_to_stream(
221-
concurrent_executor, name, node["pipeline"], **kwargs
233+
concurrent_executor, run_func, name, node["pipeline"], **kwargs
222234
)
223235
for name, node in gen
224236
]

cubed/runtime/utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,33 @@ def execute_with_stats(function, *args, **kwargs):
4040
)
4141

4242

43+
def execute_with_timing(function, *args, **kwargs):
44+
"""Invoke function and measure timing information.
45+
46+
Returns the result of the function call and a stats dictionary.
47+
"""
48+
49+
function_start_tstamp = time.time()
50+
result = function(*args, **kwargs)
51+
function_end_tstamp = time.time()
52+
return result, dict(
53+
function_start_tstamp=function_start_tstamp,
54+
function_end_tstamp=function_end_tstamp,
55+
)
56+
57+
4358
def execution_stats(func):
4459
"""Decorator to measure timing information and peak memory usage of a function call."""
4560

4661
return partial(execute_with_stats, func)
4762

4863

64+
def execution_timing(func):
65+
"""Decorator to measure timing information of a function call."""
66+
67+
return partial(execute_with_timing, func)
68+
69+
4970
def execute_with_memray(function, input, **kwargs):
5071
# only run memray if installed, and only for first input (for operations that run on block locations)
5172
if (

cubed/tests/test_core.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -127,43 +127,43 @@ def test_from_zarr(tmp_path, spec, executor, path):
127127
)
128128

129129

130-
def test_store(tmp_path, spec):
130+
def test_store(tmp_path, spec, executor):
131131
a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
132132

133133
store = tmp_path / "source.zarr"
134-
target = zarr.empty(a.shape, store=store)
134+
target = zarr.empty(a.shape, chunks=a.chunksize, store=store)
135135

136-
cubed.store(a, target)
136+
cubed.store(a, target, executor=executor)
137137
assert_array_equal(target[:], np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
138138

139139

140-
def test_store_multiple(tmp_path, spec):
140+
def test_store_multiple(tmp_path, spec, executor):
141141
a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
142142
b = xp.asarray([[1, 1, 1], [1, 1, 1], [1, 1, 1]], chunks=(2, 2), spec=spec)
143143

144144
store1 = tmp_path / "source1.zarr"
145-
target1 = zarr.empty(a.shape, store=store1)
145+
target1 = zarr.empty(a.shape, chunks=a.chunksize, store=store1)
146146
store2 = tmp_path / "source2.zarr"
147-
target2 = zarr.empty(b.shape, store=store2)
147+
target2 = zarr.empty(b.shape, chunks=b.chunksize, store=store2)
148148

149-
cubed.store([a, b], [target1, target2])
149+
cubed.store([a, b], [target1, target2], executor=executor)
150150
assert_array_equal(target1[:], np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
151151
assert_array_equal(target2[:], np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]]))
152152

153153

154-
def test_store_fails(tmp_path, spec):
154+
def test_store_fails(tmp_path, spec, executor):
155155
a = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
156156
b = xp.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]], chunks=(2, 2), spec=spec)
157157
store = tmp_path / "source.zarr"
158-
target = zarr.empty(a.shape, store=store)
158+
target = zarr.empty(a.shape, chunks=a.chunksize, store=store)
159159

160160
with pytest.raises(
161161
ValueError, match=r"Different number of sources \(2\) and targets \(1\)"
162162
):
163-
cubed.store([a, b], [target])
163+
cubed.store([a, b], [target], executor=executor)
164164

165165
with pytest.raises(ValueError, match="All sources must be cubed array objects"):
166-
cubed.store([1], [target])
166+
cubed.store([1], [target], executor=executor)
167167

168168

169169
@pytest.mark.parametrize("path", [None, "sub", "sub/group"])
@@ -370,7 +370,9 @@ def test_default_spec_config_override():
370370
# override default spec to increase allowed_mem
371371
from cubed import config
372372

373-
with config.set({"spec.allowed_mem": "4GB"}):
373+
with config.set(
374+
{"spec.allowed_mem": "4GB", "spec.executor_name": "single-threaded"}
375+
):
374376
a = xp.ones((20000, 10000), chunks=(10000, 10000))
375377
b = xp.negative(a)
376378
assert_array_equal(b.compute(), -np.ones((20000, 10000)))

docs/configuration.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,9 @@ These properties can be passed directly to the {py:class}`Spec <cubed.Spec>` con
9595
| Property | Default | Description |
9696
|--------------------|-------------------|-----------------------------------------------------------------------------------------------------------------------------------------|
9797
| `work_dir` | `None` | The directory path (specified as an fsspec URL) used for storing intermediate data. If not set, the user's temporary directory is used. |
98-
| `allowed_mem` | `"2GB"` | The total memory available to a worker for running a task. This includes any `reserved_mem` that has been set. |
99-
| `reserved_mem` | `"100MB"` | The memory reserved on a worker for non-data use when running a task |
100-
| `executor_name` | `"single-threaded"` | The executor for running computations. One of `"single-threaded"`, `"threads"`, `"processes"`, `"beam"`, `"coiled"`, `"dask"`, `"lithops"`, `"modal"`. |
98+
| `allowed_mem` | `"2GB"` | The total memory available to a worker for running a task. This includes any `reserved_mem` that has been set. |
99+
| `reserved_mem` | `"100MB"` | The memory reserved on a worker for non-data use when running a task |
100+
| `executor_name` | `"threads"` | The executor for running computations. One of `"single-threaded"`, `"threads"`, `"processes"`, `"beam"`, `"coiled"`, `"dask"`, `"lithops"`, `"modal"`. |
101101
| `executor_options` | `None` | Options to pass to the executor on construction. See below for possible options for each executor. |
102102
| `zarr_compressor` | `"default"`| The compressor used by Zarr for intermediate data. If not specified, or set to `"default"`, Zarr will use the default Blosc compressor. If set to `None`, compression is disabled, which can be a good option when using local storage. Use a dictionary (or nested YAML) to configure arbitrary compression using Numcodecs. |
103103

docs/user-guide/executors.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,11 @@ Cubed provides a variety of executors for running the tasks in a computation, wh
66

77
## Local single-machine executors
88

9-
If you don't specify an executor then the local in-process single-threaded Python executor is used. This is a very simple executor (called `single-threaded`) that is intended for testing on small amounts of data before running larger computations using the `processes` executor on a single machine, or a distributed executor in the cloud.
9+
If you don't specify an executor, then the local in-process multi-threaded Python executor is used by default. This is called the `threads` executor. It doesn't require any setup, so it is useful for quickly getting started and for running on datasets that don't fit in memory but can fit on a single machine's disk.
1010

11-
The `processes` executor runs on a single machine, and uses all the cores on the machine. It doesn't require any set up so it is useful for quickly getting started and running on datasets that don't fit in memory, but can fit on a single machine's disk.
11+
The `processes` executor also runs on a single machine, and uses all the cores on the machine. However, unlike the `threads` executor, each task runs in a separate process, which avoids GIL contention but adds some overhead in process startup time and communication. Typically, running with `processes` is more performant than `threads`, but it is worth trying both on your workload to see which is best.
12+
13+
There is a third local executor called `single-threaded` that runs tasks sequentially in a single thread, and is intended for testing on small amounts of data.
1214

1315
## Which cloud service executor should I use?
1416

0 commit comments

Comments (0)