RichieHakim · RichieHakim · May 12, 2026 · May 12, 2026 · May 12, 2026 · May 12, 2026
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -3,12 +3,9 @@ name: build
 on:
   push:
     branches: [ "main" ]
-    # branches: [ "dev" ]
-    # branches: [ "main", "dev" ]
-  # pull_request:
-    # branches: [ "main" ]
-    # branches: [ "dev" ]    
-    # branches: [ "main", "dev" ]        
+  pull_request:
+    branches: [ "main" ]
+    types: [opened, synchronize, reopened, ready_for_review]
   workflow_dispatch:
     inputs:
       name:
@@ -23,33 +20,22 @@ jobs:
 
   build:
 
-    name: ${{ matrix.platform }}, py${{ matrix.python-version }}, ${{ matrix.install-level }}
+    name: ${{ matrix.platform }}, py${{ matrix.python-version }}
     runs-on: ${{ matrix.platform }}
     strategy:
       fail-fast: false
       matrix:
         platform: [
-          # ubuntu-latest,
-          ubuntu-24.04,
-          ubuntu-22.04,
-          ubuntu-20.04,
-          # # windows-latest,
-          windows-2022,
-          windows-2019,
-          # # macos-latest,
-          macos-14,
-          # macos-11.0,
-          # macos-10.15,
+          ubuntu-latest,
+          windows-latest,
+          macos-latest,
         ]
-        python-version: [ 
-          # "3.9",
+        python-version: [
           "3.10",
           "3.11",
           "3.12",
-        ]
-        install-level: [
-          system,
-          user,
+          "3.13",
+          "3.14",
         ]
 
     steps:
@@ -118,15 +104,10 @@ jobs:
         uname -a
 
 
-    - name: Install package with pip dependencies -- system-level
-      if: matrix.install-level == 'system'
+    - name: Install package with pip dependencies
       run: |
         ## install dependencies with optional extras
-        pip install -v -e .
-    - name: Install package with pip dependencies -- user-level
-      if: matrix.install-level == 'user'
-      run: |
-        pip install -v -e . --user
+        python -m pip install -v -e .
 
 
     - name: Check installed packages
@@ -136,8 +117,10 @@ jobs:
     - name: Run pytest and generate coverage report
       run: |
         # pip install tox tox-gh-actions
-        pip install pytest pytest-cov
-        python -m pytest --capture=tee-sys --cov=sparse_convolution --cov-report=xml:coverage.xml --color=yes
+        # Install numba so CI exercises the optional numba backends.
+        # Torch is intentionally omitted because it is not required for this CI job.
+        python -m pip install pytest pytest-cov numba
+        python -m pytest --capture=tee-sys -rs --cov=sparse_convolution --cov-report=xml:coverage.xml --color=yes
 
     - name: Upload coverage reports to Codecov
       uses: codecov/codecov-action@v4  ## this is a public action recognized by GitHub Actions

diff --git a/README.md b/README.md
@@ -53,10 +53,10 @@ Four methods, each with selectable backends:
 | `lazy` | yes | n/a | yes |
 | `gather_scatter` | yes | yes | yes |
 
-- **`direct`** (default): Batch-parallel scatter convolution with thread-local dense buffers (numba only). For each image in parallel, scatters kernel-weighted input values into an L2-cache-sized accumulator buffer, then extracts nonzeros into CSR format. Uses a two-phase approach: a lightweight boolean counting pass (1-byte flags, no float arithmetic) determines exact output sizes, then the scatter pass writes directly to right-sized arrays with zero waste. Interior pixels (~92-100%) skip bounds checking entirely via precomputed safe regions. O(nnz × K) per image with no init overhead. Fastest method across nearly all configurations. Requires `numba`.
+- **`direct`**: Batch-parallel scatter convolution with thread-local dense buffers (numba only). For each image in parallel, scatters kernel-weighted input values into an L2-cache-sized accumulator buffer, then extracts nonzeros into CSR format. Uses a two-phase approach: a lightweight boolean counting pass (1-byte flags, no float arithmetic) determines exact output sizes, then the scatter pass writes directly to right-sized arrays with zero waste. Interior pixels (~92-100%) skip bounds checking entirely via precomputed safe regions. O(nnz × K) per image with no init overhead. Fastest method across nearly all configurations. Requires `numba`.
 - **`precomputed`**: Builds a sparse Toeplitz matrix at init; fast batched matmul. Best for large batches with the same kernel when numba is not available.
 - **`lazy`**: COO broadcasting, no init cost. Best for very sparse inputs with small batches.
-- **`gather_scatter`**: Per-kernel-position scatter into a dense accumulator. General-purpose method for sparse batched inputs.
+- **`gather_scatter`** (default): Per-kernel-position scatter into a dense accumulator. General-purpose method for sparse batched inputs. Uses `numba` automatically when available, and falls back to `numpy` otherwise.
 
 Backend selection:
 - **`numpy`**: scipy/numpy ops. Always available.
@@ -68,12 +68,12 @@ conv = sc.Toeplitz_convolution2d(
     x_shape=(100, 100),
     k=k,
     mode='same',
-    method='direct',       # default
-    backend='numba',       # auto-selected for direct
+    method='gather_scatter',  # default
+    backend=None,             # numba if installed, otherwise numpy
 )
 ```
 
-If `backend=None` (default), auto-selects `numba` for `direct` and `gather_scatter` (if installed), `numpy` otherwise.
+If `backend=None` (default), `gather_scatter` auto-selects `numba` when installed and falls back to `numpy` otherwise. Use `method='direct'` explicitly for the fastest numba-only implementation.
 
 ## References
 - Toeplitz convolution: [stackoverflow.com/a/51865516](https://stackoverflow.com/a/51865516), [alisaaalehi/convolution_as_multiplication](https://github.com/alisaaalehi/convolution_as_multiplication)

diff --git a/sparse_convolution/sparse_convolution.py b/sparse_convolution/sparse_convolution.py
@@ -110,8 +110,9 @@ class Toeplitz_convolution2d():
             * ``'direct'``: ``'numba'`` (only option)
             \\n
             If ``None``, auto-selects the best available backend:
-            ``'numba'`` for ``'gather_scatter'`` and ``'direct'`` (if
-            installed), ``'numpy'`` otherwise.
+            ``'numba'`` for ``'gather_scatter'`` (if installed), ``'numpy'``
+            otherwise. ``'direct'`` requires ``backend='numba'`` and numba to
+            be installed.
         max_buffer_bytes (int):
             Maximum memory (bytes) for the dense accumulator buffer used by
             ``'gather_scatter'``. Controls chunk size for batch processing.
@@ -144,7 +145,7 @@ def __init__(
         mode: str = 'same',
         dtype: Optional[np.dtype] = None,
         verbose: Union[bool, int] = False,
-        method: str = 'direct',
+        method: str = 'gather_scatter',
         max_buffer_bytes: int = 256 * 1024 * 1024,
         backend: Optional[str] = None,
         device: Optional[str] = None,

diff --git a/tests/test_unit.py b/tests/test_unit.py
@@ -118,3 +118,20 @@ def test_gather_scatter_numpy_all_zero_kernel_returns_empty_sparse_output():
     assert scipy.sparse.isspmatrix_csr(out)
     assert out.shape == x.shape
     assert out.nnz == 0
+
+
+def test_default_method_falls_back_without_numba(monkeypatch):
+    """Default construction should work in minimal scipy/numpy installs."""
+    import sparse_convolution.sparse_convolution as sc_module  # module whose HAS_NUMBA flag is patched below
+
+    monkeypatch.setattr(sc_module, "HAS_NUMBA", False)  # force the no-numba path for this test
+
+    x = np.array([[1.0, 0.0], [0.0, 2.0]])
+    k = np.array([[0.5]])  # 1x1 kernel, so the expected output is just 0.5 * x
+
+    conv = Toeplitz_convolution2d(x_shape=x.shape, k=k, mode="same")  # method/backend left at their defaults on purpose
+    out = conv(x, batching=False)  # NOTE(review): presumably runs x as a single 2D image -- confirm
+
+    assert conv.method == "gather_scatter"
+    assert conv.backend == "numpy"  # expected fallback with HAS_NUMBA patched to False
+    assert np.allclose(out, scipy.signal.convolve2d(x, k, mode="same"))  # scipy's dense convolution is the reference