diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c4bb75133..09eea5193 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -31,7 +31,7 @@ jobs:
     - name: Install Python Dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest
+        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm
         python -m pip install -e . -q
 
     - name: Build
@@ -113,7 +113,7 @@ jobs:
     - name: Install Python Dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest
+        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm
         python -m pip install -e . -q
 
     - name: Build
@@ -159,7 +159,7 @@ jobs:
     - name: Install Python Dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest
+        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm
         python -m pip install -e . -q
 
     - name: Build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bde0b9107..7ae762e7e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -806,6 +806,7 @@ list(APPEND qgd_files
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/CR.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/Adaptive.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/R.cpp
+    ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/Permutation.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/kernels/apply_kernel_to_input.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/kernels/apply_kernel_to_state_vector_input.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/kernels/apply_large_kernel_to_input.cpp
@@ -1040,6 +1041,12 @@ add_subdirectory (squander/VQA)
 
 add_subdirectory (squander/src-cpp/density_matrix)
 
+# ===================================================================
+# SABRE Router Module
+# ===================================================================
+
+add_subdirectory (squander/src-cpp/sabre_router)
+
 if(DEFINED ENV{QGD_CTEST})
     # adding CMAKE files for executables
     add_subdirectory (test_standalone)
diff --git a/conda_env_example.yaml b/conda_env_example.yaml
index 12bde7316..b09ebd8f5 100644
--- a/conda_env_example.yaml
+++ b/conda_env_example.yaml
@@ -16,6 +16,7 @@ dependencies:
   - numpy
   - scipy
   - tbb-devel
+  - pybind11
   - pip:
       - gurobipy
       - matplotlib
diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
new file mode 100644
index 000000000..d64fe596b
--- /dev/null
+++ b/examples/decomposition/PartAM_example.py
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 26 14:42:56 2020
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author: Peter Rakyta, Ph.D.
+"""
+## \file PartAM_example.py
+## \brief Simple example python code demonstrating Partition Aware Mapping
+
+import time
+import numpy as np
+
+from squander import Partition_Aware_Mapping
+from squander import utils
+from squander import Circuit
+from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+    qgd_Wide_Circuit_Optimization,
+)
+
+
+def make_linear_topology(n_qubits):  # couplers of an open chain: (0, 1), (1, 2), ...
+    return list(zip(range(n_qubits - 1), range(1, n_qubits)))
+
+
+def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
+    """Run both circuits on one seeded random state and return ``1 - |<psi|phi>|``."""
+    n_qubits = circ.get_Qbit_Num()
+    dim = 1 << n_qubits
+    rng = np.random.RandomState(0)  # fixed seed keeps the check reproducible
+    re_part = rng.uniform(-1, 1, (dim,))
+    im_part = rng.uniform(-1, 1, (dim,))
+    psi0 = re_part + 1j * im_part
+    psi0 /= np.linalg.norm(psi0)
+
+    reference = psi0.copy()
+    circ_orig.apply_to(parameters_orig, reference)
+
+    # invert output_perm so that inverse[output_perm[i]] == i
+    inverse = [0] * n_qubits
+    for src, dst in enumerate(output_perm):
+        inverse[dst] = src
+    wrapped = Circuit(n_qubits)
+    wrapped.add_Permutation([int(q) for q in input_perm])
+    wrapped.add_Circuit(circ)
+    wrapped.add_Permutation(inverse)
+
+    routed = psi0.copy()
+    wrapped.apply_to(params, routed)
+    return 1 - abs(np.vdot(routed, reference))
+
+
+if __name__ == '__main__':
+
+    filename = "bv_n14.qasm"  # relative path; presumably resolved against the current working directory
+
+    # load the circuit and its parameter array from the QASM file
+    circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
+    N = circ_orig.get_Qbit_Num()
+    topology = make_linear_topology(N)  # chain coupling map handed to PartAM below
+
+    initial_cnot = circ_orig.get_Gate_Nums().get('CNOT', 0)
+    print(f"Qubits: {N}, initial CNOTs: {initial_cnot}")
+
+    start_time = time.time()  # timed region: pre-cleanup + PartAM (stopped before validation)
+
+    # one-shot Wide Circuit Optimization (WCO) pass before PartAM, with no topology
+    # (topology=None, max_partition_size=3, part_size_end=4) to fuse trivially-mergeable blocks
+    pre_partam_cleanup_config = {
+        'strategy': 'TreeSearch',
+        'pre-opt-strategy': 'TreeSearch',
+        'partition_strategy': 'ilp',
+        'test_subcircuits': False,
+        'test_final_circuit': False,
+        'max_partition_size': 3,
+        'topology': None,
+        'verbosity': 0,
+        'tolerance': 1e-8,
+        'parallel': 0,
+        'part_size_end': 4,
+    }
+    wco = qgd_Wide_Circuit_Optimization(pre_partam_cleanup_config)
+    pre_partam_circ, pre_partam_params = wco.OptimizeWideCircuit(
+        circ_orig.get_Flat_Circuit(),
+        parameters_orig,
+    )
+    pre_partam_cleanup_cnot = pre_partam_circ.get_Gate_Nums().get('CNOT', 0)
+    print(f"PartAM input CNOTs after pre-cleanup: {pre_partam_cleanup_cnot}")
+
+    # PartAM config: maps the pre-cleaned circuit onto `topology`
+    config = {
+        'strategy': "TreeSearch",
+        'test_subcircuits': False,
+        'test_final_circuit': False,
+        'max_partition_size': 3,
+        'progressbar': False,
+        'topology': topology,
+        'verbosity': 0,
+        'cleanup': True,
+        'sabre_iterations': 20,
+        'n_layout_trials': 128,
+        'random_seed': 42,
+        # Cheap candidate prefilter applied before the full A* scoring.
+        'prefilter_top_k': 400,
+        'prefilter_min_per_partition': 2,
+        'prefilter_min_3q': 12,
+        # None: rank every layout trial by the actually constructed routing, not only
+        # by the heuristic trial cost.  QFT is sensitive to this cap.
+        'actual_routing_rank_top_k': None,
+        # Boundary-state beam routing runs in the C++ SABRE router.
+        'use_cpp_router': True,
+        'layout_boundary_beam_width': 4,
+        'layout_boundary_beam_depth': 3,
+        'boundary_beam_width': 4,
+        'boundary_beam_depth': 3,
+        'cnot_cost': 0.5 / 3.0,
+        'cleanup_top_k': 3,
+        'parallel_layout_trials': True,
+        'layout_trial_workers': 0,
+        'max_E_size': 40,
+        'max_lookahead': 6,
+        'E_weight': 0.3,
+        'E_alpha': 1.0,  # LightSABRE-style uniform lookahead (no per-depth decay)
+        'decay_delta': 0.001,
+        'swap_burst_budget': 0,
+        'path_tiebreak_weight': 0.2,
+        'three_qubit_exit_weight': 1.5,
+        'partition_weight_model': 'window_turnover',
+        'pack_credit_weight': 1.0,
+        'partition_chain_penalty_weight': 2.5,
+    }
+
+    # instantiate the Partition Aware Mapping object with the config above
+    pam = Partition_Aware_Mapping(config)
+
+    # run Partition Aware Mapping; also returns the input and output qubit permutations
+    circ, params, input_perm, output_perm = pam.Partition_Aware_Mapping(
+        pre_partam_circ.get_Flat_Circuit(), pre_partam_params
+    )
+
+    elapsed = time.time() - start_time
+
+    error = validate_result(  # compare the mapped circuit against the original (pre-WCO) circuit
+        circ_orig, parameters_orig, circ, params, input_perm, output_perm
+    )
+
+    print(f"CNOTs pre-cleanup: {pam._cnot_pre_cleanup}")  # NOTE(review): reads a private attribute
+    print(f"CNOTs post-cleanup: {circ.get_Gate_Nums().get('CNOT', 0)}")
+    print(f"Decomposition error: {error:.10f}")
+    print("--- %s seconds elapsed during optimization ---" % elapsed)
diff --git a/examples/decomposition/example_SABRE.py b/examples/decomposition/example_SABRE.py
index 9b990e17e..68a235259 100644
--- a/examples/decomposition/example_SABRE.py
+++ b/examples/decomposition/example_SABRE.py
@@ -1,14 +1,10 @@
 from squander import SABRE
 from squander import Qiskit_IO
 from squander import utils
+from squander import Circuit
 
 from qiskit import transpile
 from qiskit import QuantumCircuit
-from qiskit.circuit import CircuitInstruction
-from qiskit.circuit.library import PermutationGate
-from qiskit_aer import AerSimulator
-from qiskit.quantum_info import Operator
-from qiskit import QuantumRegister, ClassicalRegister
 import numpy as np
 parameters = np.array([])
 
@@ -39,13 +35,14 @@
 print("INITIAL CIRCUIT:")
 #print( circuit_qiskit )
 print("mapping (q -> Q):", pi)
-print("Final mapping:", final_pi)
 qubits = list(range(N))
-Qiskit_circuit = QuantumCircuit(N)
-pi_map = list(np.array(sabre.get_inverse_pi(pi)))
-Qiskit_circuit.append(CircuitInstruction( PermutationGate(pi_map),qubits))
-Qiskit_circuit &= Qiskit_IO.get_Qiskit_Circuit( Squander_remapped_circuit, parameters_remapped_circuit )
-Qiskit_circuit.append(CircuitInstruction( PermutationGate(list(final_pi)),qubits))
+pi_map = list(np.array(sabre.get_inverse_pi(final_pi)))
+print("Final mapping:", final_pi)
+final_circuit = Circuit(N)
+final_circuit.add_Permutation(list(pi)) 
+final_circuit.add_Circuit(Squander_remapped_circuit)
+final_circuit.add_Permutation(list(pi_map))
+Qiskit_circuit = Qiskit_IO.get_Qiskit_Circuit( final_circuit.get_Flat_Circuit(), parameters_remapped_circuit )
 print("CIRCUIT MAPPED WITH SABRE:")
 #print( Qiskit_circuit )
 print("SABRE SWAP COUNT:", swap_count)
@@ -61,27 +58,14 @@
 print("CIRCUIT MAPPED WITH QISKIT:")
 #print( Qiskit_circuit_mapped )
 print("QISKIT SWAP COUNT:",  dict(Qiskit_circuit_mapped.count_ops())['swap'])
-
-# test the generated squander circuits
-#matrix_size = 1 << Squander_initial_circuit.get_Qbit_Num()
-#unitary_squander_initial = utils.get_unitary_from_qiskit_circuit_operator(circuit_qiskit)
-
-#unitary_squander_remapped_circuit = np.eye( 1 << Squander_initial_circuit.get_Qbit_Num(), dtype=np.complex128 )
-#Squander_remapped_circuit.apply_to( parameters_remapped_circuit, unitary_squander_remapped_circuit)
-"""
-unitary_squander_remapped_circuit = utils.get_unitary_from_qiskit_circuit_operator(Qiskit_circuit)
-
-
-product_matrix = np.dot(unitary_squander_initial.conj().T, unitary_squander_remapped_circuit)
-phase = np.angle(product_matrix[0,0])
-product_matrix = product_matrix*np.exp(-1j*phase)
-
-    
-product_matrix = np.eye(matrix_size)*2 - product_matrix - product_matrix.conj().T
-
-# the error of the decomposition
-decomposition_error =  (np.real(np.trace(product_matrix)))/2
-       
-print('The error of the decomposition is ' + str(decomposition_error))
-
-"""
\ No newline at end of file
+num_qubits = final_circuit.get_Qbit_Num() 
+matrix_size = 1 << num_qubits 
+initial_state_real = np.random.uniform(-1.0,1.0, (matrix_size,) )
+initial_state_imag = np.random.uniform(-1.0,1.0, (matrix_size,) )
+initial_state = initial_state_real + initial_state_imag*1j
+initial_state = initial_state/np.linalg.norm(initial_state)
+original_state = initial_state.copy()
+Squander_initial_circuit.apply_to(parameters_initial,original_state)
+SABRE_state = initial_state.copy()
+final_circuit.apply_to(parameters_remapped_circuit,SABRE_state)
+print(f"ERROR: {1-abs(np.vdot(SABRE_state,original_state))}")
\ No newline at end of file
diff --git a/examples/decomposition/wide_circuit_optimization.py b/examples/decomposition/wide_circuit_optimization.py
index 32d9bde66..603a4688c 100644
--- a/examples/decomposition/wide_circuit_optimization.py
+++ b/examples/decomposition/wide_circuit_optimization.py
@@ -21,91 +21,63 @@
 ## \brief Simple example python code demonstrating a wide circuit optimization
 
 import squander.decomposition.qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization
-from squander.decomposition.qgd_Wide_Circuit_Optimization import CNOTGateCount
-from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+from squander import Partition_Aware_Mapping
 from squander import utils
 from squander import Qiskit_IO
-import time, requests, os, zipfile, tempfile
-from pathlib import Path
+import time
+from squander import Circuit
+import numpy as np
+from qiskit import transpile
+def generate_star_topology(num_qubits):  # hub qubit 0 coupled to each of 1 .. num_qubits-1
+    return list(zip([0] * (num_qubits - 1), range(1, num_qubits)))
+def extract_two_qubit_gate_count(gate_nums_dict):
 
+    """Sum the counts stored in ``gate_nums_dict`` for the gate names below (absent names add 0)."""
+    # NOTE(review): CSWAP acts on three qubits yet is counted here -- confirm this is intended.
+    counted_gates = ('CNOT', 'CZ', 'CU', 'CH', 'SYC', 'CRY', 'CRZ', 'CRX', 'CP', 'SWAP', 'CSWAP')
+    return sum(
+        gate_nums_dict.get(gate_name, 0)
+        for gate_name in counted_gates
+    )
+if __name__ == '__main__':
 
-if __name__ == "__main__":
-
-    config = {
-        "strategy": "TreeSearch",  # possible values: "TreeSearch", "qiskit", "bqskit", "TabuSearch"
-        "test_subcircuits": False,
-        "test_final_circuit": False,
-        "max_partition_size": 3,
-        "beam": None,
-        "use_osr": True,
-        "use_graph_search": True,
-        "pre-opt-strategy": "TreeSearch",  # possible values: "TreeSearch", "qiskit", "bqskit", "TabuSearch"
-        "routing-strategy": "seqpam-ilp",  # possible values: "sabre", "light-sabre", "bqskit-sabre", "seqpam-quick", "seqpam-ilp"
-        "tolerance": 1e-10,
-        # **{'use_basin_hopping': True, 'bh_T': 1.1822334624366124, 'bh_stepsize': 0.9020671823381502, 'bh_interval': 165, 'bh_target_accept_rate': 0.7037812116166546, 'bh_stepwise_factor': 0.8254028860713254}
+    use_qiskit_sabre = False
+    config = {  
+            'strategy': "TreeSearch", 
+            'test_subcircuits': False,
+            'test_final_circuit': True,
+            'max_partition_size': 3,
+            'beam': 16,
+            "use_gl": True,
+            'tolerance': 1e-10,
     }
 
-    import os
-
-    files = [os.path.join(Path(__file__).resolve().parent, "bv_n14.qasm")]
+    filename = "benchmarks/qfast/5q/vqe.qasm"
+    start_time = time.time()
 
-    results = {}
-    for filename in files:
-        print(f"executing optimization of circuit: {filename}")
+    # load the circuit from a file
+    circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
+    N = circ_orig.get_Qbit_Num()
+    # instantiate the object for optimizing wide circuits
+    wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
 
-        # load the circuit from a file
-        circ, parameters, _ = utils.qasm_to_squander_circuit(filename)
-        config["topology"] = (
-            Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization.linear_topology(
-                circ.get_Qbit_Num()
-            )
-        )
+    # run circuit optimization
+    circ_flat, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ_orig, parameters_orig, True )
 
-        # run circuit optimization
-        wide_circuit_optimizer = (
-            Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization({**config})
-        )
-        start_time = time.time()
-        optcirc, optparameters = wide_circuit_optimizer.OptimizeWideCircuit(
-            circ, parameters
-        )
-        elapsed = time.time() - start_time
-        init_cnot_count = CNOTGateCount(circ, 0)
-        cnot_count, opt_time = CNOTGateCount(
-            optcirc, 0
-        ), wide_circuit_optimizer.config.get("optimization_time", None)
-        a2a_cnot_count, routed_cnot_count = None, None
-        a2a_time, routing_time = 0.0, 0.0
+    config['topology'] = generate_star_topology(N)
+    circo = Qiskit_IO.get_Qiskit_Circuit(circ_flat.get_Flat_Circuit(),parameters)
+    if use_qiskit_sabre:
+        coupling_map = [[i,j] for i,j in config['topology']]
+        circuit_qiskit_sabre = transpile(circo, coupling_map=coupling_map)
+        circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_sabre)
+        config['routed']= True
+        wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
+    else:
+        wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
+        # convert back to a Squander circuit; the optimization itself runs after this branch
+        circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circo)
+    circ, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters, True )
+    print(f"Two qubit gate count: {extract_two_qubit_gate_count(circ.get_Gate_Nums())}")
+    print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
 
-        if wide_circuit_optimizer.config.get("routed_circuit", None) is not None:
-            init_map, final_map = (
-                wide_circuit_optimizer.config["initial_mapping"],
-                wide_circuit_optimizer.config["final_mapping"],
-            )
-            a2acirc, a2aparams = (
-                wide_circuit_optimizer.config["all_to_all_circuit"],
-                wide_circuit_optimizer.config["all_to_all_parameters"],
-            )
-            routedcirc, routedparams = (
-                wide_circuit_optimizer.config["routed_circuit"],
-                wide_circuit_optimizer.config["routed_parameters"],
-            )
-            a2a_cnot_count = CNOTGateCount(a2acirc, 0)
-            routed_cnot_count = CNOTGateCount(routedcirc, 0)
-            a2a_time = wide_circuit_optimizer.config.get(
-                "all_to_all_optimization_time", None
-            )
-            routing_time = wide_circuit_optimizer.config.get("routing_time", None)
-        results[os.path.basename(filename)] = (
-            (init_cnot_count, a2a_cnot_count, routed_cnot_count, cnot_count),
-            (a2a_time, routing_time, opt_time, elapsed),
-        )
-        wide_circuit_optimizer.check_compare_circuits(
-            circ, optparameters, optcirc, optparameters, routing=True
-        )
-        with open("results.txt", "a") as f:
-            f.write(
-                f"{os.path.basename(filename)}: {config['pre-opt-strategy']}, {config['routing-strategy']}, {config['strategy']} CNOT count = {init_cnot_count, a2a_cnot_count, routed_cnot_count, cnot_count}, elapsed time = {a2a_time:.2f} + {routing_time:.2f} + {opt_time:.2f} = {elapsed:.2f} seconds\n"
-            )
 
-        print("--- %s seconds elapsed during optimization ---" % elapsed)
diff --git a/pyproject.toml b/pyproject.toml
index 3ec14e77b..1721b5129 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,8 @@ requires = [
     "tbb-devel; platform_machine == 'x86' or platform_machine == 'x86_64'",
     "cmake>=3.10.2",
     "networkx",
-    "qiskit"
+    "qiskit",
+    "tqdm"
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/squander/IO_interfaces/Qiskit_IO.py b/squander/IO_interfaces/Qiskit_IO.py
index fa5e3122e..f4b5a4873 100644
--- a/squander/IO_interfaces/Qiskit_IO.py
+++ b/squander/IO_interfaces/Qiskit_IO.py
@@ -62,7 +62,8 @@
     CCX,
     RXX,
     RYY,
-    RZZ )
+    RZZ,
+    Permutation )
 
 
 
@@ -79,7 +80,6 @@ def scalar(param):
 def get_Qiskit_Circuit( Squander_circuit, parameters ):
 
     from qiskit import QuantumCircuit
-
     # creating Qiskit quantum circuit  
     circuit = QuantumCircuit(Squander_circuit.get_Qbit_Num() )
     
@@ -218,6 +218,13 @@ def get_Qiskit_Circuit( Squander_circuit, parameters ):
             #CCX gate
             target_qbits = gate.get_Target_Qbits()
             circuit.swap(target_qbits[0], target_qbits[1])
+        elif isinstance(gate, Permutation):
+            #Permutation gate
+            from qiskit.circuit.library import PermutationGate
+            pattern = gate.get_Pattern()
+            qubits = list(range(len(pattern)))
+            circuit.append( PermutationGate(pattern),qubits)
+        
         
         elif isinstance( gate, RXX ):
             # RXX gate
@@ -594,6 +601,11 @@ def convert_Qiskit_to_Squander( qc_in ):
 
             Circuit_Squander.add_RZZ(  [qubit0, qubit1] )
 
+        elif name[:11] == 'permutation':
+            #Permutation gate
+            pattern = gate.operation.pattern
+            Circuit_Squander.add_Permutation( pattern )
+
         else:
             print(f"convert_Qiskit_to_Squander: Unimplemented gate: {name}")
 
diff --git a/squander/__init__.py b/squander/__init__.py
index 46e41ba35..87755ccc5 100644
--- a/squander/__init__.py
+++ b/squander/__init__.py
@@ -14,7 +14,7 @@
 
 # optimization of wide circuits (optimize wide circuits)
 from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization
-
+from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping as Partition_Aware_Mapping
 # variational quantum solver
 from squander.VQA.qgd_Variational_Quantum_Eigensolver_Base import qgd_Variational_Quantum_Eigensolver_Base as Variational_Quantum_Eigensolver
 from squander.VQA.qgd_Generative_Quantum_Machine_Learning_Base import qgd_Generative_Quantum_Machine_Learning_Base as Generative_Quantum_Machine_Learning
@@ -52,7 +52,8 @@
     RXX,
     RYY,
     RZZ,
-    SXdg
+    SXdg,
+    Permutation
 )
 
 
diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 4841d66cf..423038a3b 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -27,6 +27,20 @@
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 
 
+def _affinity_num_workers():
+    """Count the CPUs this process is allowed to run on (never less than 1).
+
+    For sizing BQSKit ``Compiler(num_workers=...)`` so a taskset/cgroup-bound job does not oversubscribe.
+    """
+    affinity = getattr(os, "sched_getaffinity", None)
+    if affinity is not None:
+        try:
+            return max(1, len(affinity(0)))
+        except OSError:
+            pass  # affinity query failed; fall through to the machine-wide count
+    return max(1, mp.cpu_count())
+
+
 def extract_subtopology(involved_qbits, qbit_map, config):
     """Return topology edges restricted to ``involved_qbits``, with indices remapped via ``qbit_map``.
 
@@ -97,1197 +111,176 @@ def CNOTGateCount(circ: Circuit, max_gates: int = 0) -> int:
     return num_cnots
 
 
-class N_Qubit_Decomposition_Guided_Tree(N_Qubit_Decomposition_custom):
-    """Tree-guided multi-qubit decomposition using operator Schmidt rank (OSR) style costs."""
-
-    def __init__(
-        self, Umtx, config, accelerator_num, topology, paramspace=None, paramscale=None
-    ):
-        """Initialize guided tree search over a unitary (or list of unitaries) and hardware topology.
-
-        Args:
-            Umtx: Complex unitary matrix, or list of such matrices (already conjugate-transposed per caller).
-            config: Decomposition / search configuration dict.
-            accelerator_num: Number of accelerators for the base decomposer.
-            topology: List of undirected coupler pairs ``(i, j)``; default is all-to-all.
-            paramspace: Optional per-parameter affine scaling space for ``params_to_mat``.
-            paramscale: Optional scaling denominators paired with ``paramspace``.
-        """
-        super().__init__(
-            Umtx[0] if isinstance(Umtx, list) else Umtx,
-            config=config,
-            accelerator_num=accelerator_num,
-        )
-        self.Umtx = (
-            Umtx if isinstance(Umtx, list) else [Umtx]
-        )  # already conjugate transposed
-        self.qbit_num = self.Umtx[0].shape[0].bit_length() - 1
-        self.config = config
-        self.accelerator_num = accelerator_num
-        self.paramspace = paramspace
-        self.paramscale = () if paramscale is None else paramscale
-        # self.set_Cost_Function_Variant( 0 )	 #0 is Frobenius, 3 is HS, 10 is OSR
-        if topology is None:
-            topology = [
-                (i, j)
-                for i in range(self.qbit_num)
-                for j in range(i + 1, self.qbit_num)
-            ]
-        self.topology = topology
-
-    @staticmethod
-    def enumerate_unordered_cnot_BFS(n: int, topology=None, use_gl=True):
-        """Yield successive BFS levels of CNOT-reachable GL(n,2) states (see ``enumerate_unordered_cnot_BFS_level``).
-
-        Args:
-            n: Number of qubits.
-            topology: Allowed unordered CNOT pairs; default all pairs.
-            use_gl: If True, use GL-style column updates; else restricted enumeration.
-
-        Yields:
-            Each level's list of ``(state_key, seq_pairs, seq_directed)`` discoveries.
-        """
-        # Precompute unordered pairs
-        topology = (
-            [(i, j) for i in range(n) for j in range(i + 1, n)]
-            if topology is None
-            else topology
-        )
-        prior_level_info: Union[tuple[Any, Any, Any, Any], None] = None
-        while True:
-            visited, seq_pairs_of, seq_dir_of, res = (
-                N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS_level(
-                    n, topology, prior_level_info, use_gl=use_gl
-                )
-            )
-            if not res:
-                break
-            yield res
-            prior_level_info = (
-                visited,
-                seq_pairs_of,
-                seq_dir_of,
-                list(x[0] for x in reversed(res)),
-            )
-
-    @staticmethod
-    def canonical_prefix_ok(seq):
-        """Check whether a sequence of unordered pair steps has a canonical topological order.
+def generate_squander_seqpam(squander_config, block_size):
+    """Build a bqskit SeqPAM workflow using Squander as the inner synthesis engine with ILP partitioning.
 
-        Returns:
-            ``-1`` if the prefix is OK; otherwise the first index where canonical order fails.
-        """
-        m = len(seq)
-        if m <= 1:
-            return -1
-        succ = {}
-        indeg = {}
-        last_on = {}
-        for k in range(m):
-            for q in seq[k]:
-                if q in last_on:
-                    p = last_on[q]
-                    succ.setdefault(p, []).append(k)
-                    indeg[k] = indeg.get(k, 0) + 1
-                last_on[q] = k
-        import heapq
-
-        pq = [(seq[x], x) for x in range(m) if indeg.get(x, 0) == 0]
-        heapq.heapify(pq)
-        for pos in range(m):
-            # Kahn's algorithm
-            if len(pq) == 0:
-                return pos  # malformed (shouldn't happen)
-            u = heapq.heappop(pq)
-            if u[1] != pos:
-                return pos  # deviation: not canonical
-            for v in succ.get(u[1], ()):
-                indeg[v] -= 1
-                if indeg[v] == 0:
-                    heapq.heappush(pq, (seq[v], v))
-        return -1
-
-    @staticmethod
-    def enumerate_unordered_cnot_BFS_level(
-        n: int,
-        topology: Optional[List[Tuple[int, int]]] = None,
-        prior_level_info: Optional[
-            Tuple[
-                Set[Tuple[int, ...]],
-                Dict[Tuple[int, ...], List[Tuple[int, int]]],
-                Dict[Tuple[int, ...], List[Tuple[int, int]]],
-                List[
-                    Tuple[Tuple[int, ...], List[Tuple[int, int]], List[Tuple[int, int]]]
-                ],
-            ]
-        ] = None,
-        use_gl=True,
-    ):
-        """Enumerate GL(n,2) states at the next BFS depth from ``prior_level_info``.
-
-        Moves are *recorded* as unordered pairs (structure view); each expansion
-        may try both CNOT directions internally when ``use_gl`` is True.
-
-        Returns:
-            Tuple ``(visited, seq_pairs_of, seq_dir_of, res)`` where ``res`` is a
-            list of ``(A, seq_pairs, seq_directed)`` for newly discovered states
-            ``A``: ``seq_pairs`` is the unordered-pair history; ``seq_directed`` is
-            a consistent directed realization. On the first call, pass
-            ``prior_level_info=None`` to obtain the root state only.
-        """
-        if prior_level_info is None:
-            # Initial state
-            start_key = tuple(1 << i for i in range(n))
-
-            # Visited: we only need to mark states once (minimal depth)
-            visited = {start_key}
-
-            # We also keep *one* representative sequence per state (unordered + directed)
-            seq_pairs_of = {start_key: []}
-            seq_dir_of = {start_key: []}
-
-            # Yield the root
-            return visited, seq_pairs_of, seq_dir_of, [(start_key, [], [])]
-        else:
-            visited, seq_pairs_of, seq_dir_of, q = prior_level_info
-        res = []
-        new_seq_pairs_of = {}
-        new_seq_dir_of = {}
-
-        while q:
-            A = q.pop()
-            last_pairs = seq_pairs_of[A]
-            last_dirs = seq_dir_of[A]
-            assert topology is not None
-            for p in topology:
-                if not use_gl:
-                    if len(last_pairs) >= 3 and all(p == x for x in last_pairs[-3:]):
-                        continue  # avoid more than 3 repeated CNOTs
-                    if (
-                        N_Qubit_Decomposition_Guided_Tree.canonical_prefix_ok(
-                            last_pairs + [p]
-                        )
-                        >= 0
-                    ):
-                        continue  # not canonical prefix
-                # Try both directions, but record the *same* unordered step 'p'
-                for mv in (p, (p[1], p[0])) if use_gl else (p,):
-                    # CNOT left
-                    if use_gl:
-                        if mv[0] == mv[1]:
-                            B = A
-                        else:
-                            B = list(A)
-                            B[mv[1]] ^= B[mv[0]]
-                            B = tuple(B)
-
-                        if B in visited:
-                            continue  # already discovered at minimal depth
-                    else:
-                        B = tuple(last_dirs + [p])
-
-                    visited.add(B)
-                    new_seq_pairs_of[B] = last_pairs + [p]
-                    new_seq_dir_of[B] = last_dirs + [mv]
-
-                    # Emit as soon as we discover the state (BFS → minimal depth)
-                    res.append((B, new_seq_pairs_of[B], new_seq_dir_of[B]))
-        return visited, new_seq_pairs_of, new_seq_dir_of, res
-
-    @staticmethod
-    def build_sequence(stop: int = 5, ordered: bool = True, use_gl: bool = True):
-        """Debug helper: print distribution of minimal CNOT sequence lengths by qubit count (up to ``stop``).
-
-        See OEIS A002884 for related enumeration context. Not used in production optimization paths.
-        """
-        # https://oeis.org/A002884
-        # unordered sequence: 1, 1, 4, 88, 9556, 4526605
-        # unordered at 5 qubits: {0: 1, 1: 10, 2: 85, 3: 650, 4: 4475, 5: 27375, 6: 142499, 7: 580482, 8: 1501297, 9: 1738232, 10: 517884, 11: 13591, 12: 24}
-        for i in range(2, stop + 1):
-            d = {}
-            for z in N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS(
-                i, use_gl=use_gl
-            ):
-                for x in (list if ordered else set)(tuple(x[1]) for x in z):
-                    d[len(x)] = d.get(len(x), 0) + 1
-                if not use_gl and len(d) > 5:
-                    break
-            print({x: d[x] for x in sorted(d)}, sum(d.values()))
+    Args:
+        squander_config: Config dict passed to SquanderSynthesisPass (bqskit-squander keys:
+            ``strategy`` ("Tree_search"/"Tabu_search"), ``verbosity``,
+            ``optimization_tolerance``, ``optimizer_engine``, etc.).
+        block_size: Maximum block size for ILP partitioning and SubtopologySelectionPass.
 
-    @staticmethod
-    def extract_bits(x, pos):
-        """Pack bits of integer ``x`` at positions ``pos`` into a smaller integer (LSB-first order)."""
-        return sum(((x >> p) & 1) << i for i, p in enumerate(pos))
+    Returns:
+        bqskit Workflow implementing the two-stage permutation-aware mapping.
+    """
+    from bqskit.passes import (
+        SquanderSynthesisPass,
+        ForEachBlockPass,
+        EmbedAllPermutationsPass,
+        PAMRoutingPass,
+        PAMLayoutPass,
+        PAMVerificationSequence,
+        SubtopologySelectionPass,
+        ApplyPlacement,
+        UnfoldPass,
+        ExtractModelConnectivityPass,
+        RestoreModelConnectivityPass,
+        LogPass,
+    )
+    from bqskit.passes.control import IfThenElsePass
+    from bqskit.passes.control.predicates import NotPredicate, WidthPredicate
+    from bqskit.compiler import Workflow, BasePass
 
-    @staticmethod
-    def build_osr_matrix(U, n, A):
-        """Reshape unitary ``U`` (size ``2^n``) into the OSR matrix for bipartition ``A`` vs complement.
+    class SquanderILPPartitioner(BasePass):
+        """Partition a bqskit circuit using Squander's ILP partitioner."""
 
-        Args:
-            U: Flattened ``2^n x 2^n`` unitary (row-major).
-            n: Qubit count.
-            A: Tuple of qubit indices on subsystem A.
+        def __init__(self, block_size):
+            self.block_size = block_size
 
-        Returns:
-            Matrix of shape ``(2^{|A|})^2 x (2^{|B|})^2`` for Schmidt analysis.
-        """
-        A = list(reversed(A))
-        B = list(sorted(set(range(n)) - set(A), reverse=True))
-        A, B = [n - 1 - q for q in A], [n - 1 - q for q in B]
-        dA = 1 << len(A)
-        dB = 1 << len(B)
-        return (
-            U.reshape([2] * (2 * n))
-            .transpose(
-                tuple(A) + tuple(t + n for t in A) + tuple(B) + tuple(t + n for t in B)
+        async def run(self, circuit, data):
+            from bqskit.ir import Circuit as BQCircuit
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from qiskit import qasm2
+            from qiskit import QuantumCircuit as QkCircuit
+            from squander import Qiskit_IO
+            from squander.partitioning.partition import PartitionCircuit
+
+            # Unfold any CircuitGate blocks (e.g. from a prior SubtopologySelectionPass)
+            # so that bqskit op indices align 1:1 with squander gate indices after the
+            # QASM roundtrip.  unfold_all() is a no-op on already-flat circuits.
+            flat_circuit = circuit.copy()
+            flat_circuit.unfold_all()
+
+            qasm_str = OPENQASM2Language().encode(flat_circuit)
+            qk_circ = QkCircuit.from_qasm_str(qasm_str)
+            sqdr_circ, sqdr_parameters = Qiskit_IO.convert_Qiskit_to_Squander(
+                qk_circ
             )
-            .reshape(dA * dA, dB * dB)
-        )
-
-    @staticmethod
-    def accumulate_grad_for_cut(U, G, Umat, VTmat, n, A):  # qubits on A
-        """Accumulate gradient ``G * Umat @ VTmat`` from an SVD triplet back into full ``U`` layout for cut ``A``."""
-        A = list(reversed(A))
-        B = list(sorted(set(range(n)) - set(A), reverse=True))
-        A, B = [n - 1 - q for q in A], [n - 1 - q for q in B]
-        mat = np.array(G) * Umat @ VTmat  # reconstruct U from its dyadic decomposition
-        revmap = [None] * (2 * n)
-        for i, x in enumerate(
-            tuple(A) + tuple(t + n for t in A) + tuple(B) + tuple(t + n for t in B)
-        ):
-            revmap[x] = i
-        U += mat.reshape([2] * (2 * n)).transpose(tuple(revmap)).reshape(*U.shape)
-        return U
-
-    @staticmethod
-    def trace_out_qubits(U, n, A):
-        """Trace out complement of subsystem ``A`` and return a unitary polar factor on ``A`` (2^{|A|} x 2^{|A|})."""
-        M = N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, n, A)
-        M = np.linalg.svd(M, compute_uv=True, full_matrices=False)[0][:, 0].reshape(
-            1 << len(A), 1 << len(A)
-        )
-        return N_Qubit_Decomposition_Guided_Tree._polar_unitary(M)
-
-    @staticmethod
-    def numerical_rank_osr(M, Fnorm, tol=1e-10):
-        """Count singular values of ``M/Fnorm`` above ``tol`` relative to the largest; returns ``(rank, s)``."""
-        s = np.linalg.svd(M, full_matrices=False, compute_uv=False) / Fnorm
-        # print(s)
-        return int(np.sum(s >= s[0] * tol)), s
-
-    @staticmethod
-    def operator_schmidt_rank(U, n, A, Fnorm, tol=1e-10):
-        """Operator Schmidt rank of ``U`` across cut ``A`` (via OSR matrix), using ``numerical_rank_osr``."""
-        return N_Qubit_Decomposition_Guided_Tree.numerical_rank_osr(
-            N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, n, A), Fnorm, tol
-        )
-
-    @staticmethod
-    def unique_cuts(n):
-        """Yield all nontrivial unordered bipartitions of ``n`` qubits (each complement pair once)."""
-        import itertools
-
-        qubits = tuple(range(n))
-        for r in range(1, n // 2 + 1):  # only up to half
-            for S in itertools.combinations(qubits, r):
-                if r < n - r:
-                    yield S
-                else:  # r == n-r (only possible when n even): tie-break
-                    comp = tuple(q for q in qubits if q not in S)
-                    if S < comp:  # lexicographically smaller tuple wins
-                        yield S
-
-    def get_circuit_from_pairs(self, pairs, finalizing=True):
-        """Build a layer of U3–U3–CNOT per pair, optionally followed by trailing U3 on every qubit."""
-        circ = Circuit(self.qbit_num)
-        for pair in pairs:
-            circ.add_U3(pair[0])
-            circ.add_U3(pair[1])
-            circ.add_CNOT(pair[0], pair[1])
-        if finalizing:
-            for qbit in range(self.qbit_num):
-                circ.add_U3(qbit)
-        return circ
-
-    @staticmethod
-    def ceil_log2(x):
-        """Ceiling of log2 for nonnegative integer ``x``; ``0`` maps to ``0``."""
-        return 0 if x == 0 else (x - 1).bit_length()
-
-    @staticmethod
-    def logsumexp_smoothmax(Lc, tau=1e-2):
-        """Smooth maximum of list ``Lc``: ``tau * log(sum exp(v/tau)) + max``, stable implementation."""
-        if not Lc:
-            return 0.0
-        if tau <= 0.0:
-            raise RuntimeError("tau must be > 0")
-        m = max(Lc)
-        acc = 0.0
-        for v in Lc:
-            acc += np.exp((v - m) / tau)
-        return tau * np.log(acc) + m
-
-    @staticmethod
-    def dyadic_loss(S, max_dyadic, rho=0.9, tol=1e-4):
-        """Weighted loss on dyadic singular-value indices (powers of two) of normalized spectrum ``S``."""
-        tot_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(len(S))
-        w = 1.0
-        acc = 0.0
-        for k in range(max_dyadic - 1, -1, -1):
-            if k < tot_dyadic:
-                val = S[1 << k] - S[0] * tol
-                acc += w * val * val
-            w *= rho
-        return acc
-
-    @staticmethod
-    def avg_loss(cuts_S, rho=0.9):
-        """Average ``dyadic_loss`` over a list of singular-value spectra ``cuts_S``."""
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        total_loss = 0.0
-        for S in cuts_S:
-            total_loss += N_Qubit_Decomposition_Guided_Tree.dyadic_loss(
-                S, max_dyadic, rho
+            partitioned_circuit, parameters, _ = PartitionCircuit(
+                sqdr_circ,
+                sqdr_parameters,
+                self.block_size,
+                strategy="ilp",
             )
-        return total_loss / len(cuts_S)
-
-    # Aggregated cost over cuts: softmax (log-sum-exp) of per-cut dyadic losses
-    @staticmethod
-    def cuts_softmax_dyadic_cost(cuts_S, rho=0.1, tau=1e-2):
-        """Log-sum-exp aggregate of per-cut dyadic losses (temperature ``tau``)."""
-        if tau <= 0.0:
-            raise RuntimeError("tau must be > 0")
-        Lc = []
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        for S in cuts_S:
-            Lc.append(N_Qubit_Decomposition_Guided_Tree.dyadic_loss(S, max_dyadic, rho))
-        return N_Qubit_Decomposition_Guided_Tree.logsumexp_smoothmax(Lc, tau)
-
-    # Gradient w.r.t. the singular values (diagonal of dL/dΣ):
-    @staticmethod
-    def dyadic_loss_grad_diag(S, max_dyadic, Fnorm, rho=0.1, tol=1e-4):
-        """Diagonal gradient of ``dyadic_loss`` w.r.t. singular values (dyadic indices only)."""
-        n = len(S)
-        # c_k = rho^k / Mk  for k=1..n-1, then prefix sum C_j = sum_{k=1}^j c_k
-        tot_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(n)
-        grad = [0.0] * tot_dyadic
-        w = 1.0
-        for k in range(max_dyadic - 1, -1, -1):
-            if k < tot_dyadic:
-                idx = 1 << k
-                grad[k] = (
-                    2.0 * w * S[idx] * (1.0 - tol) / Fnorm
-                )  # 1-tol not needed if using stop-grad
-            w *= rho  # w = rho^k
-        return grad
 
-    @staticmethod
-    def cuts_avg_dyadic_grad(cuts_S, Fnorm, rho=0.1):
-        """Per-cut gradients for the average dyadic loss (list parallel to ``cuts_S``)."""
-        C = len(cuts_S)
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        Lc = []
-        for c in range(C):
-            Lc.append(
-                N_Qubit_Decomposition_Guided_Tree.dyadic_loss_grad_diag(
-                    cuts_S[c], max_dyadic, Fnorm * C, rho
+            partitioned = BQCircuit(circuit.num_qudits, circuit.radixes)
+            qasm = OPENQASM2Language()
+
+            for subcircuit in partitioned_circuit.get_Gates():
+                global_qudits = list(subcircuit.get_Qbits())
+                if not global_qudits:
+                    continue
+
+                start = subcircuit.get_Parameter_Start_Index()
+                stop = start + subcircuit.get_Parameter_Num()
+                sub_parameters = parameters[start:stop]
+                local_map = {q: i for i, q in enumerate(global_qudits)}
+                local_subcircuit = subcircuit.Remap_Qbits(
+                    local_map,
+                    len(global_qudits),
                 )
-            )
-        return Lc
-
-    # Gradient w.r.t. singular values (same length as S).
-    # Only dyadic positions (1,2,4,...) get nonzero entries; others are 0.
-    @staticmethod
-    def cuts_softmax_tail_grad(cuts_S, Fnorm, rho=0.1, tau=1e-2):
-        """Gradient of softmax-of-dyadic-losses w.r.t. each cut's singular values."""
-        C = len(cuts_S)
-        if C == 0:
-            return []
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        # 1) per-cut losses
-        Lc = [
-            N_Qubit_Decomposition_Guided_Tree.dyadic_loss(cuts_S[c], max_dyadic, rho)
-            for c in range(C)
-        ]
-
-        # 2) softmax weights w_c = exp((Lc - m)/tau) / Z
-        m = max(Lc)
-        w = [np.exp((Lc[c] - m) / tau) for c in range(C)]
-        Z = np.sum(w)
-        for c in range(C):
-            w[c] /= Z if Z > 0.0 else 1.0
-
-        # 3) dL/dS^{(c)} = w_c * dL_c/dS^{(c)}
-        return [
-            [
-                v * w[c]
-                for v in N_Qubit_Decomposition_Guided_Tree.dyadic_loss_grad_diag(
-                    cuts_S[c], max_dyadic, Fnorm, rho
+                local_qiskit = Qiskit_IO.get_Qiskit_Circuit(
+                    local_subcircuit,
+                    sub_parameters,
                 )
-            ]
-            for c in range(C)
-        ]
-
-    @staticmethod
-    def loss_for_rank(S, rank):
-        """Sum of squares of singular values from index ``2**rank`` onward (tail beyond target rank)."""
-        start = 1 << rank
-        if start >= len(S):
-            return 0.0
-        return sum(x * x for x in S[start:])
-
-    @staticmethod
-    def avg_loss_for_rank(cuts_S, rank):
-        """Average ``loss_for_rank`` over cuts."""
-        if not cuts_S:
-            return 0.0
-        total_loss = 0.0
-        for S in cuts_S:
-            total_loss += N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank)
-        return total_loss / len(cuts_S)
-
-    # Aggregated cost over cuts: softmax (log-sum-exp) of per-cut dyadic losses
-    @staticmethod
-    def cuts_softmax_rank_cost(cuts_S, rank, tau=1e-2):
-        """Softmax aggregate of per-cut ``loss_for_rank`` (temperature ``tau``)."""
-        Lc = []
-        for S in cuts_S:
-            Lc.append(N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank))
-        return N_Qubit_Decomposition_Guided_Tree.logsumexp_smoothmax(Lc, tau)
-
-    # Gradient w.r.t. the singular values (diagonal of dL/dΣ):
-    @staticmethod
-    def loss_for_rank_grad_diag(S, rank, Fnorm):
-        """
-        Gradient of a single-cut tail loss with respect to the RAW singular values,
-        assuming S is already normalized and Fnorm is treated as constant.
-
-        If S = sigma / Fnorm, then d/dsigma_i sum_{j>=r} S_j^2 = 2*S_i/Fnorm on tail.
-        """
-        n = len(S)
-        start = 1 << rank
-        grad = [0.0] * n
-        if start >= n:
-            return grad
-        invF = 1.0 / Fnorm
-        for i in range(start, n):
-            grad[i] = 2.0 * S[i] * invF
-        return grad
-
-    @staticmethod
-    def cuts_avg_rank_grad(cuts_S, rank, Fnorm):
-        """
-        Gradient of average tail loss across cuts.
-        Returns one gradient vector per cut, same length as that cut's S.
-        """
-        C = len(cuts_S)
-        if C == 0:
-            return []
-        scale = 1.0 / C
-        out = []
-        for S in cuts_S:
-            g = N_Qubit_Decomposition_Guided_Tree.loss_for_rank_grad_diag(
-                S, rank, Fnorm
-            )
-            out.append([scale * v for v in g])
-        return out
-
-    # Gradient w.r.t. singular values (same length as S).
-    @staticmethod
-    def cuts_softmax_rank_grad(cuts_S, rank, Fnorm, tau=1e-2):
-        """
-        Gradient of smooth-max across cuts:
-            L = tau * log(sum_c exp(L_c / tau))
-        so
-            dL = sum_c softmax_c * dL_c
-        """
-        C = len(cuts_S)
-        if C == 0:
-            return []
-        if tau <= 0.0:
-            raise RuntimeError("tau must be > 0")
-
-        Lc = [N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank) for S in cuts_S]
-
-        m = max(Lc)
-        w = [np.exp((v - m) / tau) for v in Lc]
-        Z = np.sum(w)
-        if Z <= 0.0:
-            Z = 1.0
-        w = [x / Z for x in w]
-
-        out = []
-        for c, S in enumerate(cuts_S):
-            g = N_Qubit_Decomposition_Guided_Tree.loss_for_rank_grad_diag(
-                S, rank, Fnorm
-            )
-            out.append([w[c] * v for v in g])
-        return out
-
-    # Build M with build_osr_matrix, then SVD (econ) and grab top triplet.
-    @staticmethod
-    def top_k_triplet_for_cut(
-        U,  # (N x N), row-major, N = 1<<q
-        q,  # number of qubits
-        A,  # qubits on side A
-        Fnorm,  # e.g., sqrt(N)
-    ):
-        """SVD of OSR matrix for cut ``A``: returns normalized singular values and ``U``, ``Vh``."""
-        # 1) Build M for this cut
-        M = N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, q, A)
-        k = min(M.shape)
-
-        # 2) SVD: M = U * diag(S) * VT  (VT = V^H)
-        # Row-major API handles leading dims as col counts.
-        res = np.linalg.svd(M, full_matrices=False, compute_uv=True)
-        return res.S / Fnorm, res.U, res.Vh  # normalized singular value
-
-    @staticmethod
-    def get_deriv_osr_entanglement(matrix, use_cuts, rank, use_softmax):
-        """Gradient of rank / softmax-rank entanglement cost w.r.t. unitary ``matrix`` entries."""
-        qbit_num = len(matrix).bit_length() - 1
-        cuts = (
-            list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(qbit_num))
-            if len(use_cuts) == 0
-            else use_cuts
-        )
-        Fnorm = np.sqrt(len(matrix))
-        deriv = np.zeros(matrix.shape, dtype=complex)
-        # Compute the derivative of the OSR entanglement cost function
-        triplets = []
-        allS = []
-        for cut in cuts:
-            # 1) top k triplet on the normalized reshape M_c
-            S, Umat, VTmat = N_Qubit_Decomposition_Guided_Tree.top_k_triplet_for_cut(
-                matrix, qbit_num, cut, Fnorm
-            )
-            triplets.append(([], Umat, VTmat))
-            allS.append(S)
-        if use_softmax:
-            allS = N_Qubit_Decomposition_Guided_Tree.cuts_softmax_rank_grad(
-                allS, rank, Fnorm
-            )
-        else:
-            allS = N_Qubit_Decomposition_Guided_Tree.cuts_avg_rank_grad(
-                allS, rank, Fnorm
-            )
-        for i in range(len(cuts)):
-            triplets[i] = (allS[i], triplets[i][1], triplets[i][2])
-        for i in range(len(cuts)):
-            G, Umat, VTmat = triplets[i]
-            N_Qubit_Decomposition_Guided_Tree.accumulate_grad_for_cut(
-                deriv, G, Umat, VTmat, qbit_num, cuts[i]
-            )
-        return deriv
-
-    # Compute grad component = Re Tr( A^† B ) for A = dL/dU, B = dU/dθ
-    # A and B are (rows x cols) with row-major leading dimension.
-    @staticmethod
-    def real_trace_conj_dot(A, B):
-        """Return ``Re Tr(A† B)`` for complex matrices ``A``, ``B`` (row-major storage)."""
-        return np.sum(A.real * B.real + A.imag * B.imag)  # Re Tr(A^† B)
-
-    @staticmethod
-    def param_derivs(circ, Umtx, x):
-        """Finite-difference / shift-style partial derivatives ``∂U/∂θ_i`` for each gate parameter in ``x``."""
-        n = len(x)
-        derivs = [None] * n
-        for i in range(n):
-            kind = i % 3
-            if kind == 0:  # d/dt:  ∂U/∂t = U(t+π/2, φ, λ)
-                x_shift = x.copy()
-                x_shift[i] += np.pi / 2
-                Ui = Umtx.copy()
-                circ.apply_to(x_shift, Ui)
-                derivs[i] = Ui
-            else:  # d/dφ or d/dλ: ∂U/∂p = 0.5*(U(p+π/2) - U(p-π/2))
-                xp = x.copy()
-                xp[i] += np.pi / 2
-                xm = x.copy()
-                xm[i] -= np.pi / 2
-                Up = Umtx.copy()
-                Um = Umtx.copy()
-                circ.apply_to(xp, Up)
-                circ.apply_to(xm, Um)
-                derivs[i] = 0.5 * (Up - Um)
-        return derivs
-
-    @staticmethod
-    def _global_phase_fix(U):
-        """Remove global phase from square unitary ``U`` using determinant normalization."""
-        return U / (np.linalg.det(U) ** (1 / len(U)))
-
-    @staticmethod
-    def _polar_unitary(X):
-        """Nearest unitary to ``X`` via polar decomposition (SVD)."""
-        U, _, Vh = np.linalg.svd(X, full_matrices=False)
-        return U @ Vh
-
-    @staticmethod
-    def su2_to_u3_zyz(U):
-        """
-        Decompose a 2x2 unitary (det=1) into Qiskit U3: Rz(phi) @ Ry(theta) @ Rz(lam).
-        Returns (theta, phi, lam) in radians.
-        """
-        U = N_Qubit_Decomposition_Guided_Tree._global_phase_fix(U)
-        # Handle numeric edge cases robustly
-        a = U[0, 0]
-        b = U[0, 1]
-        c = U[1, 0]
-        d = U[1, 1]
-        # Prefer arccos for theta; it's stable when |a| is not tiny
-        ca = np.clip(np.abs(a), 0.0, 1.0)
-        theta = 2.0 * np.arccos(ca)
-        # If sin(theta/2) ~ 0, collapse to Z rotations
-        eps = 1e-12
-        if abs(np.sin(theta / 2)) < eps:
-            # Then c≈0, b≈0; only Z phases matter: U ≈ e^{iα} Rz(phi+lam)
-            # Choose phi=0, lam = arg(d) - arg(a)
-            phi = 0.0
-            lam = np.angle(d) - np.angle(a)
-            # Normalize to [-pi,pi)
-            lam = (lam + np.pi) % (2 * np.pi) - np.pi
-            return float(theta), float(phi), float(lam)
-
-        # Otherwise, phases from elements and normalize
-        phi = np.angle(c) - np.angle(a)
-        phi = (phi + np.pi) % (2 * np.pi) - np.pi
-        lam = np.angle(b) - np.angle(a) - np.pi
-        lam = (lam + np.pi) % (2 * np.pi) - np.pi
-        return float(theta), float(phi), float(lam)
-
-    @staticmethod
-    def _A_from_c(c1, c2, c3):
-        """Two-qubit canonical interaction ``exp(-i/2 * (c1 XX + c2 YY + c3 ZZ))`` as a unitary."""
-        X = np.array([[0, 1], [1, 0]], complex)
-        Y = np.array([[0, -1j], [1j, 0]], complex)
-        Z = np.array([[1, 0], [0, -1]], complex)
-        XX = np.kron(X, X)
-        YY = np.kron(Y, Y)
-        ZZ = np.kron(Z, Z)
-        H = c1 * XX + c2 * YY + c3 * ZZ
-        # use exp via eig (4x4) for robustness
-        ew, EV = np.linalg.eig(1j * H)
-        A = EV @ np.diag(np.exp(ew)) @ np.linalg.inv(EV)
-        # project back to unitary (remove numeric drift)
-        return N_Qubit_Decomposition_Guided_Tree._polar_unitary(A)
-
-    # Factor K1, K2 → (2x2 ⊗ 2x2)
-    @staticmethod
-    def factor_local(K):
-        """Factor 4x4 unitary ``K`` into Kronecker product of two 2x2 unitaries (SVD on reshaped tensor)."""
-        # reshape to (2,2,2,2), SVD the (a,c ; b,d) unfolding
-        M = K.reshape(2, 2, 2, 2).transpose(0, 2, 1, 3).reshape(4, 4)
-        U, _, Vh = np.linalg.svd(M, full_matrices=False)
-        A = U[:, 0].reshape(2, 2)
-        B = Vh.conj().T[:, 0].reshape(2, 2)
-        return N_Qubit_Decomposition_Guided_Tree._polar_unitary(
-            A
-        ), N_Qubit_Decomposition_Guided_Tree._polar_unitary(B)
-
-    @staticmethod
-    def _magic_basis_plusYY():
-        """Magic basis matrix for two-qubit canonical form (Bell-like columns)."""
-        # Complex magic basis (matches A(c)=exp(-i/2*(c1 XX + c2 YY + c3 ZZ)) below)
-        # Columns are (|Φ+>, i|Φ->, i|Ψ+>, |Ψ->) up to harmless phases
-        return (1 / np.sqrt(2)) * np.array(
-            [[1, 0, 0, 1j], [0, 1j, 1, 0], [0, 1j, -1, 0], [1j, 0, 0, -1]],
-            dtype=complex,
-        )
-
-    @staticmethod
-    def _project_to_SO4(O):
-        """Nearest proper SO(4) rotation to real matrix ``O`` (SVD with det fix)."""
-        # nearest real orthogonal with det=+1
-        O = np.real_if_close(O, tol=1e5)
-        U, _, Vt = np.linalg.svd(O)
-        O = U @ Vt
-        if np.linalg.det(O) < 0:
-            O[:, 0] *= -1
-        return O
-
-    @staticmethod
-    def _clean_col_phases(W):
-        """Remove column-wise global phases from matrix ``W`` (largest-magnitude entry per column)."""
-        Wc = W.copy()
-        for j in range(Wc.shape[1]):
-            col = Wc[:, j]
-            k = np.argmax(np.abs(col))
-            if np.abs(col[k]) > 1e-14:
-                Wc[:, j] *= np.exp(-1j * np.angle(col[k]))
-        return Wc
-
-    @staticmethod
-    def closest_local_product(W4):
-        """Best product of single-qubit unitaries approximating 4x4 ``W4`` (via ``factor_local``)."""
-        A, B = N_Qubit_Decomposition_Guided_Tree.factor_local(W4)
-        return N_Qubit_Decomposition_Guided_Tree._global_phase_fix(
-            A
-        ), N_Qubit_Decomposition_Guided_Tree._global_phase_fix(B)
-
-    @staticmethod
-    def kak_u3s_around_cx(U, n, c, t, iters=3):
-        """KAK-style two-qubit block on control ``c`` and target ``t``: Weyl angles and U3 params (debug helper)."""
-        U4 = N_Qubit_Decomposition_Guided_Tree.trace_out_qubits(U, n, (c, t))
-        U4 = N_Qubit_Decomposition_Guided_Tree._global_phase_fix(U4)
-        from qiskit.synthesis import TwoQubitWeylDecomposition
-
-        twd = TwoQubitWeylDecomposition(U4)
-        c1, c2, c3 = twd.a, twd.b, twd.c
-        K1A, K1B, K2A, K2B = twd.K1l, twd.K1r, twd.K2l, twd.K2r
-        A = N_Qubit_Decomposition_Guided_Tree._A_from_c(c1, c2, c3)
-        U_rec = np.kron(K1A, K1B) @ A @ np.kron(K2A, K2B)
-        z = np.trace(U_rec.conj().T @ U4)
-        U_rec *= np.exp(1j * np.angle(z))
-        print("Frob err:", np.linalg.norm(U_rec - U4), c1, c2, c3)
-        thA_pre, phA_pre, laA_pre = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K2A.conj().T
-        )
-        thB_pre, phB_pre, laB_pre = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K2B.conj().T
-        )
-        thA_post, phA_post, laA_post = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K1A.conj().T
-        )  # left-apply ⇒ take dagger on outputs
-        thB_post, phB_post, laB_post = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K1B.conj().T
-        )
-        return {
-            "c": (c1, c2, c3),
-            "pre": {
-                "A": (thA_pre / 2, phA_pre, laA_pre),
-                "B": (thB_pre / 2, phB_pre, laB_pre),
-            },
-            "post": {
-                "A": (thA_post / 2, phA_post, laA_post),
-                "B": (thB_post / 2, phB_post, laB_post),
-            },
-        }
-
-    def params_to_mat(self, params):
-        """Apply current gate structure to each target unitary with (optional) affine parameter scaling."""
-        allU = []
-        for U, pspace in zip(
-            self.Umtx, [None] if self.paramspace is None else self.paramspace
-        ):
-            U = U.copy()
-            scaled_params = (
-                np.sum(
-                    params.reshape(-1, 1 + len(pspace)) * np.array((1.0,) + pspace),
-                    axis=1,
+                local_bqskit = qasm.decode(qasm2.dumps(local_qiskit))
+                partitioned.append_circuit(
+                    local_bqskit,
+                    global_qudits,
+                    as_circuit_gate=True,
                 )
-                if pspace is not None
-                else params
-            )
-            self.get_Circuit().apply_to(
-                scaled_params if pspace is not None else params, U
-            )
-            allU.append(U)
-        return allU
 
-    def OSR_with_local_alignment(
-        self, pairs, cuts, Fnorm, tol, rank, use_softmax, method="dual_annealing"
-    ):
-        """Optimize gate parameters to reduce OSR-based entanglement across ``cuts`` (optionally softmax-aggregated).
+            circuit.become(partitioned, False)
 
-        Uses cost variant 10 during optimization, then restores variant 3. Returns list of
-        ``(ceil_log2(rank), singular_spectrum)``-style entries per unitary and cut.
-        """
-        if len(pairs) != 0:
-            self.set_Cost_Function_Variant(10)
-            # self.Run_Decomposition(pairs, False)
-            self.set_Gate_Structure(self.get_circuit_from_pairs(pairs, False))
-            import scipy
-
-            param_bound = np.array(
-                ([2 * np.pi] + [1 / x for x in self.paramscale])
-                * self.get_Parameter_Num()
-            )
+    class SetPAMInitialPlacementPass(BasePass):
+        """Set the placement used as the starting point for the final PAM layout."""
 
-            def cost(x):
-                allU = self.params_to_mat(x)
-                S = [
-                    N_Qubit_Decomposition_Guided_Tree.operator_schmidt_rank(
-                        U, self.qbit_num, cut, Fnorm, tol
-                    )[1]
-                    for U in allU
-                    for cut in cuts
-                ]
-                if use_softmax:
-                    return N_Qubit_Decomposition_Guided_Tree.cuts_softmax_rank_cost(
-                        S, rank
-                    )
-                else:
-                    return N_Qubit_Decomposition_Guided_Tree.avg_loss_for_rank(S, rank)
-
-            def jacobian(x):
-                allU = self.params_to_mat(x)
-                grad = np.zeros(len(x), dtype=float)
-                for Ubase, U, pspace in zip(
-                    self.Umtx,
-                    allU,
-                    [None] if self.paramspace is None else self.paramspace,
-                ):
-                    dL = N_Qubit_Decomposition_Guided_Tree.get_deriv_osr_entanglement(
-                        U, cuts, rank, use_softmax
-                    )
-                    basevec = np.array((1.0,) if pspace is None else (1.0,) + pspace)
-                    scaled_params = (
-                        np.sum(x.reshape(-1, 1 + len(pspace)) * basevec, axis=1)
-                        if pspace is not None
-                        else x
-                    )
-                    derivs = N_Qubit_Decomposition_Guided_Tree.param_derivs(
-                        self.get_Circuit(), Ubase, scaled_params
-                    )
-                    newgrad = np.array(
-                        [
-                            N_Qubit_Decomposition_Guided_Tree.real_trace_conj_dot(
-                                dL, deriv
-                            )
-                            for deriv in derivs
-                        ]
-                    )
-                    if pspace is not None:
-                        newgrad = (np.array(newgrad)[:, np.newaxis] * basevec).reshape(
-                            -1
-                        )
-                    grad += newgrad
-                return grad / len(self.Umtx)
+        def __init__(self, placement):
+            self.placement = None if placement is None else list(placement)
 
-            if method == "differential_evolution":
-                best = scipy.optimize.differential_evolution(
-                    cost, [(0, x) for x in param_bound], maxiter=100, polish=False
-                )
-                best = scipy.optimize.minimize(
-                    cost, best.x, method="BFGS", jac=jacobian, options={"maxiter": 200}
-                )
-            elif method == "dual_annealing":
-                best = None
-                for seed in range(20):
-                    res = scipy.optimize.dual_annealing(
-                        cost, [(0, x) for x in param_bound], maxiter=100
-                    )  # , minimizer_kwargs={'jac': jacobian})
-                    if best is None or res.fun < best.fun:
-                        best = res
-            elif method == "basinhopping":
-                best = scipy.optimize.basinhopping(
-                    cost,
-                    np.random.rand(len(param_bound)) * param_bound,
-                    niter=50,
-                    stepsize=np.pi / 2,
-                    minimizer_kwargs={"jac": jacobian},
-                )
-            else:
-                best = min(
-                    [
-                        scipy.optimize.minimize(
-                            cost,
-                            np.random.rand(len(param_bound)) * param_bound,
-                            method="BFGS",
-                            jac=jacobian,
-                            options={"maxiter": 200},
-                        )
-                        for _ in range(20)
-                    ],
-                    key=lambda r: r.fun,
+        async def run(self, circuit, data):
+            if self.placement is None:
+                return
+            if len(self.placement) != circuit.num_qudits:
+                raise ValueError(
+                    "PAM initial placement length must match circuit width."
                 )
-            # print(best)
-            self.set_Cost_Function_Variant(3)
-            assert best is not None
-            allU = self.params_to_mat(best.x)
-        else:
-            allU = self.Umtx
-        return [
-            (N_Qubit_Decomposition_Guided_Tree.ceil_log2(rank), s)
-            for U in allU
-            for cut in cuts
-            for rank, s in (
-                N_Qubit_Decomposition_Guided_Tree.operator_schmidt_rank(
-                    U, self.qbit_num, cut, Fnorm, tol
-                ),
-            )
-        ]
-
-    def Run_Decomposition(self, pairs, finalizing=True):
-        """Run BFGS decomposition for CNOT structure ``pairs``; set ``self.err`` and return success vs tolerance."""
-        circ = self.get_circuit_from_pairs(pairs, finalizing)
-        self.set_Gate_Structure(circ)
-        self.set_Optimized_Parameters(
-            np.random.rand(self.get_Parameter_Num()) * (2 * np.pi)
-        )
-        super().Start_Decomposition()
-        if finalizing:
-            params = self.get_Optimized_Parameters()
-            self.err = self.Optimization_Problem(params)
-            return self.err < self.config.get("tolerance", 1e-8)
-
-    @staticmethod
-    def generate_insertions(curpath, topology, num_cnot):
-        """Yield CNOT insertion patterns: insert ``num_cnot`` topology pairs into sequence ``curpath``."""
-        import itertools
+            data.placement = list(self.placement)
 
-        n = len(curpath)
-        nslots = n + 1
-        for places in itertools.combinations_with_replacement(range(nslots), num_cnot):
-            for pairs in itertools.product(topology, repeat=num_cnot):
-                out = []
-                j = 0  # index into inserted pairs
-                for slot in range(nslots):
-                    while j < num_cnot and places[j] == slot:
-                        out.append(pairs[j])
-                        j += 1
-                    if slot < n:
-                        out.append(curpath[slot])
-                yield tuple(out)
-
-    def Start_Decomposition(self):
-        """Beam-style search over CNOT prefixes guided by OSR stats; collects solutions in ``self.all_solutions``."""
-        import heapq, itertools
-
-        self.all_solutions = []
-        self.err = 1.0
-        stop_first_solution = self.config.get("stop_first_solution", True)
-        cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(self.qbit_num))
-        # because we have U already conjugate transposed, must use prefix order
-        B = self.config.get("beam", None)  # 8*len(self.topology))
-        max_depth = self.config.get("tree_level_max", 14)
-        tol = 1e-3
-        Fnorm = np.sqrt(1 << self.qbit_num)
-        best = []
-        visited = set()
-        all_ranks = list(range(min(2, self.qbit_num - 1)))
-
-        def get_osr_stats(path, rank, use_softmax):
-            """Return ``(min_cnots, rank_kappa_metric, raw_osr_list)`` for prefix ``path``."""
-            h = self.OSR_with_local_alignment(
-                path,
-                cuts,
-                Fnorm,
-                tol=tol,
-                rank=rank,
-                use_softmax=use_softmax,
-                method="basin_hopping",
-            )
-            min_cnots = max((x[0] for x in h), default=0)
-            ranktot = sum(x[0] for x in h)
-            kappa = sum(sum(y * y for y in x[1][1:]) for x in h)
-            return min_cnots, ranktot + kappa, h
-
-        def add_to_heap(path, parent_stats):
-            """Push ``path`` onto search heap if within depth and OSR bounds improve on ``parent_stats``."""
-            if len(path) > max_depth:
-                return False
-            if path in visited:
-                return False
-            visited.add(path)
-            if self.qbit_num > 1:
-                min_cnots, rankkappa = min(
-                    get_osr_stats(path, rank, use_sm)[:2]
-                    for (rank, use_sm) in itertools.product(all_ranks, (False,))
-                )  # (False, True)
-            else:
-                min_cnots, rankkappa = 0, 0.0
-            if parent_stats is not None and (min_cnots, rankkappa) >= parent_stats:
-                return False
-            heapq.heappush(best, (min_cnots, rankkappa, path))
-            return True
-
-        add_to_heap((), None)
-        while best:
-            # print(best[0])
-            min_cnots, rankkappa, curpath = heapq.heappop(best)
-            if min_cnots == 0:
-                # print(path)
-                for i in range(10):
-                    if self.Run_Decomposition(curpath):
-                        self.all_solutions.append(
-                            (self.get_Circuit(), self.get_Optimized_Parameters())
-                        )
-                        if stop_first_solution:
-                            return
-                        break
-                    # print("Looping", h)
-            num_cnot = 1
-            while True:
-                any_added = False
-                for newpath in N_Qubit_Decomposition_Guided_Tree.generate_insertions(
-                    curpath, self.topology, num_cnot
-                ):
-                    if add_to_heap(newpath, (min_cnots, rankkappa)):
-                        any_added = True
-                if any_added:
-                    break
-                num_cnot += 1
-                if len(curpath) + num_cnot > max_depth:
-                    break
-        self.set_Gate_Structure(Circuit(self.qbit_num))
-        self.set_Optimized_Parameters(np.array([]))
-        # print("No decomposition found within the given CNOT limit.")
-
-    """
-    def Start_Decomposition(self):
-        self.all_solutions = []
-        self.err = 1.0
-        stop_first_solution = self.config.get("stop_first_solution", True)
-        cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(self.qbit_num))
-        if self.topology is None:
-            self.topology = [(i, j) for i in range(self.qbit_num) for j in range(i+1, self.qbit_num)]
-        pair_affects = {
-            pair: [i for i,A in enumerate(cuts) if (pair[0] in A) ^ (pair[1] in A)]
-            for pair in self.topology
-        }
-        #because we have U already conjugate transposed, must use prefix order
-        B = self.config.get('beam', None)#8*len(self.topology))
-        max_depth = self.config.get('tree_level_max', 14)
-        tol = 1e-3
-        Fnorm = np.sqrt(1<<self.qbit_num)
-        prior_level_info = None
-        for depth in range(max_depth+1):
-            remaining = max_depth - depth
-            visited, seq_pairs_of, seq_dir_of, res = N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS_level(self.qbit_num, self.topology, prior_level_info, use_gl=False)
-            nextprefixes = []
-            for path in set(tuple(x[1]) for x in res):
-                curh = None if len(path)==0 else prefixes[path[:-1]]
-                check_cuts = pair_affects[tuple(sorted(path[-1]))] if not curh is None else range(len(cuts))
-                #samples = [max(x[0] for x in self.OSR_with_local_alignment(path, cuts, Fnorm, tol=tol)) for _ in range(5)]
-                #if len(set(samples)) != 1: print(samples)
-                h = self.OSR_with_local_alignment(path, cuts, Fnorm, tol=tol, use_softmax=False, method="dual_annealing")
-                min_cnots = max((x[0] for x in h), default=0)
-                print(path, h, N_Qubit_Decomposition_Guided_Tree.avg_loss([x[1] for x in h]), remaining, min_cnots)
-                if min_cnots == 0:
-                    #print(path)
-                    for i in range(10):
-                        if self.Run_Decomposition(path):
-                            self.all_solutions.append((self.get_Circuit(), self.get_Optimized_Parameters()))
-                            if stop_first_solution: return
-                            break
-                        #print("Looping", h)
-                if min_cnots > remaining: continue
-                if not curh is None:
-                    #print(path, [(h[i], curh[i]) for i in check_cuts])
-                    #if any(h[i][0] > curh[i][0] for i in check_cuts): continue
-                    if max((x[0] for x in curh), default=0) < min_cnots: continue
-                nextprefixes.append((path, h))
-            nextprefixes.sort(key=lambda t: (max((x[0] for x in t[1]), default=0), sum(x[0] for x in t[1]), N_Qubit_Decomposition_Guided_Tree.avg_loss([x[1] for x in t[1]])))
-            prefixes = {x[0]: x[1] for x in nextprefixes[:B]}
-            prior_level_info = (visited, seq_pairs_of, seq_dir_of, list(x[0] for x in reversed(res) if tuple(x[1]) in prefixes))
-        self.set_Gate_Structure(Circuit(self.qbit_num))
-        self.set_Optimized_Parameters(np.array([]))
-        #print("No decomposition found within the given CNOT limit.")
-    """
+    from bqskit.passes import QuickPartitioner
+    squander    = SquanderSynthesisPass(squander_config=squander_config)
+    partitioner = SquanderILPPartitioner(block_size)
+    enable_pam_verification = bool(squander_config.get("enable_pam_verification", False))
+    num_layout_passes = int(squander_config.get("num_layout_passes", 3))
+    pam_initial_placement = squander_config.get("pam_initial_placement", None)
 
-    def get_Decomposition_Error(self):
-        """Last decomposition error (Frobenius / cost) from guided search or ``Run_Decomposition``."""
-        return self.err
-
-    @staticmethod
-    def compositions(total, parts):
-        """
-        All nonnegative integer tuples of length `parts` summing to `total`.
-        """
-        if parts == 1:
-            yield (total,)
-            return
-        for x in range(total + 1):
-            for rest in N_Qubit_Decomposition_Guided_Tree.compositions(
-                total - x, parts - 1
-            ):
-                yield (x,) + rest
-
-    @staticmethod
-    def solve_best_min_cnots(num_qubits, cuts, rank_kappa, topology, use_surplus=True):
-        """Minimize total CNOT count subject to per-cut edge coverage vs ``rank_kappa`` bounds; return best kappa."""
-        m = len(topology)
-        cut_to_edges = [
-            [i for i, z in enumerate(topology) if (z[0] in cut) != (z[1] in cut)]
-            for cut in cuts
-        ]
-        total = 0
-        best_kappa = None
-        while True:
-            for edge_counts in N_Qubit_Decomposition_Guided_Tree.compositions(total, m):
-                if all(
-                    sum(edge_counts[j] for j in cut_to_edge) >= cut_bound[0]
-                    for cut_to_edge, cut_bound in zip(cut_to_edges, rank_kappa)
-                ):
-                    new_kappa = 0.0
-                    for cut_to_edge, cut_bound in zip(cut_to_edges, rank_kappa):
-                        coverage = sum(edge_counts[j] for j in cut_to_edge)
-                        if use_surplus:
-                            new_kappa += cut_bound[1] * (coverage - cut_bound[0])
-                        else:
-                            new_kappa += cut_bound[1] * coverage
-                    best_kappa = (
-                        new_kappa if best_kappa is None else max(best_kappa, new_kappa)
-                    )
-            if best_kappa is not None:
-                break
-            total += 1
-        return total, best_kappa
-
-    @staticmethod
-    def solve_min_cnots(num_qubits, cuts, cut_bounds, topology):
-        """Smallest total CNOT budget such that each cut's crossing edges meet ``cut_bounds``."""
-        m = len(topology)
-        cut_to_edges = [
-            [i for i, z in enumerate(topology) if (z[0] in cut) != (z[1] in cut)]
-            for cut in cuts
-        ]
-        total = 0
-        while True:
-            for edge_counts in N_Qubit_Decomposition_Guided_Tree.compositions(total, m):
-                if all(
-                    sum(edge_counts[j] for j in cut_to_edge) >= cut_bound
-                    for cut_to_edge, cut_bound in zip(cut_to_edges, cut_bounds)
-                ):
-                    return total
-            total += 1
-
-    @staticmethod
-    def gen_all_min_cnots(
-        num_qbits, topology=None
-    ):  # OSR tells min CNOTs at most for 3 qubits 3, 4 qubits 6, 5 qubits 7
-        """Debug: print min CNOT solutions for all combinations of per-cut bounds (see ``solve_min_cnots``)."""
-        import itertools
+    pam_verify_passes = (
+        [PAMVerificationSequence(block_size)] if enable_pam_verification else []
+    )
 
-        cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(num_qbits))
-        min_cnot_bounds = [
-            2 * min(cut_size, num_qbits - cut_size)
-            for cut_size in (len(cut) for cut in cuts)
-        ]
-        if topology is None:
-            topology = [
-                (i, j) for i in range(num_qbits) for j in range(i + 1, num_qbits)
-            ]
-        for cnot_bounds in itertools.product(
-            *(range(bound + 1) for bound in min_cnot_bounds)
-        ):
-            # if tuple(sorted(cnot_bounds)) != cnot_bounds: continue
-            print(
-                cnot_bounds,
-                N_Qubit_Decomposition_Guided_Tree.solve_min_cnots(
-                    num_qbits, cuts, cnot_bounds, topology
-                ),
-            )
+    inner_passes = [
+        LogPass("Caching permutation-aware synthesis results."),
+        ExtractModelConnectivityPass(),
+        partitioner,
+        ForEachBlockPass(
+            EmbedAllPermutationsPass(
+                inner_synthesis=squander,
+                input_perm=True,
+                output_perm=False,
+                vary_topology=False,
+            ),
+        ),
+        LogPass("Preoptimizing with permutation-aware mapping."),
+        PAMRoutingPass(),
+        *pam_verify_passes,
+        UnfoldPass(),
+        RestoreModelConnectivityPass(),
+        LogPass("Recaching permutation-aware synthesis results."),
+        SubtopologySelectionPass(block_size),
+        QuickPartitioner(block_size),
+        ForEachBlockPass(
+            EmbedAllPermutationsPass(
+                inner_synthesis=squander,
+                input_perm=False,
+                output_perm=True,
+                vary_topology=True,
+            ),
+        ),
+        LogPass("Performing permutation-aware mapping."),
+        ApplyPlacement(),
+        SetPAMInitialPlacementPass(pam_initial_placement),
+        PAMLayoutPass(num_layout_passes),
+        PAMRoutingPass(0.1),
+        *pam_verify_passes,
+        ApplyPlacement(),
+        UnfoldPass(),
+    ]
+
+    return Workflow(
+        IfThenElsePass(
+            NotPredicate(WidthPredicate(2)),
+            inner_passes,
+        ),
+        name="SeqPAM Mapping",
+    )
 
 
-# N_Qubit_Decomposition_Guided_Tree.gen_all_min_cnots(3); assert False
-# N_Qubit_Decomposition_Guided_Tree.build_sequence(); assert False
-# print(len(list(N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS(3, [(0,1),(1,2),])))); assert False
 class qgd_Wide_Circuit_Optimization:
     """Optimize wide (many-qubit) circuits via partitioning and subcircuit decomposition.
 
     Supports multiple decomposition strategies, optional global recombination (ILP),
     and routing when the circuit does not match the target topology.
+
     """
 
     def __init__(self, config):
@@ -1312,6 +305,7 @@ def __init__(self, config):
             "TreeGuided",
             "qiskit",
             "bqskit",
+            "seqpam_PartAM",
         ]
         if not strategy in allowed_startegies:
             raise Exception(
@@ -1390,7 +384,8 @@ def ConstructCircuitFromPartitions(
     def DecomposePartition(
         Umtx: np.ndarray, config: dict, mini_topology=None, structure=None
     ) -> list[tuple[Circuit, np.ndarray]]:
-        """Decompose a unitary ``Umtx`` (e.g. from a partition) using ``config['strategy']``.
+        """
+        Decompose a unitary ``Umtx`` (e.g. from a partition) using ``config['strategy']``.
 
         Args:
             Umtx: Complex unitary matrix.
@@ -1399,11 +394,7 @@ def DecomposePartition(
             structure: Required gate structure when ``strategy == "Custom"``.
 
         Returns:
-            Normally ``[(circuit, parameters)]`` on success, or ``[]`` if the
-            decomposition error exceeds ``tolerance``. If
-            ``config.get('stop_first_solution')`` is false, returns
-            ``cDecompose.all_solutions`` from the underlying decomposer instead of
-            a single best pair.
+            List of ``(squander_circuit, parameters)`` on success, or ``[]`` if error exceeds tolerance.
         """
         strategy = config["strategy"]
         if strategy == "TreeSearch":
@@ -1421,10 +412,6 @@ def DecomposePartition(
                 level_limit_min=1,
                 topology=mini_topology,
             )
-        elif strategy == "TreeGuided":
-            cDecompose = N_Qubit_Decomposition_Guided_Tree(
-                Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology
-            )
         elif strategy == "Custom":
             cDecompose = N_Qubit_Decomposition_custom(
                 Umtx.conj().T, config=config, accelerator_num=0
@@ -1469,7 +456,7 @@ def DecomposePartition(
                 parameters = cDecompose.get_Optimized_Parameters()
                 err = cDecompose.Optimization_Problem(parameters)
                 it += 1
-            if err > tolerance or it != 0:
+            if (err > tolerance or it != 0) and config.get("verbosity", 0) >= 1:
                 print("Decomposition error: ", err, it)
         else:
             err = cDecompose.get_Decomposition_Error()
@@ -1486,15 +473,25 @@ def CompareAndPickCircuits(
         parameter_arrs: List[np.ndarray],
         metric: Callable[[Circuit], int] = CNOTGateCount,
     ) -> tuple[Circuit, np.ndarray]:
-        """Select the circuit with the lowest ``metric`` value.
+        """
+        Pick the best circuit according to a specific metric. Looks for the circuit
+        with the minimal metric value.
+
 
         Args:
-            circs: Candidate Squander circuits (same length as ``parameter_arrs``).
-            parameter_arrs: Parameter vectors aligned with ``circs``.
-            metric: Scalar cost functional; lower is better. Defaults to ``CNOTGateCount``.
 
-        Returns:
-            ``(best_circuit, best_parameters)`` for the minimizing index.
+            circs ( List[Circuit] ) A list of Squander circuits to be compared
+
+            parameter_arrs ( List[np.ndarray] ) A list of parameter arrays associated with the Squander circuits
+
+            metric (optional) The metric function to decide which input circuit is better.
+
+
+        Return:
+
+            Returns the chosen circuit and the corresponding parameter array
+
+
         """
 
         if not isinstance(circs, list):
@@ -1521,10 +518,8 @@ def PartitionDecompositionProcess(
         config: dict,
         structure=None,
     ) -> Tuple[Circuit, np.ndarray]:
-        """Decompose one partition subcircuit (multiprocessing-safe entry point).
-
-        For ``TreeGuided`` on large registers, may recursively partition and
-        enumerate combinations before returning remapped results.
+        """
+        Worker-friendly entry: decompose a single partition subcircuit via ``DecomposePartition``.
 
         Args:
             subcircuit: Subcircuit acting on a subset of the wide register.
@@ -1533,8 +528,7 @@ def PartitionDecompositionProcess(
             structure: Optional fixed gate structure when ``strategy == "Custom"``.
 
         Returns:
-            Tuple of ``(decomposed_circuit, decomposed_parameters)`` pairs, each
-            remapped back to the original qubit indices of ``subcircuit``.
+            List of ``(Circuit, parameters)`` pairs (or empty list on failure), remapped to the original register.
         """
 
         qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
@@ -1553,116 +547,16 @@ def PartitionDecompositionProcess(
         # remap the subcircuit to a smaller qubit register
         remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num)
 
-        if (
-            qbit_num > 3
-            and structure is None
-            and config.get("strategy", "") == "TreeGuided"
-        ):
-            circo = Circuit(qbit_num)
-            for gate in remapped_subcircuit.get_Gates():
-                circo.add_Gate(gate)
-            remapped_subcircuit = circo
-            partitioned_circuit, params, recombine_info, _ = (
-                qgd_Wide_Circuit_Optimization.make_all_partition_circuit(
-                    remapped_subcircuit, subcircuit_parameters, 3
-                )
-            )
-            optimized_circuits = []
-            subcircs = partitioned_circuit.get_Gates()
-            # first find the optimal CNOT decomposition
-            for innercirc in subcircs:
-                start_idx = innercirc.get_Parameter_Start_Index()
-                innercirc_parameters = params[
-                    start_idx : start_idx + innercirc.get_Parameter_Num()
-                ]
-                callback_fnc = (
-                    lambda x: qgd_Wide_Circuit_Optimization.CompareAndPickCircuits(
-                        [innercirc, *(z[0] for z in x)],
-                        [innercirc_parameters, *(z[1] for z in x)],
-                    )
-                )
-                optimized_circuits.append(
-                    callback_fnc(
-                        qgd_Wide_Circuit_Optimization.PartitionDecompositionProcess(
-                            innercirc,
-                            innercirc_parameters,
-                            {
-                                **config,
-                                "stop_first_solution": True,
-                                "tree_level_max": max(
-                                    0, CNOTGateCount(subcircuit, 0) - 1
-                                ),
-                            },
-                            structure=None,
-                        )
-                    )
-                )
-            parts, struct_idxs = (
-                qgd_Wide_Circuit_Optimization.recombine_all_partition_circuit(
-                    remapped_subcircuit,
-                    [x[0] for x in optimized_circuits],
-                    params,
-                    recombine_info,
-                )
-            )
-            # enumerate all solutions for each subcircuit in the optimal
-            all_sol_for_idx = []
-            for idx in struct_idxs:
-                innercirc = subcircs[idx]
-                start_idx = innercirc.get_Parameter_Start_Index()
-                innercirc_parameters = params[
-                    start_idx : start_idx + innercirc.get_Parameter_Num()
-                ]
-                callback_fnc = lambda x: x + [(innercirc, innercirc_parameters)]
-                all_sol_for_idx.append(
-                    callback_fnc(
-                        qgd_Wide_Circuit_Optimization.PartitionDecompositionProcess(
-                            innercirc,
-                            innercirc_parameters,
-                            {
-                                **config,
-                                "stop_first_solution": False,
-                                "tree_level_max": max(0, CNOTGateCount(subcircuit, 0)),
-                            },
-                            structure=None,
-                        )
-                    )
-                )
-            all_decomposed = []
-            import itertools
+        if not structure is None:
+            structure = structure.Remap_Qbits(qbit_map, qbit_num)
 
-            opt = qgd_Wide_Circuit_Optimization({**config, "max_partition_size": 3})
-            if np.prod([len(x) for x in all_sol_for_idx]) > 32:
-                import random
+        # get the unitary representing the circuit
+        unitary = remapped_subcircuit.get_Matrix(subcircuit_parameters)
 
-                trycombs = [
-                    [random.choice(x) for x in all_sol_for_idx] for _ in range(32)
-                ]
-            else:
-                trycombs = itertools.product(*all_sol_for_idx)
-            for combination in trycombs:
-                structures = [
-                    qgd_Wide_Circuit_Optimization.copy_circuit_structure(x[0])
-                    for x in combination
-                ]
-                optcirc, optparams = opt._OptimizeWideCircuit(
-                    remapped_subcircuit, subcircuit_parameters, False, parts, structures
-                )
-                reoptcirc, reoptparams = opt._OptimizeWideCircuit(
-                    optcirc.get_Flat_Circuit(), optparams
-                )
-                all_decomposed.append((reoptcirc.get_Flat_Circuit(), reoptparams))
-        else:
-            if not structure is None:
-                structure = structure.Remap_Qbits(qbit_map, qbit_num)
-
-            # get the unitary representing the circuit
-            unitary = remapped_subcircuit.get_Matrix(subcircuit_parameters)
-
-            # decompose a small unitary into a new circuit
-            all_decomposed = qgd_Wide_Circuit_Optimization.DecomposePartition(
-                unitary, config, mini_topology, structure=structure
-            )
+        # decompose a small unitary into a new circuit
+        all_decomposed = qgd_Wide_Circuit_Optimization.DecomposePartition(
+            unitary, config, mini_topology, structure=structure
+        )
         # create inverse qbit map:
         inverse_qbit_map = {}
         for key, value in qbit_map.items():
@@ -1690,15 +584,7 @@ def PartitionDecompositionProcess(
 
     @staticmethod
     def build_partition_topo_deps(allparts):
-        """Order partition gate-sets by dependencies and build a reverse-dependency map.
-
-        Args:
-            allparts: List of sets of gate indices, one per partition.
-
-        Returns:
-            ``(ordered_parts, rg_new)`` where ``ordered_parts`` lists partitions in
-            topological order and ``rg_new`` maps each new index to predecessors.
-        """
+        """Topological sort of partition gate-sets; returns ordered partitions and reverse-dependency map."""
         gate_to_parts = {}
         for i, part in enumerate(allparts):
             for gate in part:
@@ -1812,15 +698,7 @@ def make_all_partition_circuit(circ, orig_parameters, max_partition_size):
 
     @staticmethod
     def strip_single_qubit_head_tails(circ, params):
-        """Drop single-qubit gates that sit only at the head or tail of the dependency DAG.
-
-        Args:
-            circ: Input circuit.
-            params: Flat parameter array for ``circ``.
-
-        Returns:
-            ``(new_circuit, new_params)`` with head/tail single-qubit gates removed.
-        """
+        """Remove single-qubit gates that are purely at the head/tail of the dependency graph."""
         gate_dict, g, rg, gate_to_qubit, _ = build_dependency(circ)
         newcirc = Circuit(circ.get_Qbit_Num())
         new_params = []
@@ -1839,15 +717,7 @@ def strip_single_qubit_head_tails(circ, params):
 
     @staticmethod
     def get_fingerprint(circ, params):
-        """Hashable signature of gate layout and parameters (for decomposition caching).
-
-        Args:
-            circ: Squander circuit.
-            params: Parameter array associated with ``circ``.
-
-        Returns:
-            Tuple usable as a dict key for memoizing decompositions.
-        """
+        """Hashable signature of gate types, qubits, and parameters (for decomposition caching)."""
         return tuple(
             (gate.get_Name(), tuple(gate.get_Involved_Qbits()))
             for gate in circ.get_Gates()
@@ -1857,16 +727,10 @@ def get_fingerprint(circ, params):
     def recombine_all_partition_circuit(
         circ, optimized_subcircuits, optimized_parameter_list, recombine_info
     ):
-        """Reorder optimized partitions to respect global gate dependencies.
-
-        Args:
-            circ: Original flat circuit (for topological ordering context).
-            optimized_subcircuits: One optimized subcircuit per partition slot.
-            optimized_parameter_list: Parameter lists aligned with ``optimized_subcircuits``.
-            recombine_info: Tuple from ``make_all_partition_circuit`` (ILP metadata).
+        """Reorder partition results to satisfy global dependencies.
 
-        Returns:
-            ``(reordered_circuits, reordered_parameter_lists)`` in execution order.
+        Uses ILP-based ordering and a final topological sort, then returns
+        reordered subcircuits and parameter arrays aligned by structure index.
         """
         from squander.partitioning.ilp import (
             topo_sort_partitions,
@@ -1918,13 +782,15 @@ def OptimizeWideCircuit(
             circ, self.config["topology"]
         ):
 
-            print("fixing topology in the circuit")
+            if self.config["verbosity"] >= 1:
+                print("fixing topology in the circuit")
             topo = self.config["topology"]
             self.config["topology"] = None
             strat = self.config["strategy"]
             self.config["strategy"] = self.config["pre-opt-strategy"]
 
-            print("Optimizing circuit with all-to-all (a2a) connectivity")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with all-to-all (a2a) connectivity")
             circ, parameters = self.OptimizeWideCircuit(circ, parameters)
             self.config["all_to_all_optimization_time"] = self.config[
                 "optimization_time"
@@ -1935,17 +801,20 @@ def OptimizeWideCircuit(
             self.config["topology"] = topo
             start_time = time.time()
 
-            print("Routing circuit to fix the topology")
+            if self.config["verbosity"] >= 1:
+                print("Routing circuit to fix the topology")
             circ, parameters = self.route_circuit(circ, parameters)
             self.config["routing_time"] = time.time() - start_time
             self.config["routed_circuit"] = circ
             self.config["routed_parameters"] = parameters
         else:
-            print("No additional routing is needed on the circuit")
+            if self.config["verbosity"] >= 1:
+                print("No additional routing is needed on the circuit")
 
         start_time = time.time()
         if self.config["strategy"] == "bqskit":
-            print("Optimizing circuit with BQSkit")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with BQSkit")
             from squander import Qiskit_IO
             from bqskit import compile
 
@@ -1989,7 +858,7 @@ def OptimizeWideCircuit(
                 LogErrorPass(),
             ]
 
-            with Compiler() as compiler:
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
                 routed_bqskit_circ, pass_data = compiler.compile(
                     bqskit_circ, compilation_workflow, True
                 )
@@ -2009,12 +878,61 @@ def OptimizeWideCircuit(
             qgd_Wide_Circuit_Optimization.check_valid_routing(
                 newcirc, self.config["topology"]
             )
-            print("OptimizeWideCircuit::check_compare_circuits")
+            if self.config["verbosity"] >= 2:
+                print("OptimizeWideCircuit::check_compare_circuits")
+            self.check_compare_circuits(circ, parameters, newcirc, newparameters)
+            circ, parameters = newcirc, newparameters
+
+        elif self.config["strategy"] == "seqpam_PartAM":
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with BQSKit SeqPAM + Squander (PartAM ILP weights)")
+            from squander import Qiskit_IO
+            from bqskit.compiler import Compiler
+            from bqskit.compiler.machine import MachineModel
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from bqskit.passes import SetModelPass
+            from qiskit import qasm2, QuantumCircuit
+
+            strategy_map = {"TreeSearch": "Tree_search", "TabuSearch": "Tabu_search"}
+            squander_config = {
+                "strategy": strategy_map.get(self.config.get("strategy", "TreeSearch"), "Tree_search"),
+                "optimization_tolerance": self.config.get("tolerance", 1e-8),
+                "verbosity": self.config.get("verbosity", 0),
+                "optimizer_engine": self.config.get("optimizer_engine", "BFGS"),
+                "Cost_Function_Variant": self.config.get("Cost_Function_Variant", 3),
+                "size_density_weight": True,
+                "sparse_penalty": self.config.get("sparse_penalty", 3.0),
+                "max_partition_size": self.max_partition_size,
+            }
+            block_size = self.max_partition_size
+
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, parameters)
+            bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
+
+            workflow = generate_squander_seqpam(squander_config, block_size)
+
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
+                routed_bqskit_circ = compiler.compile(
+                    bqskit_circ, [SetModelPass(model), workflow]
+                )
+
+            circuit_qiskit = QuantumCircuit.from_qasm_str(
+                OPENQASM2Language().encode(routed_bqskit_circ)
+            )
+            newcirc, newparameters = Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit)
+
+            qgd_Wide_Circuit_Optimization.check_valid_routing(
+                newcirc, self.config["topology"]
+            )
+            if self.config["verbosity"] >= 2:
+                print("OptimizeWideCircuit::check_compare_circuits")
             self.check_compare_circuits(circ, parameters, newcirc, newparameters)
             circ, parameters = newcirc, newparameters
 
         elif self.config["strategy"] == "qiskit":
-            print("Optimizing circuit with Qiskit")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with Qiskit")
             from squander import Qiskit_IO
             from qiskit import transpile
             from qiskit.transpiler import CouplingMap
@@ -2045,18 +963,16 @@ def OptimizeWideCircuit(
             qgd_Wide_Circuit_Optimization.check_valid_routing(
                 newcirc, self.config["topology"]
             )
-            print("OptimizeWideCircuit::check_compare_circuits")
+            if self.config["verbosity"] >= 2:
+                print("OptimizeWideCircuit::check_compare_circuits")
             self.check_compare_circuits(circ, parameters, newcirc, newparameters)
             circ, parameters = newcirc, newparameters
         else:
 
-            print("Optimizing circuit with Squander")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with Squander")
             part_size_start = self.max_partition_size
-            part_size_end = self.max_partition_size
-            if self.config.get("use_osr", False) or self.config.get(
-                "use_graph_search", False
-            ):
-                part_size_end = min(4, circ.get_Qbit_Num())
+            part_size_end = self.config.get("part_size_end",self.max_partition_size)
             count = CNOTGateCount(circ, 0)
             fingerprint_dict = {}
             for max_part_size in range(part_size_start, part_size_end + 1):
@@ -2126,7 +1042,7 @@ def InnerOptimizeWideCircuit(
 
         in_parent = parent_process() is not None
 
-        if not in_parent:
+        if not in_parent and self.config["verbosity"] >= 1:
             print(len(subcircuits), "partitions found to optimize")
 
         # the list of optimized subcircuits
@@ -2147,7 +1063,7 @@ def process_result(partition_idx):
             if optimized_subcircuits[partition_idx] is not None:
                 return
             subcircuit = subcircuits[partition_idx]
-            # callback on the master process to compare the decomposed and original subcircuit
+            # callback function done on the master process to compare the new decomposed and the original subcircuit
             start_idx = subcircuit.get_Parameter_Start_Index()
             subcircuit_parameters = parameters[
                 start_idx : start_idx + subcircuit.get_Parameter_Num()
@@ -2173,7 +1089,7 @@ def process_result(partition_idx):
                     else async_results[partition_idx].get(timeout=None)
                 )
 
-                if subcircuit != new_subcircuit:
+                if subcircuit != new_subcircuit and self.config["verbosity"] >= 2:
                     print(
                         "original subcircuit:    ",
                         subcircuit.get_Gate_Nums(),
@@ -2197,14 +1113,16 @@ def process_result(partition_idx):
                             trim_subcirc, trim_parameters
                         )
                     ] = (trim_subcirc, trim_parameters)
-            if total_opt[0] % 100 == 99:
+            if total_opt[0] % 100 == 99 and self.config["verbosity"] >= 1:
                 print(total_opt[0] + 1, "partitions optimized")
             total_opt[0] += 1
             optimized_subcircuits[partition_idx] = new_subcircuit
             optimized_parameter_list[partition_idx] = new_parameters
 
         with (
-            contextlib.nullcontext() if in_parent else Pool(processes=mp.cpu_count())
+            contextlib.nullcontext()
+            if in_parent
+            else Pool(processes=len(os.sched_getaffinity(0)) if hasattr(os, 'sched_getaffinity') else mp.cpu_count())
         ) as pool:
             remaining = list(range(len(subcircuits)))
             while remaining:
@@ -2272,9 +1190,10 @@ def process_result(partition_idx):
                         (subcircuit, subcircuit_parameters, config, None),
                     )
                     # print("Dispatching", subcircuit.get_Involved_Qubits(), "qubits with", CNOGateCount(subcircuit, 0), "CNOT gates, partition ", partition_idx)
-                    assert pool is not None
                     async_results[partition_idx] = (
-                        fargs if in_parent else pool.apply_async(*fargs)
+                        fargs
+                        if in_parent
+                        else pool.apply_async(*fargs)
                     )
                 if len(remaining) == len(still_remaining):
                     time.sleep(0.1)
@@ -2283,7 +1202,7 @@ def process_result(partition_idx):
             for partition_idx in range(len(subcircuits)):
                 process_result(partition_idx)
 
-        # construct the wide circuit from the optimized subcircuits
+        # construct the wide circuit from the optimized subcircuits
         if global_min:
             optimized_subcircuits, optimized_parameter_list = (
                 qgd_Wide_Circuit_Optimization.recombine_all_partition_circuit(
@@ -2305,14 +1224,15 @@ def process_result(partition_idx):
             cast(List[List[np.ndarray]], optimized_parameter_list),
         )
 
-        if not in_parent:
+        if not in_parent and self.config["verbosity"] >= 1:
             print("original circuit:    ", circ.get_Gate_Nums())
             print("reoptimized circuit: ", wide_circuit.get_Gate_Nums())
 
         qgd_Wide_Circuit_Optimization.check_valid_routing(
             wide_circuit, self.config["topology"]
         )
-        print("InnerOptimizeWideCircuit: check_compare_circuits")
+        if self.config["verbosity"] >= 2:
+            print("InnerOptimizeWideCircuit: check_compare_circuits")
         self.check_compare_circuits(
             circ, orig_parameters, wide_circuit, wide_parameters
         )
@@ -2354,16 +1274,15 @@ def lattice_topology(x_qbits, y_qbits):
 
     @staticmethod
     def heavy_hexagonal_topology(rows, cols):
-        """Build a finite heavy-hex coupling list (honeycomb with subdivided edges).
+        """
+        Finite heavy-hex patch.
 
-        Args:
-            rows: Number of rows in the brick-wall honeycomb patch.
-            cols: Number of columns in the patch.
+        rows, cols describe the underlying honeycomb 'brick-wall' patch.
+        The first rows*cols qubits are the original honeycomb vertices.
+        Every original edge gets one inserted degree-2 qubit.
 
         Returns:
-            List of undirected edges ``(u, v)``. The first ``rows * cols`` qubit
-            indices are honeycomb vertices; each original edge introduces one
-            additional degree-2 qubit on the subdivided link.
+            list[(u, v)]  undirected couplers
         """
 
         def vid(r, c):
@@ -2446,26 +1365,9 @@ def check_valid_routing(wide_circuit, topo):
         ), "Final circuit contains gates that do not respect the routing constraints."
 
     def check_compare_circuits(
-        self,
-        circ,
-        orig_parameters,
-        wide_circuit,
-        wide_parameters,
-        routing=False,
-        forced_test=False,
+        self, circ, orig_parameters, wide_circuit, wide_parameters, routing=False, forced_test=False,
     ):
-        """Optionally verify equivalence of ``circ`` and ``wide_circuit`` via ``CompareCircuits``.
-
-        Args:
-            circ: Original circuit.
-            orig_parameters: Parameters for ``circ``.
-            wide_circuit: Optimized or routed circuit.
-            wide_parameters: Parameters for ``wide_circuit``.
-            routing: If true and initial/final mappings exist in ``self.config``,
-                pass them to ``CompareCircuits`` for layout-aware comparison.
-            forced_test: If true, run the comparison even when ``test_final_circuit``
-                is false in config.
-        """
+        """If ``test_final_circuit``, numerically compare unitaries (optional initial/final layout for routing)."""
         if self.config["test_final_circuit"] or forced_test:
             if (
                 routing
@@ -2485,23 +1387,59 @@ def check_compare_circuits(
                 CompareCircuits(circ, orig_parameters, wide_circuit, wide_parameters)
 
     def route_circuit(self, circ: Circuit, orig_parameters: np.ndarray):
-        """Map ``circ`` onto ``self.config['topology']`` using the configured router.
+        """Map ``circ`` onto ``self.config['topology']`` using BQSKit SeQPAM, Qiskit SABRE, or Squander SABRE."""
+        strategy = self.config.get("routing-strategy", "seqpam-ilp")
+
+        if strategy == "seqpam-ilp":
+            from squander import Qiskit_IO
+            from squander.decomposition.qgd_Wide_Circuit_Optimization import generate_squander_seqpam
+            from bqskit.compiler import Compiler
+            from bqskit.compiler.machine import MachineModel
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from bqskit.passes import SetModelPass
+            from qiskit import qasm2, QuantumCircuit
 
-        The strategy is ``self.config['routing-strategy']``, e.g. ``seqpam-ilp``,
-        ``seqpam-quick``, ``bqskit-sabre``, ``light-sabre`` (Qiskit), or ``sabre``
-        (Squander). Writes ``initial_mapping`` and ``final_mapping`` into
-        ``self.config`` when the backend provides them.
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
+            bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
 
-        Args:
-            circ: Circuit before routing.
-            orig_parameters: Parameter vector for ``circ``.
+            strategy_map = {"TreeSearch": "Tree_search", "TabuSearch": "Tabu_search"}
+            squander_config = {
+                "strategy": strategy_map.get(self.config.get("strategy", "TreeSearch"), "Tree_search"),
+                "optimization_tolerance": self.config.get("tolerance", 1e-8),
+                "verbosity": self.config.get("verbosity", 0),
+                "optimizer_engine": self.config.get("optimizer_engine", "BFGS"),
+                "Cost_Function_Variant": self.config.get("Cost_Function_Variant", 3),
+                "size_density_weight": True,
+                "sparse_penalty": self.config.get("sparse_penalty", 3.0),
+                "max_partition_size": self.max_partition_size,
+            }
+            block_size = self.max_partition_size
 
-        Returns:
-            ``(routed_circuit, routed_parameters)`` laid out for ``self.config['topology']``.
-        """
-        strategy = self.config.get("routing-strategy", "seqpam-ilp")
+            workflow = generate_squander_seqpam(squander_config, block_size)
+
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
+                routed_bqskit_circ, pass_data = compiler.compile(
+                    bqskit_circ, [SetModelPass(model), workflow], True
+                )
 
-        if strategy in ("seqpam-ilp", "seqpam-quick", "bqskit-sabre"):
+            circuit_qiskit_routed = QuantumCircuit.from_qasm_str(
+                OPENQASM2Language().encode(routed_bqskit_circ)
+            )
+            Squander_remapped_circuit, parameters_remapped_circuit = (
+                Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_routed)
+            )
+            Squander_remapped_circuit = Squander_remapped_circuit.Remap_Qbits(
+                {i: j for i, j in enumerate(pass_data.placement)}
+            )
+            self.config["initial_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.initial_mapping
+            )
+            self.config["final_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.final_mapping
+            )
+
+        elif strategy in ("seqpam-quick", "bqskit-sabre"):
             from squander import Qiskit_IO
             from bqskit import Circuit as BQSKitCircuit, compile
             from bqskit.compiler import Compiler
@@ -2564,14 +1502,6 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             mainflow = build_seqpam_mapping_optimization_workflow(
                 block_size=self.config["max_partition_size"]
             )
-            if strategy == "seqpam-ilp":
-                for curpass in mainflow._passes:
-                    if isinstance(curpass, IfThenElsePass):
-                        for i in range(len(curpass.on_true._passes)):
-                            if isinstance(curpass.on_true._passes[i], QuickPartitioner):
-                                curpass.on_true._passes[i] = SquanderPartitioner(
-                                    self.config["max_partition_size"]
-                                )
 
             routing_workflow = [
                 SetModelPass(model),  # attach hardware model to circuit
@@ -2585,7 +1515,7 @@ async def run(self, circuit: BQSKitCircuit, data=None):
                 ),  # SABRE-style routing
             ]
 
-            with Compiler() as compiler:
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
                 routed_bqskit_circ, pass_data = compiler.compile(
                     bqskit_circ, routing_workflow, True
                 )
@@ -2607,6 +1537,53 @@ async def run(self, circuit: BQSKitCircuit, data=None):
                 pass_data.placement[x] for x in pass_data.final_mapping
             )
 
+        elif strategy == "seqpam_partam":
+            from squander import Qiskit_IO
+            from squander.decomposition.qgd_Wide_Circuit_Optimization import generate_squander_seqpam
+            from bqskit.compiler import Compiler
+            from bqskit.compiler.machine import MachineModel
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from bqskit.passes import SetModelPass
+            from qiskit import qasm2, QuantumCircuit
+
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
+            bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
+
+            squander_config = {
+                'strategy': 'Tree_search',
+                'optimization_tolerance': self.config.get('tolerance', 1e-8),
+                'verbosity': self.config.get('verbosity', 0),
+                'optimizer_engine': self.config.get('optimizer_engine', 'BFGS'),
+                'size_density_weight': True,
+                'sparse_penalty': self.config.get('sparse_penalty', 3.0),
+                'max_partition_size': self.max_partition_size,
+                'use_osr':0,
+                'use_graph_search':0,
+            }
+            workflow = generate_squander_seqpam(squander_config, self.max_partition_size)
+
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
+                routed_bqskit_circ, pass_data = compiler.compile(
+                    bqskit_circ, [SetModelPass(model), workflow], True
+                )
+
+            circuit_qiskit_routed = QuantumCircuit.from_qasm_str(
+                OPENQASM2Language().encode(routed_bqskit_circ)
+            )
+            Squander_remapped_circuit, parameters_remapped_circuit = (
+                Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_routed)
+            )
+            Squander_remapped_circuit = Squander_remapped_circuit.Remap_Qbits(
+                {i: j for i, j in enumerate(pass_data.placement)}
+            )
+            self.config["initial_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.initial_mapping
+            )
+            self.config["final_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.final_mapping
+            )
+
         elif strategy == "light-sabre":
             from squander import Qiskit_IO
             from qiskit import transpile
@@ -2674,7 +1651,8 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             Squander_remapped_circuit, self.config["topology"]
         )
 
-        print("cheking circuit after routing")
+        if self.config["verbosity"] >= 2:
+            print("cheking circuit after routing")
         self.check_compare_circuits(
             circ,
             orig_parameters,
diff --git a/squander/gates/gates_Wrapper.cpp b/squander/gates/gates_Wrapper.cpp
index debfa4a57..58b2a4bb7 100644
--- a/squander/gates/gates_Wrapper.cpp
+++ b/squander/gates/gates_Wrapper.cpp
@@ -61,6 +61,7 @@ along with this program.  If not, see http://www.gnu.org/licenses/.
 #include "SWAP.h"
 #include "CSWAP.h"
 #include "numpy_interface.h"
+#include "Permutation.h"
 #include "RXX.h"
 #include "RYY.h"
 #include "RZZ.h"
@@ -79,7 +80,6 @@ typedef struct {
 
 
 
-
 template<typename GateT>
 Gate* create_gate( int qbit_num, int target_qbit ) {
     GateT* gate = new GateT( qbit_num, target_qbit );
@@ -126,6 +126,11 @@ Gate* create_multi_target_controlled_gate( int qbit_num, const std::vector<int>&
 }
 
 
+// Heap-allocate a Permutation gate and hand it back through the generic Gate* interface.
+Gate* create_permutation_gate( int qbit_num, const std::vector<int>& pattern ) {
+    return static_cast<Gate*>( new Permutation( qbit_num, pattern ) );
+}
+
 
 /**
 @brief Method called when a python instance of the class  Gate_Wrapper is destroyed
@@ -143,6 +148,7 @@ static void
 }
 
 
+
 /**
 @brief Method called when a python instance of the class  qgd_CH_Wrapper is allocated
 @param type A pointer pointing to a structure describing the type of the class  qgd_CH_Wrapper.
@@ -487,6 +493,112 @@ static PyObject *
 }
 
 
+/**
+@brief Method called when a python instance of the Permutation gate wrapper is allocated
+@param type A pointer pointing to a structure describing the type of the wrapper class.
+@param args Arguments qbit_num (int) and pattern (list or tuple of int); also accepted as keywords via kwds.
+@return The new Gate_Wrapper instance, or NULL with a Python exception set.
+*/
+template<typename GateT>
+static PyObject *
+ permutation_gate_Wrapper_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {(char*)"qbit_num", (char*)"pattern", NULL};
+    int qbit_num = -1;
+    PyObject* pattern_py = NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO", kwlist, &qbit_num, &pattern_py)) {
+        PyErr_SetString(PyExc_Exception, "Unable to parse arguments");
+        return NULL;
+    }
+
+    if (qbit_num == -1 || pattern_py == NULL) {
+        PyErr_SetString(PyExc_ValueError, "qbit_num and pattern must be provided!");
+        return NULL;
+    }
+
+    // A tuple is copied into a temporary list owned by this function; a list is only
+    // borrowed from the caller, so owned_list stays NULL and Py_XDECREF is a no-op.
+    PyObject* pattern_list = NULL;
+    PyObject* owned_list = NULL;
+    if (PyTuple_Check(pattern_py)) {
+        owned_list = PySequence_List(pattern_py);
+        if (owned_list == NULL) {
+            PyErr_SetString(PyExc_TypeError, "Failed to convert tuple to list");
+            return NULL;
+        }
+        pattern_list = owned_list;
+    } else if (PyList_Check(pattern_py)) {
+        pattern_list = pattern_py;
+    } else {
+        PyErr_SetString(PyExc_TypeError, "pattern must be a list or tuple!");
+        return NULL;
+    }
+
+    std::vector<int> pattern;
+    Py_ssize_t pattern_size = PyList_Size(pattern_list);
+
+    if (pattern_size != qbit_num) {
+        Py_XDECREF(owned_list);
+        std::string err = "Pattern size " + std::to_string(pattern_size) +
+                         " does not match qubit number " + std::to_string(qbit_num);
+        PyErr_SetString(PyExc_ValueError, err.c_str());
+        return NULL;
+    }
+
+    // Track which values we've seen to validate it's a permutation
+    std::vector<bool> seen(qbit_num, false);
+
+    for (Py_ssize_t i = 0; i < pattern_size; i++) {
+        PyObject* item = PyList_GetItem(pattern_list, i);
+        if (!PyLong_Check(item)) {
+            Py_XDECREF(owned_list);
+            PyErr_SetString(PyExc_TypeError, "pattern must contain integers!");
+            return NULL;
+        }
+        // Keep the full C long: narrowing to int first could wrap a huge index into range.
+        long qbit = PyLong_AsLong(item);
+        if (qbit == -1 && PyErr_Occurred()) {
+            Py_XDECREF(owned_list);
+            return NULL;  // OverflowError already set by PyLong_AsLong
+        }
+        if (qbit < 0 || qbit >= qbit_num) {
+            Py_XDECREF(owned_list);
+            std::string err = "Pattern qubit index " + std::to_string(qbit) +
+                             " out of range [0, " + std::to_string(qbit_num - 1) + "]";
+            PyErr_SetString(PyExc_ValueError, err.c_str());
+            return NULL;
+        }
+        if (seen[qbit]) {
+            Py_XDECREF(owned_list);
+            std::string err = "Pattern contains duplicate value " + std::to_string(qbit);
+            PyErr_SetString(PyExc_ValueError, err.c_str());
+            return NULL;
+        }
+        seen[qbit] = true;
+        pattern.push_back(static_cast<int>(qbit));
+    }
+
+    // Release the temporary list (no-op when the caller's list was borrowed)
+    Py_XDECREF(owned_list);
+
+    Gate_Wrapper *self = (Gate_Wrapper *) type->tp_alloc(type, 0);
+    if (self != NULL) {
+        try {
+            self->gate = create_permutation_gate(qbit_num, pattern);
+        } catch (const std::string& e) {
+            Py_DECREF(self);  // do not leak the wrapper when construction fails
+            PyErr_SetString(PyExc_ValueError, e.c_str());
+            return NULL;
+        } catch (const std::exception& e) {
+            Py_DECREF(self);
+            PyErr_SetString(PyExc_ValueError, e.what());
+            return NULL;
+        }
+    }
+
+    return (PyObject *) self;
+}
 /**
 @brief Method called when a python instance of a non-controlled gate class is initialized
 @param self A pointer pointing to an instance of the class  Gate_Wrapper.
@@ -672,7 +784,6 @@ Gate_Wrapper_get_Matrix( Gate_Wrapper *self, PyObject *args, PyObject *kwds ) {
         }
 }
 
-
 /**
 @brief Call to apply the gate operation from the right side on an input state or matrix
 */
@@ -2428,8 +2539,139 @@ Gate_Wrapper_getstate( Gate_Wrapper *self ) {
 }
 
 
+// Return the pattern of the wrapped Permutation gate as a new Python list of ints.
+// Returns NULL with a Python exception set if the gate is not a Permutation or on failure.
+static PyObject * Gate_Wrapper_get_Pattern( Gate_Wrapper *self ) {
+    Permutation* perm_gate = dynamic_cast<Permutation*>(self->gate);
+    if (perm_gate == nullptr) {
+        PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate");
+        return NULL;
+    }
+    std::vector<int> pattern;
+    try {
+        pattern = perm_gate->get_pattern();
+    } catch (std::string err) {
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    } catch(...) {
+        PyErr_SetString(PyExc_Exception, "Invalid pointer to gate class");
+        return NULL;
+    }
 
+    PyObject* pattern_py = PyList_New(pattern.size());
+    for (size_t i = 0; pattern_py != NULL && i < pattern.size(); i++) {
+        PyObject* item = PyLong_FromLong(pattern[i]);
+        if (item == NULL) { Py_DECREF(pattern_py); return NULL; }
+        PyList_SetItem(pattern_py, i, item);  // steals the reference to item
+    }
+    return pattern_py;
+}
 
+/**
+@brief Call to replace the pattern of a Permutation gate
+@param self A pointer pointing to an instance of the class Gate_Wrapper.
+@param args Positional arguments: pattern (list or tuple of int); it must contain each
+of the qubit indices 0 .. qbit_num-1 exactly once.
+@return Python integer 0 on success, or NULL with a Python exception set.
+*/
+static PyObject * Gate_Wrapper_set_Pattern( Gate_Wrapper *self, PyObject *args ) {
+    PyObject* pattern_py = NULL;
+    if (!PyArg_ParseTuple(args, "O", &pattern_py)) {
+        PyErr_SetString(PyExc_Exception, "Unable to parse arguments");
+        return NULL;
+    }
+
+    // A tuple is copied into a temporary list owned by this function; a list is only
+    // borrowed from the caller, so owned_list stays NULL and Py_XDECREF is a no-op.
+    PyObject* pattern_list = NULL;
+    PyObject* owned_list = NULL;
+    if (PyTuple_Check(pattern_py)) {
+        owned_list = PySequence_List(pattern_py);
+        if (owned_list == NULL) {
+            PyErr_SetString(PyExc_TypeError, "Failed to convert tuple to list");
+            return NULL;
+        }
+        pattern_list = owned_list;
+    } else if (PyList_Check(pattern_py)) {
+        pattern_list = pattern_py;
+    } else {
+        PyErr_SetString(PyExc_TypeError, "Pattern must be a list or tuple!");
+        return NULL;
+    }
+
+    // Cast to Permutation* to access pattern methods and get qbit_num
+    Permutation* perm_gate = dynamic_cast<Permutation*>(self->gate);
+    if (perm_gate == nullptr) {
+        Py_XDECREF(owned_list);
+        PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate");
+        return NULL;
+    }
+
+    int qbit_num = perm_gate->get_qbit_num();
+    std::vector<int> pattern;
+    Py_ssize_t pattern_size = PyList_Size(pattern_list);
+
+    // Check pattern size matches qbit_num
+    if (pattern_size != qbit_num) {
+        Py_XDECREF(owned_list);
+        std::string err = "Pattern size " + std::to_string(pattern_size) +
+                         " does not match qubit number " + std::to_string(qbit_num);
+        PyErr_SetString(PyExc_ValueError, err.c_str());
+        return NULL;
+    }
+
+    // Track which values we've seen to validate it's a permutation
+    std::vector<bool> seen(qbit_num, false);
+
+    for (Py_ssize_t i = 0; i < pattern_size; i++) {
+        PyObject* item = PyList_GetItem(pattern_list, i);
+        if (!PyLong_Check(item)) {
+            Py_XDECREF(owned_list);
+            PyErr_SetString(PyExc_TypeError, "Pattern must contain integers!");
+            return NULL;
+        }
+        // Keep the full C long: narrowing to int first could wrap a huge index
+        // into the valid range and let it slip past the range check below.
+        long qbit = PyLong_AsLong(item);
+        if (qbit == -1 && PyErr_Occurred()) {
+            Py_XDECREF(owned_list);
+            return NULL;  // OverflowError already set by PyLong_AsLong
+        }
+        if (qbit < 0 || qbit >= qbit_num) {
+            Py_XDECREF(owned_list);
+            std::string err = "Pattern qubit index " + std::to_string(qbit) +
+                             " out of range [0, " + std::to_string(qbit_num - 1) + "]";
+            PyErr_SetString(PyExc_ValueError, err.c_str());
+            return NULL;
+        }
+        if (seen[qbit]) {
+            Py_XDECREF(owned_list);
+            std::string err = "Pattern contains duplicate value " + std::to_string(qbit);
+            PyErr_SetString(PyExc_ValueError, err.c_str());
+            return NULL;
+        }
+        seen[qbit] = true;
+        pattern.push_back(static_cast<int>(qbit));
+    }
+
+    // Release the temporary list (no-op when the caller's list was borrowed)
+    Py_XDECREF(owned_list);
+
+    // The pattern is validated; hand it over to the gate.
+    try {
+        perm_gate->set_pattern(pattern);
+    }
+    catch (std::string err) {
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+    catch(...) {
+        PyErr_SetString(PyExc_Exception, "Invalid pointer to gate class");
+        return NULL;
+    }
+
+    return Py_BuildValue("i", 0);
+}
 
 /**
 @brief Call to set the state of quantum gate from a human-readable data serialized and pickle-able format
@@ -2860,6 +3102,12 @@ extern "C"
     }, \
     {"get_Name", (PyCFunction) Gate_Wrapper_get_Name, METH_NOARGS, \
      "Method to get the name label of the gate" \
+    }, \
+    {"get_Pattern", (PyCFunction) Gate_Wrapper_get_Pattern, METH_NOARGS, \
+     "Method to get the pattern of the permutation gate." \
+    }, \
+    {"set_Pattern", (PyCFunction) Gate_Wrapper_set_Pattern, METH_VARARGS, \
+     "Method to set the pattern of the permutation gate." \
     }
 
 static PyMethodDef Gate_Wrapper_methods[] = {
@@ -2875,6 +3123,7 @@ static PyMethodDef Gate_Wrapper_methods[] = {
 };
 
 
+
 /**
 @brief Structure containing metadata about the members of class  qgd_CH_Wrapper.
 */
@@ -2883,6 +3132,7 @@ static PyMemberDef  Gate_Wrapper_members[] = {
 };
 
 
+
 struct Gate_Wrapper_Type_tmp : PyTypeObject {
 
 
@@ -3044,6 +3294,8 @@ gate_wrapper_type_template(Tdg, Gate_Wrapper_new);
 
 gate_wrapper_type_template(R, Gate_Wrapper_new);
 
+gate_wrapper_type_template(Permutation, permutation_gate_Wrapper_new);
+
 
 
 
@@ -3120,7 +3372,8 @@ PyInit_gates_Wrapper(void)
         PyType_Ready(&CCX_Wrapper_Type_ins) < 0 ||
         PyType_Ready(&SWAP_Wrapper_Type_ins) < 0 ||
         PyType_Ready(&CSWAP_Wrapper_Type_ins) < 0 ||
-        PyType_Ready(&R_Wrapper_Type_ins) < 0 ) {
+        PyType_Ready(&R_Wrapper_Type_ins) < 0 ||
+        PyType_Ready(&Permutation_Wrapper_Type_ins) < 0 ) {
 
         Py_DECREF(m);
         return NULL;
@@ -3218,6 +3471,8 @@ PyInit_gates_Wrapper(void)
 
     Py_INCREF_template(CSWAP);
 
+    Py_INCREF_template(Permutation);
+
     return m;
 }
 
diff --git a/squander/gates/qgd_Circuit.py b/squander/gates/qgd_Circuit.py
index eb259b4d0..6626f1549 100644
--- a/squander/gates/qgd_Circuit.py
+++ b/squander/gates/qgd_Circuit.py
@@ -80,6 +80,14 @@ def __init__(self, qbit_num):
         # call the constructor of the wrapper class
         super().__init__(qbit_num)
 
+    def copy(self):
+        """Create a deep copy of the circuit (delegates to the C wrapper's ``copy``).
+
+        Returns:
+            A new qgd_Circuit instance with all gates copied.
+        """
+        return super().copy()
+
     def add_U1(self, target_qbit):
         """Add a U1 gate to the front of the gate structure.
 
@@ -382,6 +390,18 @@ def add_CP(self, target_qbit, control_qbit):
         # call the C wrapper function
         super(qgd_Circuit, self).add_CP(target_qbit, control_qbit)
 
+    def add_Permutation( self, pattern):
+        """Add a Permutation gate to the gate structure.
+
+        Args:
+            pattern: Permutation pattern (list of int); its length must equal the qubit number of the circuit.
+        """
+        # call the C wrapper function
+        super(qgd_Circuit, self).add_Permutation(pattern)
+
+#@brief Call to add a SWAP gate to the front of the gate structure.
+#@param self A pointer pointing to an instance of the class qgd_Circuit.
+#@param Input arguments: target_qbits (list of int) - list of target qubits (at least 2).
     def add_SWAP(self, target_qbits, target_qbit2=-1):
         """Add a SWAP gate to the front of the gate structure.
 
@@ -674,12 +694,12 @@ def get_Qbits(self):
 
         return super().get_Qbits()
 
-    def set_min_fusion(self, min_fusion):
-        """Set the minimum fusion parameter in the circuit.
+    def get_Involved_Qbits(self):
 
-        Args:
-            min_fusion: Minimum fusion value (int)
-        """
+        return super().get_Qbits()
+#@brief Call to set the min fusion in the circuit
+#@param Input arguments: min_fusion
+    def set_min_fusion( self, min_fusion):
 
         super().set_min_fusion(min_fusion)
 
diff --git a/squander/gates/qgd_Circuit_Wrapper.cpp b/squander/gates/qgd_Circuit_Wrapper.cpp
index 22e500b87..17f79d545 100644
--- a/squander/gates/qgd_Circuit_Wrapper.cpp
+++ b/squander/gates/qgd_Circuit_Wrapper.cpp
@@ -55,6 +55,7 @@ along with this program.  If not, see http://www.gnu.org/licenses/.
 #include "SXdg.h"
 #include "SYC.h"
 #include "Adaptive.h"
+#include "Permutation.h"
 #include "RXX.h"
 #include "RYY.h"
 #include "RZZ.h"
@@ -471,6 +472,49 @@ qgd_Circuit_Wrapper_add_CSWAP(qgd_Circuit_Wrapper *self, PyObject *args, PyObjec
 
 }
 
+/**
+@brief Wrapper function to add a Permutation gate to the gate structure.
+@param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper.
+@param args A tuple of the input arguments: pattern (list of ints, one entry per qubit of the circuit)
+@param kwds A tuple of keywords
+@return Returns with 0 on success, or with NULL and a Python exception set (TypeError or ValueError) on failure.
+*/
+static PyObject *
+qgd_Circuit_Wrapper_add_Permutation(qgd_Circuit_Wrapper *self, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {(char*)"pattern", NULL};
+    PyObject* pattern_py = NULL;
+    // with a Python exception pending NULL must be returned, otherwise the interpreter raises SystemError
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &pattern_py)) return NULL;
+    if (pattern_py == NULL) return Py_BuildValue("i", 0); // an omitted pattern stays a no-op
+    if (!PyList_Check(pattern_py)) {
+        PyErr_SetString(PyExc_TypeError, "pattern must be a list of ints");
+        return NULL;
+    }
+    std::vector<int> pattern;
+    Py_ssize_t list_size = PyList_Size(pattern_py);
+    for (Py_ssize_t i = 0; i < list_size; i++) {
+        long value = PyLong_AsLong(PyList_GetItem(pattern_py, i)); // PyList_GetItem gives a borrowed reference
+        if (value == -1 && PyErr_Occurred()) return NULL; // element is not an integer, the error is already set
+        pattern.push_back((int)value);
+    }
+    if ((int)pattern.size() != self->circuit->get_qbit_num()) {
+        std::string err = "Pattern size " + std::to_string(pattern.size()) +
+                         " does not match circuit qubit number " + std::to_string(self->circuit->get_qbit_num());
+        PyErr_SetString(PyExc_ValueError, err.c_str());
+        return NULL;
+    }
+    try {
+        self->circuit->add_permutation(pattern);
+    } catch (const std::string& e) {
+        PyErr_SetString(PyExc_ValueError, e.c_str());
+    } catch (const std::exception& e) {
+        PyErr_SetString(PyExc_ValueError, e.what());
+    } catch (...) {
+        PyErr_SetString(PyExc_ValueError, "Unknown error occurred in add_permutation");
+    }
+    return PyErr_Occurred() ? (PyObject*)NULL : Py_BuildValue("i", 0); // NULL if a handler above set an error
+}
 /**
 @brief Wrapper function to add a block of operations to the front of the gate structure.
 @param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper.
@@ -824,7 +868,22 @@ qgd_Circuit_Wrapper_get_Matrix( qgd_Circuit_Wrapper *self, PyObject *args, PyObj
     // get the C++ wrapper around the data
     Matrix_real parameters_mtx = numpy2matrix_real(parameters_arr);
 
-    Matrix mtx = self->circuit->get_matrix(parameters_mtx);
+    Matrix mtx;
+    try {
+        mtx = self->circuit->get_matrix( parameters_mtx );
+    }
+    catch (std::string err) {
+        Py_DECREF(parameters_arr);
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        std::cout << err << std::endl;
+        return NULL;
+    }
+    catch(...) {
+        Py_DECREF(parameters_arr);
+        std::string err( "Invalid pointer to circuit class or error in get_matrix");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
 
     // convert to numpy array
     mtx.set_owner(false);
@@ -2423,6 +2482,33 @@ get_gate( Gates_block* circuit, int &idx ) {
         Py_DECREF( circuit_input );
 
     }
+    else if (gate->get_type() == PERMUTATION_OPERATION) {
+        // Handle Permutation gate
+        PyObject* qgd_gate_Dict  = PyModule_GetDict( qgd_gate );
+        PyObject* py_gate_class = PyDict_GetItemString( qgd_gate_Dict, "Permutation");
+        
+        // Get the pattern from the Permutation gate
+        Permutation* perm_gate = static_cast<Permutation*>(gate);
+        std::vector<int> pattern = perm_gate->get_pattern();
+        
+        // Convert pattern to Python list
+        PyObject* pattern_list = PyList_New(pattern.size());
+        for (size_t i = 0; i < pattern.size(); i++) {
+            PyList_SetItem(pattern_list, i, Py_BuildValue("i", pattern[i]));
+        }
+        
+        PyObject* gate_input = Py_BuildValue("(OO)", qbit_num, pattern_list);
+        py_gate = PyObject_CallObject(py_gate_class, gate_input);
+        
+        // replace dummy data with real gate data
+        qgd_Gate* py_gate_C = reinterpret_cast<qgd_Gate*>( py_gate );
+        delete( py_gate_C->gate );
+        py_gate_C->gate = static_cast<Gate*>( gate->clone() );
+        
+        Py_DECREF( qgd_gate );
+        Py_DECREF( gate_input );
+        Py_DECREF( pattern_list );
+    }
     else {
 
             Py_DECREF( qgd_gate );    
@@ -2806,6 +2892,62 @@ qgd_Circuit_Wrapper_get_Flat_Circuit( qgd_Circuit_Wrapper *self ) {
 
 
 
+/**
+@brief Wrapper function to create a deep copy of the circuit.
+@param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper.
+@return Returns a new qgd_Circuit Python object that is a deep copy, or NULL with a Python exception set on failure.
+*/
+static PyObject *
+qgd_Circuit_Wrapper_copy( qgd_Circuit_Wrapper *self ) {
+    Gates_block* copied_circuit = NULL;
+    try {
+        copied_circuit = self->circuit->clone();
+    }
+    catch (const std::string& err) {
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        std::cout << err << std::endl;
+        return NULL;
+    }
+    catch(...) {
+        std::string err( "Invalid pointer to circuit class");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+
+    // import the Python module that defines the qgd_Circuit class
+    PyObject* qgd_circuit  = PyImport_ImportModule("squander.gates.qgd_Circuit");
+    if ( qgd_circuit == NULL ) {
+        PyErr_SetString(PyExc_Exception, "Module import error: squander.gates.qgd_Circuit" );
+        delete copied_circuit;
+        return NULL;
+    }
+
+    // PyModule_GetDict and PyDict_GetItemString return borrowed references: no DECREF is needed for them
+    PyObject* py_circuit_class = PyDict_GetItemString( PyModule_GetDict( qgd_circuit ), "qgd_Circuit");
+    // "(i)" converts the int directly; nesting a separately built int object under "O" would leak it ("O" adds its own reference)
+    PyObject* circuit_input = Py_BuildValue("(i)", copied_circuit->get_qbit_num() );
+    PyObject* py_circuit    = NULL;
+    if ( py_circuit_class != NULL && circuit_input != NULL ) {
+        py_circuit = PyObject_CallObject(py_circuit_class, circuit_input);
+    }
+    Py_XDECREF( circuit_input );
+    Py_DECREF( qgd_circuit );
+
+    if ( py_circuit == NULL ) {
+        // class lookup or construction failed: release the clone instead of dereferencing a NULL object
+        if ( !PyErr_Occurred() ) PyErr_SetString(PyExc_Exception, "Failed to create a qgd_Circuit instance" );
+        delete copied_circuit;
+        return NULL;
+    }
+
+    // replace the circuit created by the constructor with the cloned one
+    qgd_Circuit_Wrapper* py_circuit_C = reinterpret_cast<qgd_Circuit_Wrapper*>( py_circuit );
+    delete( py_circuit_C->circuit );
+    py_circuit_C->circuit = copied_circuit;
+    return py_circuit;
+}
+
+
 /**
 @brief Method to extract the stored quantum circuit in a human-readable data serialized and pickle-able format
 @param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper
@@ -3173,14 +3315,17 @@ static PyMethodDef qgd_Circuit_Wrapper_Methods[] = {
     {"add_CRY", (PyCFunction) qgd_Circuit_Wrapper_add_CRY, METH_VARARGS | METH_KEYWORDS,
      "Call to add a CRY gate to the front of the gate structure"
     },
+    {"add_Permutation", (PyCFunction) qgd_Circuit_Wrapper_add_Permutation, METH_VARARGS | METH_KEYWORDS,
+     "Call to add a Permutation gate to the front of the gate structure"
+    },
     {"add_CRX", (PyCFunction) qgd_Circuit_Wrapper_add_CRX, METH_VARARGS | METH_KEYWORDS,
-     "Call to add a CRY gate to the front of the gate structure"
+     "Call to add a CRX gate to the front of the gate structure"
     },
     {"add_CRZ", (PyCFunction) qgd_Circuit_Wrapper_add_CRZ, METH_VARARGS | METH_KEYWORDS,
-     "Call to add a CRY gate to the front of the gate structure"
+     "Call to add a CRZ gate to the front of the gate structure"
     },
     {"add_CP", (PyCFunction) qgd_Circuit_Wrapper_add_CP, METH_VARARGS | METH_KEYWORDS,
-     "Call to add a CRY gate to the front of the gate structure"
+     "Call to add a CP gate to the front of the gate structure"
     },
     {"add_CCX", (PyCFunction) qgd_Circuit_Wrapper_add_CCX, METH_VARARGS | METH_KEYWORDS,
      "Call to add a CCX gate to the front of the gate structure"
@@ -3283,6 +3428,9 @@ static PyMethodDef qgd_Circuit_Wrapper_Methods[] = {
     {"get_Children", (PyCFunction) qgd_Circuit_Wrapper_get_children, METH_VARARGS,
      "Method to get the list of child gate indices. Then the children gates can be obtained from the list of gates involved in the circuit."
     },
+    {"copy", (PyCFunction) qgd_Circuit_Wrapper_copy, METH_NOARGS,
+     "Method to create a deep copy of the circuit."
+    },
     {"__getstate__", (PyCFunction) qgd_Circuit_Wrapper_getstate, METH_NOARGS,
      "Method to extract the stored quantum circuit in a human-readable data serialized and pickle-able format."
     },
diff --git a/squander/partitioning/ilp.py b/squander/partitioning/ilp.py
index e3ad3e3c1..9731247e6 100644
--- a/squander/partitioning/ilp.py
+++ b/squander/partitioning/ilp.py
@@ -539,6 +539,48 @@ def sol_to_badsccs(g, allparts, L):
     _, scc = scc_tarjan_iterative(G_part)
     return {frozenset(v) for v in scc if len(v) > 1}
 
+def parts_to_overlap_scores(allparts, g, gate_to_qubit):
+    """Per-part tie-breaker weights from qubit overlap with DAG-downstream candidate parts.
+
+    For each part i, score s[i] is the mean over candidate parts j reachable
+    from i in the gate DAG of |support(i) ∩ support(j)|. Returned weights are
+    `(s_max - s[i]) * eps` (lower is better — ILP minimizes), with eps small
+    enough that count-minimization in `ilp_global_optimal` is strictly
+    preserved when these weights are passed via `weights=`.
+
+    Args:
+        allparts (list[frozenset[int]]): Candidate parts (gate sets).
+        g (dict[int, set[int]]): Contracted gate DAG (u -> successors v) from `get_all_partitions`.
+        gate_to_qubit (dict[int, set[int]]): Gate -> qubits acted on.
+
+    Returns:
+        list[float]: weights[i] indexed like allparts, all in [0, 1 / (len(allparts) * len(g))).
+    """
+    N = len(allparts)
+    if N == 0:
+        return []
+    _, reach = nuutila_reach_scc(g)
+    # Index gate -> parts from the parts themselves; lookups use .get() so gates missing from the index are harmless.
+    gate_to_parts = {}
+    for i, part in enumerate(allparts):
+        for gate in part:
+            gate_to_parts.setdefault(gate, []).append(i)
+    # set().union(...) (instead of the unbound set.union) also accepts empty parts and frozenset qubit sets.
+    supports = [set().union(*(gate_to_qubit[v] for v in part)) for part in allparts]
+    scores = [0.0] * N
+    for i, part in enumerate(allparts):
+        dgates = set().union(*(reach[u] for u in part)) - part
+        succ_idxs = set().union(*(gate_to_parts.get(v, ()) for v in dgates))
+        succ_idxs.discard(i)
+        if not succ_idxs:
+            continue
+        scores[i] = sum(len(supports[i] & supports[j]) for j in succ_idxs) / len(succ_idxs)
+    s_max = max(scores)
+    if s_max == 0.0:
+        return [0.0] * N
+    eps = 0.9 / (N * max(len(g), 1) * (s_max + 1.0))
+    return [(s_max - s) * eps for s in scores]
+
 def ilp_global_optimal(allparts, g, weighted_info=None, gurobi_direct=False, use_order=False, weights=None):
     """
     Select an optimal set of non-overlapping parts via ILP/MIP with cycle cuts.
diff --git a/squander/src-cpp/gates/Gate.cpp b/squander/src-cpp/gates/Gate.cpp
index ba24471d7..8c03ff1ef 100644
--- a/squander/src-cpp/gates/Gate.cpp
+++ b/squander/src-cpp/gates/Gate.cpp
@@ -158,7 +158,9 @@ Gate::Gate(int qbit_num_in) {
     // number of qubits spanning the matrix of the operation
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
     // A string describing the type of the operation
     type = GENERAL_OPERATION;
     // The index of the qubit on which the operation acts (target_qbit >= 0)
@@ -192,7 +194,10 @@ Gate::Gate(int qbit_num_in, const std::vector<int>& target_qbits_in, const std::
     // number of qubits spanning the matrix of the operation
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
+   
     // A string describing the type of the operation
     type = GENERAL_OPERATION;
     // The number of parameters
diff --git a/squander/src-cpp/gates/Gates_block.cpp b/squander/src-cpp/gates/Gates_block.cpp
index c974dad77..970f37db7 100644
--- a/squander/src-cpp/gates/Gates_block.cpp
+++ b/squander/src-cpp/gates/Gates_block.cpp
@@ -59,6 +59,8 @@ limitations under the License.
 #include "RZZ.h"
 #include "Adaptive.h"
 #include "Gates_block.h"
+#include "Permutation.h"
+
 #include "qgd_math.h"
 
 #ifdef _WIN32
@@ -1414,6 +1416,43 @@ void Gates_block::add_u3_to_front(int target_qbit) {
 
 }
 
+/**
+@brief Append a Permutation gate to the list of gates
+@param pattern The pattern of the permutation (forwarded to the Permutation constructor)
+*/
+void Gates_block::add_permutation(const std::vector<int>& pattern) {
+    try {
+        // the gate is created on the qubit register of this block
+        Permutation* permutation_gate = new Permutation( qbit_num, pattern );
+        add_gate( static_cast<Gate*>(permutation_gate) );
+    } catch (const std::exception&) {
+        // standard exceptions are passed on untouched
+        throw;
+    } catch (const std::string& message) {
+        // errors thrown as plain strings are converted into std::runtime_error
+        throw std::runtime_error(message);
+    }
+}
+
+
+/**
+@brief Add a Permutation gate to the front of the list of gates
+@param pattern The pattern of the permutation (forwarded to the Permutation constructor)
+*/
+void Gates_block::add_permutation_to_front(const std::vector<int>& pattern) {
+    try {
+        // the gate is created on the qubit register of this block
+        Permutation* permutation_gate = new Permutation( qbit_num, pattern );
+        add_gate_to_front( static_cast<Gate*>(permutation_gate) );
+    } catch (const std::exception&) {
+        // standard exceptions are passed on untouched
+        throw;
+    } catch (const std::string& message) {
+        // errors thrown as plain strings are converted into std::runtime_error
+        throw std::runtime_error(message);
+    }
+}
+
 /**
 @brief Append a RX gate to the list of gates
 @param target_qbit The identification number of the targt qubit. (0 <= target_qbit <= qbit_num-1)
@@ -3045,6 +3084,13 @@ Gates_block::create_remapped_circuit( const std::map<int, int>& qbit_map, const
 
             break;
         }
+        case PERMUTATION_OPERATION:
+        {
+            Gate* cloned_op = op->clone();
+            cloned_op->set_qbit_num( qbit_num_ );
+            ret->add_gate( cloned_op );
+            break;
+        }
         default:
             std::string err("Gates_block::create_remapped_circuit: unimplemented gate"); 
             throw err;
@@ -3277,7 +3323,7 @@ int Gates_block::extract_gates( Gates_block* op_block ) {
         case CH_OPERATION: case SYC_OPERATION:
         case U1_OPERATION: case U2_OPERATION: 
         case U3_OPERATION: case CP_OPERATION:
-        case RY_OPERATION: case CRY_OPERATION: 
+        case RY_OPERATION: case CRY_OPERATION: case PERMUTATION_OPERATION:
         case CRX_OPERATION: case CRZ_OPERATION:
         case RX_OPERATION: case CR_OPERATION:
         case RZ_OPERATION: case X_OPERATION:
diff --git a/squander/src-cpp/gates/Permutation.cpp b/squander/src-cpp/gates/Permutation.cpp
new file mode 100644
index 000000000..4d9b98d49
--- /dev/null
+++ b/squander/src-cpp/gates/Permutation.cpp
@@ -0,0 +1,267 @@
+/*
+Created on Fri Jun 26 14:13:26 2020
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author: Peter Rakyta, Ph.D.
+*/
+/*! \file Permutation.cpp
+    \brief Class for the representation of Permutation gate.
+*/
+#include "Permutation.h"
+#include "apply_dedicated_gate_kernel_to_input.h"
+#include "common.h"
+
+Permutation::Permutation(){
+    name = "Permutation";
+    type = PERMUTATION_OPERATION;
+    target_qbits.clear();
+    control_qbits.clear();
+    parameter_num = 0;
+    cycles_cache_valid = false;
+    cycles_cache_matrix_size = 0;
+}
+
+Permutation::Permutation(int qbit_num_in, const std::vector<int>& pattern_in) : Gate(qbit_num_in) {
+    if (pattern_in.size() != qbit_num_in) {
+        std::stringstream sstream;
+        sstream << "Permutation: Pattern size " << pattern_in.size() << " is not equal to the number of qubits " << qbit_num_in << std::endl;
+        print(sstream, 0);
+        throw sstream.str();
+    }
+    name = "Permutation";
+    type = PERMUTATION_OPERATION;
+    pattern = pattern_in;
+    control_qbits.clear();
+    parameter_num = 0;
+    target_qbits.resize(qbit_num_in);
+    for (int idx=0; idx<qbit_num_in; idx++){
+        target_qbits[idx] = idx;
+    }
+    cycles_cache_valid = false;
+    cycles_cache_matrix_size = 0;
+}
+Permutation::~Permutation(){
+    target_qbits.clear();
+    control_qbits.clear();
+}
+
+Matrix Permutation::get_matrix(){
+    return get_matrix(false);
+}
+
+Matrix Permutation::get_matrix(int parallel){
+    Matrix permutation_matrix = create_identity(matrix_size);
+    apply_to(permutation_matrix, parallel);
+    return permutation_matrix;
+}
+
+void Permutation::apply_to(Matrix& input, int parallel){
+    if (input.rows != matrix_size) {
+        std::string err("Permutation::apply_to: Wrong input size in Permutation gate apply");
+        throw err;
+    }
+
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    if (parallel == 2) {
+        apply_Permutation_kernel_to_input_tbb(input, pattern, matrix_size, cycles_cache);
+    }
+    else if (parallel == 1) {
+        apply_Permutation_kernel_to_input_omp(input, pattern, matrix_size, cycles_cache);
+    }
+    else {
+        apply_Permutation_kernel_to_input(input, pattern, matrix_size, cycles_cache);
+    }
+}
+void Permutation::apply_to(Matrix& input){
+    if (input.rows != matrix_size) {
+        std::string err("Permutation::apply_to: Wrong input size in Permutation gate apply");
+        throw err;
+    }
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    apply_Permutation_kernel_to_input(input, pattern, matrix_size, cycles_cache);
+}
+
+// Apply the permutation from the right (input @ P) by permuting the columns of
+// the input. The column swaps are applied in reverse cycle order so the result
+// matches input * get_matrix(). This acts row-independently, so the input may
+// have any number of rows as long as its column count equals matrix_size.
+template<typename MatrixType>
+static void permute_columns_from_cycles(MatrixType& input, const std::vector<std::vector<int>>& cycles){
+    auto* data = input.get_data();
+    for (const auto& cycle : cycles) {
+        for (int idx = (int)cycle.size() - 2; idx >= 0; --idx) {
+            int c0 = cycle[idx];
+            int c1 = cycle[idx + 1];
+            for (int row = 0; row < input.rows; ++row) {
+                auto tmp = data[row * input.stride + c0];
+                data[row * input.stride + c0] = data[row * input.stride + c1];
+                data[row * input.stride + c1] = tmp;
+            }
+        }
+    }
+}
+
+void Permutation::apply_from_right(Matrix& input){
+    if (input.cols != matrix_size) {
+        std::string err("Permutation::apply_from_right: Wrong input size in Permutation gate apply");
+        throw err;
+    }
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    permute_columns_from_cycles(input, cycles_cache);
+}
+
+void Permutation::apply_to(Matrix_float& input, int parallel){
+    if (input.rows != matrix_size) {
+        std::string err("Permutation::apply_to: Wrong input size in Permutation gate apply");
+        throw err;
+    }
+
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    if (parallel == 2) {
+        apply_Permutation_kernel_to_input_tbb(input, pattern, matrix_size, cycles_cache);
+    }
+    else if (parallel == 1) {
+        apply_Permutation_kernel_to_input_omp(input, pattern, matrix_size, cycles_cache);
+    }
+    else {
+        apply_Permutation_kernel_to_input(input, pattern, matrix_size, cycles_cache);
+    }
+}
+
+void Permutation::apply_from_right(Matrix_float& input){
+    if (input.cols != matrix_size) {
+        std::string err("Permutation::apply_from_right: Wrong input size in Permutation gate apply");
+        throw err;
+    }
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    permute_columns_from_cycles(input, cycles_cache);
+}
+
+// Apply the permutation to every matrix of the list. With parallel == 0 the whole list forms one
+// indivisible TBB range (processed sequentially), otherwise the range may be split down to single matrices.
+void Permutation::apply_to_list(std::vector<Matrix>& inputs, int parallel){
+    // an empty list would give a zero grain size below, which tbb::blocked_range does not accept
+    if ( inputs.empty() ) {
+        return;
+    }
+    // build the shared cycle cache up front: the lazy rebuild inside apply_to is not safe once
+    // several TBB tasks may enter it at the same time
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    int input_num = (int)inputs.size();
+    int work_batch = ( parallel == 0 ) ? input_num : 1;
+
+    tbb::parallel_for( tbb::blocked_range<int>(0, input_num, work_batch), [&](tbb::blocked_range<int> r) {
+        for (int idx=r.begin(); idx<r.end(); ++idx) {
+            apply_to( inputs[idx], parallel );
+        }
+    });
+}
+
+
+std::vector<int> Permutation::get_target_qbits(){
+    return target_qbits;
+}
+
+std::vector<int> Permutation::get_control_qbits(){
+    return control_qbits;
+}
+
+std::vector<int> Permutation::get_pattern(){
+    return pattern;
+}
+
+void Permutation::set_pattern(const std::vector<int>& pattern_in){
+    pattern = pattern_in;
+    invalidate_cache();
+}
+
+std::vector<int> Permutation::get_involved_qubits(bool only_target){
+    std::vector<int> involved_qubits;
+    for (int i = 0; i < qbit_num; i++) {
+        involved_qubits.push_back(i);
+    }
+    return involved_qubits;
+}
+
+Permutation* Permutation::clone(){
+    Permutation* ret = new Permutation(qbit_num, pattern);
+    ret->set_parameter_start_idx(get_parameter_start_idx());
+    ret->set_parents(parents);
+    ret->set_children(children);
+    return ret;
+}
+
+void Permutation::reorder_qubits(std::vector<int> qbit_list){
+    Gate::reorder_qubits(qbit_list);
+    std::vector<int> new_pattern(qbit_num);
+    for (int idx=0; idx<qbit_num; idx++){
+        new_pattern[idx] = std::find(qbit_list.begin(), qbit_list.end(), pattern[idx]) - qbit_list.begin();
+    }
+    pattern = new_pattern;
+    invalidate_cache();
+}
+
+void Permutation::invalidate_cache(){
+    cycles_cache_valid = false;
+    cycles_cache.clear();
+    cycles_cache_matrix_size = 0;
+}
+
+void Permutation::build_cycles_cache(){ // recompute cycles_cache for the current pattern and matrix_size
+    cycles_cache.clear();
+    cycles_cache_matrix_size = matrix_size;
+    // NOTE: the local below shadows the qbit_num member; only pattern.size() bits are mapped
+    int qbit_num = pattern.size();
+    // next_index[r] is built so that its bit idx equals bit pattern[idx] of r (pattern entries are not range-checked here)
+    // Precompute next index for all rows once to avoid repeated bit work in cycle walks
+    std::vector<int> next_index(matrix_size);
+    for (int row_idx = 0; row_idx < matrix_size; ++row_idx) {
+        int new_row_idx = 0;
+        for (int idx = 0; idx < qbit_num; idx++) {
+            int bit = (row_idx >> pattern[idx]) & 1;
+            new_row_idx |= (bit << idx);
+        }
+        next_index[row_idx] = new_row_idx;
+    }
+    // follow next_index from every unvisited start index; each walk yields one cycle of row indices
+    std::vector<uint8_t> visited(matrix_size, 0);
+    for (int start = 0; start < matrix_size; ++start) {
+        if (visited[start]) continue;
+        std::vector<int> cycle;
+        int current = start;
+        while (!visited[current]) {
+            visited[current] = 1;
+            cycle.push_back(current);
+            current = next_index[current];
+        }
+        if (cycle.size() > 1) { // single-element cycles are fixed points and are not stored
+            cycles_cache.push_back(std::move(cycle));
+        }
+    }
+    // the cache now matches cycles_cache_matrix_size recorded above
+    cycles_cache_valid = true;
+}
\ No newline at end of file
diff --git a/squander/src-cpp/gates/include/Gate.h b/squander/src-cpp/gates/include/Gate.h
index 6b62e9d62..e5f2626f6 100644
--- a/squander/src-cpp/gates/include/Gate.h
+++ b/squander/src-cpp/gates/include/Gate.h
@@ -76,7 +76,8 @@ typedef enum gate_type {GENERAL_OPERATION=1,
                         RXX_OPERATION=44,
                         RYY_OPERATION=45,
                         RZZ_OPERATION=46,
-                        SXDG_OPERATION=47} gate_type;
+                        SXDG_OPERATION=47,
+                        PERMUTATION_OPERATION=48} gate_type;
 
 
 
diff --git a/squander/src-cpp/gates/include/Gates_block.h b/squander/src-cpp/gates/include/Gates_block.h
index 3b616a839..75ee2da8e 100644
--- a/squander/src-cpp/gates/include/Gates_block.h
+++ b/squander/src-cpp/gates/include/Gates_block.h
@@ -308,8 +308,17 @@ void add_ry(int target_qbit);
 */
 void add_ry_to_front(int target_qbit);
 
+/**
+@brief Append a Permutation gate to the list of gates
+@param pattern The pattern of the permutation
+*/
+void add_permutation(const std::vector<int>& pattern);
 
-
+/**
+@brief Add a Permutation gate to the front of the list of gates
+@param pattern The pattern of the permutation
+*/
+void add_permutation_to_front(const std::vector<int>& pattern);
 
 /**
 @brief Append a CRY gate to the list of gates
diff --git a/squander/src-cpp/gates/include/Permutation.h b/squander/src-cpp/gates/include/Permutation.h
new file mode 100644
index 000000000..529d05c81
--- /dev/null
+++ b/squander/src-cpp/gates/include/Permutation.h
@@ -0,0 +1,65 @@
+/*
+Created on Fri Jun 26 14:13:26 2020
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author: Peter Rakyta, Ph.D.
+*/
+/*! \file Permutation.h
+    \brief Class for the representation of Permutation gate.
+*/
+
+#ifndef PERMUTATION_H
+#define PERMUTATION_H
+
+#include "Gate.h"
+#include "common.h"
+#include "matrix.h"
+#include "logging.h"
+#include "tbb/tbb.h"
+
+class Permutation : public Gate {
+    // Qubit permutation gate without free parameters; it acts on matrices as a row (apply_to) or column (apply_from_right) permutation.
+protected:
+    std::vector<int> pattern; ///< permutation pattern: bit idx of a permuted basis index is taken from bit pattern[idx]
+    // Cached cycles for current pattern and matrix size
+    std::vector<std::vector<int>> cycles_cache;
+    int cycles_cache_matrix_size = 0; ///< matrix_size the cache was built for (0 after invalidation)
+    bool cycles_cache_valid = false; ///< true once build_cycles_cache() has filled cycles_cache
+
+    void invalidate_cache(); ///< drops the cached cycles and marks the cache invalid
+    void build_cycles_cache(); ///< recomputes cycles_cache from pattern and matrix_size
+
+public:
+    Permutation(); ///< sets name, type and parameter_num only; no pattern is stored
+    Permutation(int qbit_num_in, const std::vector<int>& pattern_in); ///< throws std::string when pattern_in.size() != qbit_num_in
+    ~Permutation();
+    Matrix get_matrix(); ///< forwards to get_matrix(int) with a zero argument
+    Matrix get_matrix(int parallel); ///< applies the gate to an identity matrix of size matrix_size
+    void apply_to(Matrix& input, int parallel); ///< row permutation; parallel == 2 selects the _tbb kernel, 1 the _omp kernel, otherwise the plain kernel
+    void apply_to(Matrix& input); ///< row permutation with the plain kernel
+    void apply_from_right(Matrix& input); ///< column permutation, i.e. input * get_matrix()
+    void apply_to(Matrix_float& input, int parallel); ///< Matrix_float counterpart of apply_to(Matrix&, int)
+    void apply_from_right(Matrix_float& input); ///< Matrix_float counterpart of apply_from_right(Matrix&)
+    void apply_to_list(std::vector<Matrix>& inputs, int parallel); ///< calls apply_to on each matrix through tbb::parallel_for
+    std::vector<int> get_pattern(); ///< returns a copy of the pattern
+    void set_pattern(const std::vector<int>& pattern_in); ///< replaces the pattern and invalidates the cycle cache (no size check)
+    std::vector<int> get_target_qbits();
+    std::vector<int> get_control_qbits();
+    std::vector<int> get_involved_qubits(bool only_target = false); ///< returns 0..qbit_num-1; only_target is ignored
+    Permutation* clone(); ///< new gate with the same pattern, parameter start index, parents and children
+    void reorder_qubits(std::vector<int> qbit_list); ///< remaps the pattern entries to their positions in qbit_list and invalidates the cache
+};
+
+#endif //PERMUTATION_H
\ No newline at end of file
diff --git a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
index 73bac921b..d3050f5ab 100644
--- a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
+++ b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
@@ -25,6 +25,8 @@ limitations under the License.
 //#include <immintrin.h>
 #include "tbb/tbb.h"
 #include <omp.h>
+#include <unordered_map>
+#include <unordered_set>
 #include <type_traits>
 #include <utility>
 
@@ -529,6 +531,152 @@ void apply_SWAP_kernel_to_input_impl(MatrixT& input, const std::vector<int>& tar
     }
 }
 
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size){
+    // Permutes the rows of `input` in place: row r receives the former row permuted_index(r),
+    // where bit idx of permuted_index(r) is bit pattern[idx] of r. Nothing is validated here:
+    // pattern is assumed to be a permutation of 0..pattern.size()-1 and every permuted
+    // index is assumed to be smaller than matrix_size -- TODO confirm that callers ensure this.
+    const int qbit_num = static_cast<int>(pattern.size());
+
+    auto permuted_index = [&](int row_idx) -> int {
+        int new_row_idx = 0;
+        for (int idx = 0; idx < qbit_num; idx++) {
+            new_row_idx |= ((row_idx >> pattern[idx]) & 1) << idx;
+        }
+        return new_row_idx;
+    };
+
+    // Loop invariants; offsets are formed in size_t so that row * stride cannot overflow int.
+    auto* data = input.get_data();
+    const size_t stride = static_cast<size_t>(input.stride);
+    const size_t cols = static_cast<size_t>(input.cols);
+    auto row_ptr = [&](int row_idx) { return data + static_cast<size_t>(row_idx) * stride; };
+
+    std::vector<uint8_t> visited(matrix_size, 0);
+
+    for (int start = 0; start < matrix_size; ++start) {
+        if (visited[start]) continue;
+        visited[start] = 1;
+
+        // Walk the cycle through `start`, swapping each newly reached row with its predecessor:
+        // the swaps of consecutive cycle entries in cycle order, without a per-cycle vector.
+        int previous = start;
+        for (int current = permuted_index(start); !visited[current]; current = permuted_index(current)) {
+            visited[current] = 1;
+            std::swap_ranges(row_ptr(previous), row_ptr(previous) + cols, row_ptr(current));
+            previous = current;
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Permutation kernels that apply precomputed cycles.
+//
+// The permutation only swaps whole rows, so the logic does not depend on the
+// element type. It is implemented once in the *_impl templates below (the same
+// convention the SWAP kernels in this file use) and the Matrix and
+// Matrix_float (float32 / complex64) entry points only forward to them.
+// ---------------------------------------------------------------------------
+
+// Rotates the rows of `input` along a single cycle by swapping consecutive
+// cycle entries: afterwards row cycle[i] holds the former row cycle[i + 1] and
+// the last row of the cycle holds the former row cycle[0]. A cycle with fewer
+// than two entries leaves the input untouched.
+template<typename MatrixT>
+void apply_Permutation_cycle_impl(MatrixT& input, const std::vector<int>& cycle) {
+    // Hoisted loop invariants. Row offsets are formed in size_t so that
+    // row * stride cannot overflow int arithmetic on large inputs.
+    auto* data = input.get_data();
+    const size_t stride = static_cast<size_t>(input.stride);
+    const size_t cols = static_cast<size_t>(input.cols);
+
+    for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) {
+        auto* row = data + static_cast<size_t>(cycle[idx]) * stride;
+        auto* next_row = data + static_cast<size_t>(cycle[idx + 1]) * stride;
+        std::swap_ranges(row, row + cols, next_row);
+    }
+}
+
+// Sequential version: applies the cycles one after the other.
+template<typename MatrixT>
+void apply_Permutation_kernel_to_input_impl(MatrixT& input, const std::vector<std::vector<int>>& cycles) {
+    for (const auto& cycle : cycles) {
+        apply_Permutation_cycle_impl(input, cycle);
+    }
+}
+
+// TBB version: the cycle list is split into ranges (grain size 64) that are
+// processed in parallel. The cycles are expected to be disjoint, so that no
+// two tasks touch the same row.
+template<typename MatrixT>
+void apply_Permutation_kernel_to_input_tbb_impl(MatrixT& input, const std::vector<std::vector<int>>& cycles) {
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, cycles.size(), 64),
+        [&](const tbb::blocked_range<size_t>& range) {
+            for (size_t cdx = range.begin(); cdx != range.end(); ++cdx) {
+                apply_Permutation_cycle_impl(input, cycles[cdx]);
+            }
+        }
+    );
+}
+
+// OpenMP version: the cycles are divided statically between the threads. The
+// cycles are expected to be disjoint, so that no two threads touch the same row.
+template<typename MatrixT>
+void apply_Permutation_kernel_to_input_omp_impl(MatrixT& input, const std::vector<std::vector<int>>& cycles) {
+    #pragma omp parallel for schedule(static)
+    for (int cdx = 0; cdx < (int)cycles.size(); ++cdx) {
+        apply_Permutation_cycle_impl(input, cycles[cdx]);
+    }
+}
+
+// Overload that applies permutation using precomputed cycles
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern; // currently unused, kept for interface symmetry / potential validation
+    (void)matrix_size; // rows already validated by caller
+
+    apply_Permutation_kernel_to_input_impl(input, cycles);
+}
+
+// TBB-parallel Matrix entry point (pattern and matrix_size are unused).
+void apply_Permutation_kernel_to_input_tbb(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern;
+    (void)matrix_size;
+
+    apply_Permutation_kernel_to_input_tbb_impl(input, cycles);
+}
+
+// OpenMP-parallel Matrix entry point (pattern and matrix_size are unused).
+void apply_Permutation_kernel_to_input_omp(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern;
+    (void)matrix_size;
+
+    apply_Permutation_kernel_to_input_omp_impl(input, cycles);
+}
+
+// float32 (complex64) overloads of the precomputed-cycle Permutation kernels.
+void apply_Permutation_kernel_to_input(Matrix_float& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern;
+    (void)matrix_size;
+
+    apply_Permutation_kernel_to_input_impl(input, cycles);
+}
+
+// TBB-parallel Matrix_float entry point (pattern and matrix_size are unused).
+void apply_Permutation_kernel_to_input_tbb(Matrix_float& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern;
+    (void)matrix_size;
+
+    apply_Permutation_kernel_to_input_tbb_impl(input, cycles);
+}
+
+// OpenMP-parallel Matrix_float entry point (pattern and matrix_size are unused).
+void apply_Permutation_kernel_to_input_omp(Matrix_float& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern;
+    (void)matrix_size;
+
+    apply_Permutation_kernel_to_input_omp_impl(input, cycles);
+}
+
 
 template<typename MatrixT>
 void apply_SWAP_kernel_from_right_impl(MatrixT& input, const std::vector<int>& target_qbits, const std::vector<int>& control_qbits, const int& matrix_size) {
diff --git a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
index de6c501fa..03c7d4c30 100644
--- a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
+++ b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
@@ -83,6 +83,34 @@ void apply_SWAP_kernel_from_right(Matrix& input, const std::vector<int>& target_
 void apply_SYC_kernel_to_input(Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size);
 void apply_SYC_kernel_from_right(Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size);
 
+/**
+ * @brief Applies the Permutation gate kernel to the input matrix by permuting its rows in place.
+ *
+ * @param input The input matrix on which the transformation is applied.
+ * @param pattern Qubit permutation: bit idx of the source row index is bit pattern[idx] of the destination row index.
+ * @param matrix_size The number of rows of the input that are permuted.
+ */
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size);
+
+/**
+ * @brief Applies the Permutation gate kernel using precomputed cycles (whole rows are swapped in place).
+ *
+ * @param input The input matrix on which the transformation is applied.
+ * @param pattern The pattern of the permutation (currently ignored; kept for validation or future extensions).
+ * @param matrix_size The size of the input (currently ignored; the rows are expected to be validated by the caller).
+ * @param cycles The disjoint cycles of row indices; row cycle[i] receives the former row cycle[i + 1].
+ */
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+
+// Parallelized (TBB / OpenMP) versions for permutation with precomputed cycles; the work is split per cycle
+void apply_Permutation_kernel_to_input_tbb(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+void apply_Permutation_kernel_to_input_omp(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+
+// float32 (complex64) overloads of the precomputed-cycle Permutation kernels (same parameters as above)
+void apply_Permutation_kernel_to_input(Matrix_float& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+void apply_Permutation_kernel_to_input_tbb(Matrix_float& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+void apply_Permutation_kernel_to_input_omp(Matrix_float& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+
 // TBB Parallelized versions
 void apply_X_kernel_to_input_tbb(Matrix& input, const std::vector<int>& target_qbits, const std::vector<int>& control_qbits, const int& matrix_size);
 void apply_X_kernel_from_right_tbb(Matrix& input, const std::vector<int>& target_qbits, const std::vector<int>& control_qbits, const int& matrix_size);
diff --git a/squander/src-cpp/sabre_router/CMakeLists.txt b/squander/src-cpp/sabre_router/CMakeLists.txt
new file mode 100644
index 000000000..f4cfa9604
--- /dev/null
+++ b/squander/src-cpp/sabre_router/CMakeLists.txt
@@ -0,0 +1,118 @@
+# ===================================================================
+# SQUANDER SABRE Router Module - C++ Routing Engine + pybind11 Bindings
+# ===================================================================
+
+message(STATUS "")
+message(STATUS "=== Configuring SABRE Router Module ===")
+
+# ===================================================================
+# Find pybind11
+# ===================================================================
+
+find_package(pybind11 CONFIG QUIET)  # first attempt: an installed pybind11 CMake package
+
+if(NOT pybind11_FOUND)
+    message(STATUS "pybind11 not found via find_package, trying Python import...")
+    execute_process(  # fallback: ask the pip-installed pybind11 where its CMake files live
+        COMMAND ${PYTHON_EXECUTABLE} -c "import pybind11; print(pybind11.get_cmake_dir())"
+        OUTPUT_VARIABLE pybind11_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET  # a failing import prints nothing to stdout, leaving pybind11_DIR empty
+    )
+    if(pybind11_DIR)
+        message(STATUS "Found pybind11 via Python at: ${pybind11_DIR}")
+        find_package(pybind11 CONFIG PATHS ${pybind11_DIR})
+    endif()
+endif()
+
+if(NOT pybind11_FOUND)
+    message(WARNING "")
+    message(WARNING "pybind11 not found - SABRE router module will be skipped")
+    message(WARNING "Install with: pip install pybind11")
+    message(WARNING "")
+    return()  # stops processing this CMakeLists.txt, so none of the targets below are defined
+endif()
+
+message(STATUS "pybind11 version: ${pybind11_VERSION}")
+
+# ===================================================================
+# Source Files
+# ===================================================================
+
+set(SABRE_SOURCES
+    sabre_router.cpp
+)
+
+set(SABRE_HEADERS
+    include/sabre_router.hpp
+)
+
+# ===================================================================
+# Static C++ library
+# ===================================================================
+
+add_library(sabre_router_core STATIC
+    ${SABRE_SOURCES}
+    ${SABRE_HEADERS}
+)
+
+target_include_directories(sabre_router_core
+    PUBLIC
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+)
+
+# C++17 for this module only (does not affect global C++11); PUBLIC also applies it to targets linking this library
+target_compile_features(sabre_router_core PUBLIC cxx_std_17)
+
+target_compile_options(sabre_router_core PRIVATE  # PRIVATE: these flags are not propagated to dependents
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>>:
+        -Wall -Wextra -fPIC
+        $<$<CONFIG:Release>:-O3 -march=native>
+        $<$<CONFIG:Debug>:-g -O0>
+    >
+    $<$<CXX_COMPILER_ID:MSVC>:
+        $<$<CONFIG:Release>:/O2>
+        $<$<CONFIG:Debug>:/Od /Zi>
+    >
+)
+
+set_target_properties(sabre_router_core PROPERTIES
+    POSITION_INDEPENDENT_CODE ON  # the static library is linked into the _sabre_router Python module
+)
+
+# ===================================================================
+# pybind11 module
+# ===================================================================
+
+pybind11_add_module(_sabre_router MODULE  # Python extension module built from the binding source
+    ../../synthesis/bindings.cpp
+)
+
+target_link_libraries(_sabre_router PRIVATE
+    sabre_router_core
+)
+
+set_target_properties(_sabre_router PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/squander/synthesis  # built module lands in the source tree
+    OUTPUT_NAME "_sabre_router"
+)
+
+# Set BUILD_RPATH to prioritize conda libraries (CONDA_PREFIX is an environment variable, hence $ENV{})
+if(DEFINED ENV{CONDA_PREFIX})
+    set_target_properties(_sabre_router PROPERTIES
+        BUILD_RPATH "$ENV{CONDA_PREFIX}/lib"
+        BUILD_RPATH_USE_ORIGIN TRUE
+    )
+endif()
+
+# ===================================================================
+# Installation
+# ===================================================================
+
+install(TARGETS _sabre_router  # only the Python module is installed; the static library is not
+        LIBRARY DESTINATION squander/synthesis
+        RUNTIME DESTINATION squander/synthesis
+        COMPONENT python)
+
+message(STATUS "=== SABRE Router Module Configured ===")
+message(STATUS "")
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
new file mode 100644
index 000000000..0e037a9f9
--- /dev/null
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -0,0 +1,450 @@
+#pragma once
+/*
+Copyright 2025 SQUANDER Contributors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+C++ backend for the SABRE-style partition-aware routing engine.
+*/
+
+#include <cstdint>
+#include <limits>
+#include <optional>
+#include <queue>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace squander::routing {
+
+// ---------------------------------------------------------------------------
+// Data structures (flattened from Python objects)
+// ---------------------------------------------------------------------------
+
+struct Edge {
+    int u, v; // the two endpoint indices of the edge (no default values)
+};
+
+struct CandidateData {
+    int partition_idx = 0; // scalar indices default to 0, the value CandidateData{} already produced,
+    int topology_idx = 0; // so that a plain `CandidateData c;` no longer holds indeterminate values
+    int permutation_idx = 0;
+    int candidate_idx = -1;
+    int cnot_count = 0;
+    bool has_multi_qubit_body = true;
+
+    // Permutations within the reduced (q*) space
+    // P_i[v] = position in Q* space for input routing
+    // P_o[v] = position in Q* space for output placement
+    std::vector<int> P_i;
+    std::vector<int> P_o;
+
+    // node_mapping_flat[Q*_idx] = Q (physical qubit)
+    // Dense array indexed by Q* index
+    std::vector<int> node_mapping_flat;
+
+    // qbit_map: original circuit qubit q -> reduced qubit q*
+    std::vector<int> qbit_map_keys; // the q values; qbit_map_vals[i] belongs to qbit_map_keys[i]
+    std::vector<int> qbit_map_vals; // the q* values
+
+    // Original circuit qubits involved in this partition
+    std::vector<int> involved_qbits;
+
+    // Precomputed routing helpers (filled by prepare_candidate in sabre_router.cpp).
+    std::vector<int> P_i_inv; // inverse permutation of P_i
+    std::vector<int> P_o_inv; // inverse permutation of P_o
+    std::vector<int> qbit_map_keys_sorted; // qbit_map_keys in ascending order
+    std::vector<int> qbit_map_vals_sorted; // qbit_map_vals reordered to match qbit_map_keys_sorted
+    std::vector<int> qstar_to_q; // dense q* -> q table, initialised to -1 by prepare_candidate
+};
+
+struct CanonicalEntry {
+    std::vector<int> edges_u; // virtual qubit indices
+    std::vector<int> edges_v; // presumably paired element-wise with edges_u -- TODO confirm
+    int cnot = 0; // defaults to 0; presumably a CNOT count for the entry -- TODO confirm
+};
+
+struct LayoutPartInfo {
+    bool is_single = false; // read by SabreRouter::partition_is_single(); defaulted so it is never indeterminate
+    std::vector<int> involved_qbits; // qubits of the partition -- presumably original circuit qubits, TODO confirm
+};
+
+struct SabreConfig {
+    int prefilter_top_k = 50; // NOTE(review): presumably the top_k passed to prefilter_candidates -- confirm
+    int prefilter_min_per_partition = 2;
+    int prefilter_min_3q = 12;
+    int max_E_size = 20; // NOTE(review): presumably caps the extended set E -- confirm
+    int max_lookahead = 4;
+    double E_weight = 0.5;
+    double E_alpha = 1.0; // LightSABRE uses no per-depth decay; set <1 for SQUANDER-style decay
+    double cnot_cost = 1.0 / 3.0; // weight on candidate.cnot_count; swap cost is fixed at 1.0 (1 SWAP = 3 CNOTs)
+    int sabre_iterations = 1;
+    int n_layout_trials = 1;
+    int random_seed = 42;
+    double decay_delta = 0.001; // Qiskit LightSABRE DECAY_RATE
+    int swap_burst_budget = 5; // Qiskit LightSABRE DECAY_RESET_INTERVAL
+    double path_tiebreak_weight = 0.2;
+    double three_qubit_exit_weight = 1.0;
+    int boundary_beam_width = 1; // NOTE(review): presumably used by boundary_beam_select_index -- confirm
+    int boundary_beam_depth = 1;
+};
+
+struct RouteStep {
+    int type = 0; // 0=swap, 1=partition, 2=single
+    int partition_idx = -1; // -1 by default, i.e. not set
+    int candidate_idx = -1; // -1 by default, i.e. not set
+    int physical_qubit = -1; // -1 by default, i.e. not set
+    std::vector<std::pair<int,int>> swaps; // presumably the SWAP pairs of a type-0 step -- TODO confirm
+};
+
+struct ForwardRouteResult {
+    std::vector<int> pi_initial; // presumably the layout the forward pass started from -- TODO confirm
+    std::vector<int> pi; // presumably the layout after routing -- TODO confirm
+    int cnot_count = 0; // starts at 0
+    std::vector<RouteStep> steps; // recorded routing steps (see RouteStep)
+};
+
+struct TrialResult {
+    std::vector<int> pi; // layout produced by the trial (returned by SabreRouter::run_trial)
+    double total_cost = 0.0; // defaulted so a default-constructed result never carries an indeterminate cost
+};
+
+struct NeighborEdge {
+    int u_idx; // NOTE(review): presumably an index into NeighborInfo::neighbor_vqs -- confirm
+    int v_idx; // NOTE(review): presumably an index into NeighborInfo::neighbor_vqs -- confirm
+    double weight; // weight attached to this edge (no default value)
+};
+
+struct NeighborInfo {
+    std::vector<int> neighbor_vqs;
+    std::vector<int> initial_pos;
+    std::vector<NeighborEdge> edges;
+    double weight = 0.0;
+
+    // The tiebreak is in use only when there is at least one edge and the
+    // weight is strictly positive.
+    bool uses_tiebreak() const { return !edges.empty() && weight > 0.0; }
+};
+
+// ---------------------------------------------------------------------------
+// Swap cache key for deduplication within a single heuristic_search call
+// ---------------------------------------------------------------------------
+
+struct SwapCacheKey {
+    int64_t pi_snapshot;
+    int64_t targets;
+    int k;
+    // Stable hash of the NeighborInfo (edges, initial_pos, weight) that was
+    // active for the call, so that calls sharing the same future context share
+    // cache entries; 0 means the neighbor tiebreak was inactive.
+    uint64_t neighbor_hash;
+
+    bool operator==(const SwapCacheKey& other) const { // field-wise equality
+        if (k != other.k || neighbor_hash != other.neighbor_hash) return false;
+        return targets == other.targets && pi_snapshot == other.pi_snapshot;
+    }
+};
+
+struct SwapCacheKeyHash {
+    // hash_combine-style fold of the four key fields (additive constant plus shifts of the running hash).
+    size_t operator()(const SwapCacheKey& k) const {
+        size_t h = static_cast<size_t>(k.pi_snapshot);
+        for (const size_t part : {static_cast<size_t>(k.targets), static_cast<size_t>(k.k), static_cast<size_t>(k.neighbor_hash)})
+            h ^= part + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+        return h;
+    }
+};
+
+using SwapList = std::vector<std::pair<int,int>>;
+using SwapCache = std::unordered_map<SwapCacheKey, SwapList, SwapCacheKeyHash>;
+
+// ---------------------------------------------------------------------------
+// A* state packing helpers
+// ---------------------------------------------------------------------------
+
+// Encodes `positions` as a base-N number whose i-th digit (least significant
+// first) is positions[i]; k <= 4 values below N <= 64 fit easily in int64_t.
+inline int64_t pack_state(const std::vector<int>& positions, int N) {
+    int64_t packed = 0;
+    int64_t place_value = 1;
+    for (const int position : positions) {
+        packed += static_cast<int64_t>(position) * place_value;
+        place_value *= N;
+    }
+    return packed;
+}
+
+inline std::vector<int> unpack_state(int64_t packed, int k, int N) { // splits `packed` into k base-N digits
+    std::vector<int> digits(k);
+    for (int& digit : digits) { // least significant digit first
+        digit = static_cast<int>(packed % N);
+        packed /= N;
+    }
+    return digits;
+}
+
+// ---------------------------------------------------------------------------
+// SabreRouter class
+// ---------------------------------------------------------------------------
+
+class SabreRouter { // routing engine; every member function declared below except the constructor is const
+public:
+    SabreRouter( // all graph/table arguments are taken by value
+        const SabreConfig& config,
+        int N, // number of physical qubits (stored in N_)
+        std::vector<double> D, // flat N*N distance matrix, read row-major by dist()
+        std::vector<std::vector<int>> adj,
+        std::vector<std::vector<int>> DAG,
+        std::vector<std::vector<int>> IDAG,
+        std::vector<std::vector<CandidateData>> candidate_cache,
+        std::vector<LayoutPartInfo> layout_partitions,
+        std::unordered_map<int, CanonicalEntry> canonical_data_fwd,
+        std::unordered_map<int, CanonicalEntry> canonical_data_rev
+    );
+
+    // Thread-safe: all mutable state is stack-local
+    ForwardRouteResult route_forward(
+        const std::vector<int>& pi
+    ) const;
+
+    TrialResult run_trial( // NOTE(review): presumably runs one layout trial of n_iterations passes -- confirm
+        int trial_idx,
+        const std::vector<int>& seeded_pi,
+        int n_iterations,
+        int n_trials
+    ) const;
+
+private:
+    // Distance lookup (flat row-major)
+    inline double dist(int phys_u, int phys_v) const {
+        return D_[phys_u * N_ + phys_v]; // no bounds checking
+    }
+
+    // Main heuristic search loop.
+    // children_graph/parents_graph are swapped for backward passes.
+    std::pair<std::vector<int>, double> heuristic_search(
+        const std::vector<int>& F_init,
+        std::vector<int> pi,
+        bool reverse,
+        std::mt19937* rng, // NOTE(review): a pointer, so presumably nullable -- confirm
+        const std::unordered_map<int, CanonicalEntry>& canonical_data,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph,
+        ForwardRouteResult* route_trace = nullptr // optional output, nullptr by default
+    ) const;
+
+    // A* constrained swap search over the k-dimensional partition state space.
+    std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+    find_constrained_swaps(
+        const std::vector<int>& pi,
+        const std::vector<int>& qbit_map_keys,
+        const std::vector<int>& qbit_map_vals,
+        const std::vector<int>& node_mapping_flat,
+        const std::vector<int>& P_route_inv,
+        SwapCache* swap_cache,
+        const NeighborInfo* neighbor_info = nullptr
+    ) const;
+
+    // Lower-bound swap estimate for routing the candidate's partition qubits.
+    int estimate_swap_count(
+        const CandidateData& cand,
+        const std::vector<int>& pi,
+        bool reverse
+    ) const;
+
+    // BFS lookahead: multi-qubit partitions near the front layer.
+    std::vector<std::pair<int,int>> generate_extended_set(
+        const std::vector<int>& F,
+        const std::vector<uint8_t>& resolved,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph
+    ) const;
+
+    // LightSABRE relative scoring (arXiv:2409.08368, eq. 1).
+    double score_candidate(
+        const CandidateData& cand,
+        const std::vector<int>& F_snapshot,
+        const std::vector<int>& pi,
+        const std::vector<std::pair<int,int>>& E,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data,
+        SwapCache* swap_cache,
+        const std::vector<double>* decay = nullptr,
+        std::vector<std::pair<int,int>>* out_swaps = nullptr, // optional output, nullptr by default
+        std::vector<int>* out_pi_new = nullptr, // optional output, nullptr by default
+        const NeighborInfo* cached_neighbor_info = nullptr
+    ) const;
+
+    // Route a candidate's partition qubits to their input positions and
+    // update pi for the exit positions.
+    std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+    transform_pi(
+        const CandidateData& cand,
+        const std::vector<int>& pi,
+        bool reverse,
+        SwapCache* swap_cache,
+        const NeighborInfo* neighbor_info = nullptr
+    ) const;
+
+    NeighborInfo build_neighbor_info(
+        int exclude_partition_idx,
+        const std::vector<int>& F_snapshot,
+        const std::vector<std::pair<int,int>>& E,
+        const std::vector<int>& pi,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    double decay_factor_for_swaps(
+        const std::vector<std::pair<int,int>>& swaps,
+        const std::vector<double>& decay
+    ) const;
+
+    double routing_objective(
+        double route_cost,
+        int cnot_count,
+        double cnot_weight = 1.0,
+        double decay_factor = 1.0
+    ) const;
+
+    double future_partition_cost(
+        int partition_idx,
+        const std::vector<int>& pi,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    void apply_decay_for_swaps( // `decay` is a non-const reference, i.e. updated in place
+        const std::vector<std::pair<int,int>>& swaps,
+        std::vector<double>& decay
+    ) const;
+
+    void reset_decay(std::vector<double>& decay) const; // `decay` is updated in place
+
+    std::vector<int> bfs_shortest_path(int src, int dst) const;
+
+    std::pair<std::vector<std::pair<int,int>>, std::vector<int>> release_valve(
+        const std::vector<int>& F,
+        const std::vector<int>& pi,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    // Apply a list of SWAPs to pi
+    std::vector<int> apply_swaps_to_pi(
+        const std::vector<int>& pi,
+        const std::vector<std::pair<int,int>>& swaps
+    ) const;
+
+    // Get initial layer (partitions with no unresolved parents)
+    std::vector<int> get_initial_layer() const;
+
+    // Get final layer (partitions with no children)
+    std::vector<int> get_final_layer() const;
+
+    // Prefilter candidates by cheap swap estimate
+    std::vector<const CandidateData*> prefilter_candidates(
+        const std::vector<const CandidateData*>& candidates,
+        const std::vector<int>& pi,
+        int top_k,
+        const std::vector<int>& F_snapshot,
+        const std::vector<std::pair<int,int>>& E,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    // Select best candidate with optional stochastic tie-breaking
+    const CandidateData& select_best_candidate(
+        const std::vector<const CandidateData*>& candidates,
+        const std::vector<double>& scores,
+        std::mt19937* rng
+    ) const;
+
+    std::pair<std::vector<int>, std::vector<uint8_t>> advance_layout_frontier(
+        int selected_partition_idx,
+        const std::vector<int>& F,
+        const std::vector<uint8_t>& resolved,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph
+    ) const;
+
+    size_t boundary_beam_select_index( // returns a size_t, presumably an index into `candidates` -- TODO confirm
+        const std::vector<const CandidateData*>& candidates,
+        const std::vector<double>& scores,
+        const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
+        const std::vector<std::vector<int>>& cached_pi,
+        const std::vector<int>& F_snapshot,
+        const std::vector<uint8_t>& resolved,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data,
+        SwapCache* swap_cache
+    ) const;
+
+    // Check if partition is single-qubit
+    inline bool partition_is_single(int partition_idx) const {
+        return layout_partitions_[partition_idx].is_single; // no bounds checking
+    }
+
+    // Gather all candidates for partitions in F
+    std::vector<const CandidateData*> obtain_partition_candidates(
+        const std::vector<int>& F
+    ) const;
+
+    // Random permutation of [0..N-1]
+    std::vector<int> random_permutation(int n, std::mt19937& rng) const;
+
+    // Initial-layout sampling: trial 0 uses the seed, later trials are random.
+    std::vector<int> sample_initial_layout(
+        int trial_idx,
+        int n_trials,
+        const std::vector<int>& seeded_pi,
+        std::mt19937& rng
+    ) const;
+
+    double entry_future_cost(
+        const CanonicalEntry& entry,
+        const std::vector<int>& pi
+    ) const;
+
+    double future_context_cost(
+        int exclude_partition_idx,
+        const std::vector<int>& pi,
+        const std::vector<int>& F_snapshot,
+        const std::vector<std::pair<int,int>>& E,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    std::vector<int> estimate_candidate_output_layout(
+        const CandidateData& cand,
+        const std::vector<int>& pi,
+        bool reverse
+    ) const;
+
+    // Immutable data members
+    SabreConfig config_;
+    int N_; // number of physical qubits
+    int num_partitions_; // NOTE(review): no default value; presumably set by the constructor -- confirm
+    std::vector<double> D_; // flat N*N distance matrix (owned copy)
+    std::vector<std::vector<int>> adj_;
+    // CSR view of adj_ for tight inner loops
+    std::vector<int> adj_offsets_;
+    std::vector<int> adj_flat_;
+    std::vector<std::vector<int>> DAG_; // presumably the partition dependency graph -- TODO confirm
+    std::vector<std::vector<int>> IDAG_; // presumably the inverse of DAG_ -- TODO confirm
+    std::vector<std::vector<CandidateData>> candidate_cache_;
+    std::vector<LayoutPartInfo> layout_partitions_; // indexed by partition index (see partition_is_single)
+    std::unordered_map<int, CanonicalEntry> canonical_data_fwd_;
+    std::unordered_map<int, CanonicalEntry> canonical_data_rev_;
+    std::vector<double> alpha_weights_; // presumably derived from config_.E_alpha -- TODO confirm
+    double max_finite_distance_ = 1.0; // defaults to 1.0; presumably the largest finite entry of D_ -- TODO confirm
+};
+
+} // namespace squander::routing
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
new file mode 100644
index 000000000..8f1770ae5
--- /dev/null
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -0,0 +1,1932 @@
+/*
+Copyright 2025 SQUANDER Contributors
+
+C++ backend for the SABRE-style partition-aware routing engine.
+*/
+
+#include "sabre_router.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <cstring>
+#include <deque>
+#include <functional>
+#include <initializer_list>
+#include <limits>
+#include <numeric>
+#include <queue>
+#include <random>
+#include <stdexcept>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace squander::routing {
+
+namespace {
+
+// Build the inverse map: result[P[i]] == i (entries of P are used as indices, unchecked).
+std::vector<int> invert_permutation(const std::vector<int>& P) {
+    std::vector<int> inverse(P.size());
+    int source = 0;
+    for (const int image : P) inverse[image] = source++;
+    return inverse;
+}
+
+// Fills the derived lookup tables of `cand`: inverse input/output permutations,
+// the qubit map re-ordered by ascending key, and the dense q* -> q table.
+void prepare_candidate(CandidateData& cand) {
+    cand.P_i_inv = invert_permutation(cand.P_i);
+    cand.P_o_inv = invert_permutation(cand.P_o);
+
+    const auto& keys = cand.qbit_map_keys;
+    const auto& vals = cand.qbit_map_vals;
+    const int n_entries = static_cast<int>(keys.size());
+
+    // Sort entry indices (not the entries) so keys and values stay paired.
+    std::vector<int> by_key(n_entries);
+    std::iota(by_key.begin(), by_key.end(), 0);
+    std::sort(by_key.begin(), by_key.end(),
+              [&keys](int lhs, int rhs) { return keys[lhs] < keys[rhs]; });
+
+    cand.qbit_map_keys_sorted.resize(n_entries);
+    cand.qbit_map_vals_sorted.resize(n_entries);
+    int largest_qstar = -1;
+    for (int slot = 0; slot < n_entries; ++slot) {
+        const int from = by_key[slot];
+        const int qstar = vals[from];
+        cand.qbit_map_keys_sorted[slot] = keys[from];
+        cand.qbit_map_vals_sorted[slot] = qstar;
+        largest_qstar = std::max(largest_qstar, qstar);
+    }
+
+    // Size the table to cover every q* seen and every permutation/node-mapping slot.
+    int table_size = largest_qstar + 1;
+    table_size = std::max(table_size, static_cast<int>(cand.P_i.size()));
+    table_size = std::max(table_size, static_cast<int>(cand.P_o.size()));
+    table_size = std::max(table_size, static_cast<int>(cand.node_mapping_flat.size()));
+    cand.qstar_to_q.assign(table_size, -1);
+    for (size_t entry = 0; entry < keys.size(); ++entry) {
+        const int qstar = vals[entry];
+        if (qstar < 0) continue;  // a negative q* has no slot in the table
+        cand.qstar_to_q[qstar] = keys[entry];
+    }
+}
+
+inline void unpack_state_into(int64_t packed, int k, int N, std::vector<int>& positions) {
+    positions.resize(k);  // positions[i] receives the i-th base-N digit of `packed`
+    for (int& digit : positions) {
+        digit = static_cast<int>(packed % N);
+        packed /= N;
+    }
+}
+
+} // namespace
+
+// ---------------------------------------------------------------------------
+// Constructor
+// ---------------------------------------------------------------------------
+
+// Takes ownership of the routing inputs and precomputes the CSR adjacency view,
+// per-candidate lookup tables, lookahead weights and the largest finite distance.
+// Throws std::invalid_argument when N < 0, D is not N*N long, or adj has < N rows.
+SabreRouter::SabreRouter(
+    const SabreConfig& config,
+    int N,
+    std::vector<double> D,
+    std::vector<std::vector<int>> adj,
+    std::vector<std::vector<int>> DAG,
+    std::vector<std::vector<int>> IDAG,
+    std::vector<std::vector<CandidateData>> candidate_cache,
+    std::vector<LayoutPartInfo> layout_partitions,
+    std::unordered_map<int, CanonicalEntry> canonical_data_fwd,
+    std::unordered_map<int, CanonicalEntry> canonical_data_rev
+)
+    : config_(config)
+    , N_(N)
+    , num_partitions_(static_cast<int>(DAG.size()))
+    , D_(std::move(D))
+    , adj_(std::move(adj))
+    , DAG_(std::move(DAG))
+    , IDAG_(std::move(IDAG))
+    , candidate_cache_(std::move(candidate_cache))
+    , layout_partitions_(std::move(layout_partitions))
+    , canonical_data_fwd_(std::move(canonical_data_fwd))
+    , canonical_data_rev_(std::move(canonical_data_rev))
+{
+    if (N_ < 0) throw std::invalid_argument("Number of physical qubits N must be non-negative");
+    // Compare in size_t: the product N_ * N_ computed as int overflows for large N.
+    if (D_.size() != static_cast<size_t>(N_) * static_cast<size_t>(N_)) {
+        throw std::invalid_argument("Distance matrix D must be N x N");
+    }
+    // The CSR build reads adj_[0..N-1]; a shorter list would be read out of bounds.
+    if (adj_.size() < static_cast<size_t>(N_)) {
+        throw std::invalid_argument("Adjacency list adj must have at least N rows");
+    }
+
+    // Build CSR view of adj_: row i occupies adj_flat_[adj_offsets_[i], adj_offsets_[i + 1]).
+    adj_offsets_.assign(N_ + 1, 0);
+    for (int i = 0; i < N_; i++) {
+        adj_flat_.insert(adj_flat_.end(), adj_[i].begin(), adj_[i].end());
+        adj_offsets_[i + 1] = static_cast<int>(adj_flat_.size());
+    }
+    for (auto& partition_candidates : candidate_cache_) {
+        for (auto& cand : partition_candidates) {
+            prepare_candidate(cand);
+        }
+    }
+
+    // alpha_weights_[d] = E_alpha^d for d in [0, max(0, max_lookahead)].
+    const int max_depth = std::max(0, config_.max_lookahead);
+    alpha_weights_.assign(max_depth + 1, 1.0);
+    for (int depth = 1; depth <= max_depth; depth++) {
+        alpha_weights_[depth] = alpha_weights_[depth - 1] * config_.E_alpha;
+    }
+
+    max_finite_distance_ = 1.0;  // largest finite entry of D_, never below 1.0
+    for (double d : D_) {
+        if (std::isfinite(d) && d > max_finite_distance_) max_finite_distance_ = d;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Helper: random permutation
+// ---------------------------------------------------------------------------
+
+std::vector<int> SabreRouter::random_permutation(int n, std::mt19937& rng) const {
+    std::vector<int> order(n);
+    std::iota(std::begin(order), std::end(order), 0);       // identity 0..n-1
+    std::shuffle(std::begin(order), std::end(order), rng);  // advances the caller's rng
+    return order;
+}
+
+// Trial 0 (or a run with at most one trial) reuses the seeded layout unchanged;
+// every other trial draws a fresh random permutation of the N_ physical qubits.
+std::vector<int> SabreRouter::sample_initial_layout(
+    int trial_idx,
+    int n_trials,
+    const std::vector<int>& seeded_pi,
+    std::mt19937& rng
+) const {
+    const bool randomize = n_trials > 1 && trial_idx != 0;
+    if (randomize) return random_permutation(N_, rng);
+    return seeded_pi;
+}
+
+// ---------------------------------------------------------------------------
+// apply_swaps_to_pi
+// ---------------------------------------------------------------------------
+
+// Copy of `pi` after exchanging, in order, the occupants of each physical pair in `swaps`.
+std::vector<int> SabreRouter::apply_swaps_to_pi(
+    const std::vector<int>& pi,
+    const std::vector<std::pair<int,int>>& swaps
+) const {
+    // Reusable physical -> virtual scratch table; grown on demand, never shrunk.
+    thread_local std::vector<int> occupant;
+    if (static_cast<int>(occupant.size()) < N_) occupant.assign(N_, 0);
+    std::vector<int> updated = pi;
+    for (int v = 0; v < N_; ++v) occupant[updated[v]] = v;
+    for (const auto& swap_pair : swaps) {
+        const int phys_a = swap_pair.first;
+        const int phys_b = swap_pair.second;
+        std::swap(occupant[phys_a], occupant[phys_b]);
+        updated[occupant[phys_b]] = phys_b;
+        updated[occupant[phys_a]] = phys_a;
+    }
+    return updated;
+}
+
+NeighborInfo SabreRouter::build_neighbor_info(  // collects the weighted qubit-pair graph used for tie-breaking
+    int exclude_partition_idx,                   // partition whose edges are left out
+    const std::vector<int>& F_snapshot,          // partitions contributing edges at weight 1.0
+    const std::vector<std::pair<int,int>>& E,    // (partition, depth) pairs, weighted E_weight * E_alpha^depth
+    const std::vector<int>& pi,                  // pi[q] is stored as the initial position of each collected qubit q
+    const std::unordered_map<int, CanonicalEntry>& canonical_data  // per-partition edge lists (edges_u / edges_v)
+) const {
+    NeighborInfo info;
+    info.weight = config_.path_tiebreak_weight;
+    if (info.weight <= 0.0) {
+        return info;  // non-positive weight: return before any qubit or edge is collected
+    }
+
+    // Per-call scratch via thread_local, reset by tracking touched entries
+    thread_local std::vector<int> q_to_idx;   // qubit -> index into info.neighbor_vqs, -1 when absent
+    thread_local std::vector<int> q_touched;  // qubits whose q_to_idx entry was set during this call
+    if (static_cast<int>(q_to_idx.size()) < N_) q_to_idx.assign(N_, -1);
+    q_touched.clear();
+
+    auto ensure_qubit = [&](int q) -> int {  // index of q in info.neighbor_vqs, registering q on first use
+        int idx = q_to_idx[q];
+        if (idx >= 0) return idx;
+        idx = static_cast<int>(info.neighbor_vqs.size());
+        q_to_idx[q] = idx;
+        q_touched.push_back(q);
+        info.neighbor_vqs.push_back(q);
+        info.initial_pos.push_back(pi[q]);  // initial_pos stays index-aligned with neighbor_vqs
+        return idx;
+    };
+
+    // edges: parallel arrays keyed by (lo, hi) — small linear scan dedup
+    thread_local std::vector<int> ekey_lo;
+    thread_local std::vector<int> ekey_hi;
+    thread_local std::vector<int> eu_idx;
+    thread_local std::vector<int> ev_idx;
+    thread_local std::vector<double> ew;
+    ekey_lo.clear(); ekey_hi.clear();
+    eu_idx.clear(); ev_idx.clear(); ew.clear();
+
+    auto add_edge = [&](int u, int v, double weight) {  // add weight to the unordered pair {u, v}
+        const int u_idx = ensure_qubit(u);
+        const int v_idx = ensure_qubit(v);
+        const int lo = std::min(u, v);
+        const int hi = std::max(u, v);
+        for (size_t i = 0; i < ekey_lo.size(); i++) {
+            if (ekey_lo[i] == lo && ekey_hi[i] == hi) {
+                ew[i] += weight;  // pair already present: accumulate instead of duplicating
+                return;
+            }
+        }
+        ekey_lo.push_back(lo);
+        ekey_hi.push_back(hi);
+        eu_idx.push_back(u_idx);  // endpoint indices keep the orientation of the first insertion
+        ev_idx.push_back(v_idx);
+        ew.push_back(weight);
+    };
+
+    auto add_partition_edges = [&](int partition_idx, double weight) {
+        if (partition_idx == exclude_partition_idx || weight <= 0.0) return;
+        if (
+            partition_idx < 0
+            || partition_idx >= static_cast<int>(layout_partitions_.size())
+        ) return;
+        auto canonical_it = canonical_data.find(partition_idx);
+        if (canonical_it != canonical_data.end()
+            && !canonical_it->second.edges_u.empty()
+        ) {
+            const auto& entry = canonical_it->second;  // preferred source: the entry's explicit edge list
+            for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                add_edge(entry.edges_u[i], entry.edges_v[i], weight);
+            }
+            return;
+        }
+
+        const auto& involved = layout_partitions_[partition_idx].involved_qbits;
+        if (involved.size() < 2) return;
+        for (size_t i = 0; i < involved.size(); i++) {  // fallback: connect every pair of involved qubits
+            for (size_t j = i + 1; j < involved.size(); j++) {
+                add_edge(involved[i], involved[j], weight);
+            }
+        }
+    };
+
+    for (int partition_idx : F_snapshot) {
+        add_partition_edges(partition_idx, 1.0);
+    }
+    for (auto [partition_idx, depth] : E) {
+        const double alpha =
+            (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+                ? alpha_weights_[depth]
+                : std::pow(config_.E_alpha, depth);  // depth outside the precomputed table
+        add_partition_edges(partition_idx, config_.E_weight * alpha);
+    }
+
+    info.edges.reserve(ew.size());
+    for (size_t i = 0; i < ew.size(); i++) {
+        info.edges.push_back(NeighborEdge{eu_idx[i], ev_idx[i], ew[i]});
+    }
+
+    // Reset q_to_idx via touched-list (avoids O(N) clear)
+    for (int q : q_touched) q_to_idx[q] = -1;
+
+    return info;
+}
+
+double SabreRouter::decay_factor_for_swaps(
+    const std::vector<std::pair<int,int>>& swaps,
+    const std::vector<double>& decay
+) const {
+    // Start from the 1.0 floor and keep the largest endpoint decay seen.
+    return std::accumulate(swaps.begin(), swaps.end(), 1.0,
+        [&decay](double peak, const std::pair<int,int>& s) {
+            return std::max(peak, std::max(decay[s.first], decay[s.second]));
+        });
+}
+
+double SabreRouter::routing_objective(
+    double route_cost,
+    int cnot_count,
+    double cnot_weight,
+    double decay_factor
+) const {
+    // The weighted CNOT penalty is added to the route cost before decay scaling.
+    const double per_cnot = cnot_weight * config_.cnot_cost;
+    const double cnot_penalty = per_cnot * static_cast<double>(cnot_count);
+    return decay_factor * (route_cost + cnot_penalty);
+}
+
+// Adds config_.decay_delta to the decay entry of both endpoints of every swap
+// (an endpoint that appears twice is bumped twice).
+void SabreRouter::apply_decay_for_swaps(
+    const std::vector<std::pair<int,int>>& swaps,
+    std::vector<double>& decay
+) const {
+    const auto delta = config_.decay_delta;
+    if (delta <= 0.0) return;  // a non-positive delta disables decay
+    for (const auto& s : swaps) {
+        for (const int endpoint : {s.first, s.second}) decay[endpoint] += delta;
+    }
+}
+
+void SabreRouter::reset_decay(std::vector<double>& decay) const {
+    decay.assign(decay.size(), 1.0);  // same length, every entry back to 1.0
+}
+
+// Breadth-first search over adj_ from node `src` to node `dst`. Returns the
+// node sequence src..dst inclusive ({src} when they coincide), or an empty
+// vector when `dst` cannot be reached. Neighbours are explored in adj_ order.
+std::vector<int> SabreRouter::bfs_shortest_path(int src, int dst) const {
+    if (src == dst) return {src};
+
+    // predecessor[x] == -1 marks "not reached yet"; src is its own predecessor.
+    std::vector<int> predecessor(N_, -1);
+    predecessor[src] = src;
+    std::queue<int> frontier;
+    frontier.push(src);
+
+    bool reached = false;
+    while (!reached && !frontier.empty()) {
+        const int here = frontier.front();
+        frontier.pop();
+        for (const int next : adj_[here]) {
+            if (predecessor[next] != -1) continue;
+            predecessor[next] = here;
+            if (next == dst) {
+                reached = true;
+                break;
+            }
+            frontier.push(next);
+        }
+    }
+    if (!reached) return {};
+
+    // Walk predecessors back from dst, then flip into src -> dst order.
+    std::vector<int> path;
+    for (int node = dst; node != src; node = predecessor[node]) {
+        path.push_back(node);
+    }
+    path.push_back(src);
+    std::reverse(path.begin(), path.end());
+    return path;
+}
+
+std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::release_valve(  // returns (swaps, layout after swaps)
+    const std::vector<int>& F,   // partitions to inspect
+    const std::vector<int>& pi,  // current layout; pi[q] is the position used for distances
+    const std::unordered_map<int, CanonicalEntry>& canonical_data  // supplies each partition's edges_u / edges_v
+) const {
+    double best_worst_dist = -std::numeric_limits<double>::infinity();
+    int best_partition_idx = -1;
+    int best_u = -1;  // endpoints of the farthest-apart edge chosen so far; -1 until one qualifies
+    int best_v = -1;
+
+    for (int partition_idx : F) {
+        auto it = canonical_data.find(partition_idx);
+        if (it == canonical_data.end()) continue;
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) continue;
+
+        double worst_dist = 0.0;  // largest dist(pi[u], pi[v]) over this partition's edges
+        int worst_u = -1;
+        int worst_v = -1;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            const int u = entry.edges_u[i];
+            const int v = entry.edges_v[i];
+            const double d = dist(pi[u], pi[v]);
+            if (d > worst_dist) {
+                worst_dist = d;
+                worst_u = u;
+                worst_v = v;
+            }
+        }
+
+        if (worst_dist <= 1.0 || worst_u < 0) continue;  // no edge of this partition is farther apart than 1
+
+        if (
+            worst_dist > best_worst_dist
+            || (worst_dist == best_worst_dist
+                && (best_partition_idx < 0 || partition_idx < best_partition_idx))  // ties go to the lower partition index
+        ) {
+            best_worst_dist = worst_dist;
+            best_partition_idx = partition_idx;
+            best_u = worst_u;
+            best_v = worst_v;
+        }
+    }
+
+    if (best_u < 0) {
+        return {{}, pi};  // nothing qualified: no swaps, layout unchanged
+    }
+
+    const auto path = bfs_shortest_path(pi[best_u], pi[best_v]);
+    if (path.size() < 2) {
+        return {{}, pi};  // includes the empty path returned when no route exists
+    }
+
+    const int k = static_cast<int>(path.size()) - 1;  // number of hops along the path
+    const int m = k / 2;
+    std::vector<std::pair<int,int>> swaps;
+    for (int i = 0; i < m; i++) {  // walk the path[0] occupant forward to path[m]
+        swaps.push_back({path[i], path[i + 1]});
+    }
+    for (int i = k; i > m + 1; i--) {  // walk the path[k] occupant backward to path[m + 1]
+        swaps.push_back({path[i], path[i - 1]});
+    }
+
+    auto pi_new = apply_swaps_to_pi(pi, swaps);
+    return {swaps, pi_new};
+}
+
+// ---------------------------------------------------------------------------
+// get_initial_layer / get_final_layer
+// ---------------------------------------------------------------------------
+
+// Indices p whose IDAG_[p] row is empty, in ascending order.
+std::vector<int> SabreRouter::get_initial_layer() const {
+    std::vector<int> roots;
+    for (int p = 0; p < num_partitions_; ++p)
+        if (IDAG_[p].empty()) roots.emplace_back(p);
+    return roots;
+}
+
+// Indices p whose DAG_[p] row is empty, listed from the highest index down.
+std::vector<int> SabreRouter::get_final_layer() const {
+    std::vector<int> leaves;
+    for (int p = num_partitions_; p-- > 0;)
+        if (DAG_[p].empty()) leaves.emplace_back(p);
+    return leaves;
+}
+
+// ---------------------------------------------------------------------------
+// estimate_swap_count
+// ---------------------------------------------------------------------------
+
+// Rough SWAP estimate for routing `cand` under layout `pi`: half the summed
+// finite distance from each mapped qubit's current position to its target
+// position, truncated to int. Candidates without a multi-qubit body cost 0.
+int SabreRouter::estimate_swap_count(
+    const CandidateData& cand,
+    const std::vector<int>& pi,
+    bool reverse
+) const {
+    if (!cand.has_multi_qubit_body) return 0;
+
+    // `reverse` selects the inverse output permutation instead of the input one.
+    const auto& route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
+    const size_t n_mapped = cand.qbit_map_keys.size();
+    constexpr double kInf = std::numeric_limits<double>::infinity();
+
+    double distance_sum = 0.0;
+    for (size_t m = 0; m < n_mapped; ++m) {
+        const int qstar = cand.qbit_map_vals[m];
+        const int goal = cand.node_mapping_flat[route_inv[qstar]];
+        const double hop = dist(pi[cand.qbit_map_keys[m]], goal);
+        if (hop < kInf) distance_sum += hop;
+    }
+    return static_cast<int>(distance_sum / 2.0);
+}
+
+// ---------------------------------------------------------------------------
+// find_constrained_swaps (A* over k-dimensional state space)
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<std::pair<int,int>>, std::vector<int>>  // (swap list, layout after applying it to pi)
+SabreRouter::find_constrained_swaps(  // best-first (A*) search for SWAPs moving the given qubits onto their targets
+    const std::vector<int>& pi,                 // current layout; pi[q] is the starting position of qubit q
+    const std::vector<int>& qbit_map_keys,      // the k qubits to route
+    const std::vector<int>& qbit_map_vals,      // q* value paired with each key
+    const std::vector<int>& node_mapping_flat,  // target of entry i is node_mapping_flat[P_route_inv[vals[i]]]
+    const std::vector<int>& P_route_inv,
+    SwapCache* swap_cache,                      // optional memo of earlier results; may be null
+    const NeighborInfo* neighbor_info           // optional tie-break data; may be null
+) const {
+    const int k = static_cast<int>(qbit_map_keys.size());
+
+    // ---- Setup: target/initial positions, pow_N, h0 ----
+    thread_local std::vector<int> target_positions;
+    thread_local std::vector<int> initial_positions;
+    thread_local std::vector<int64_t> pow_N;
+    target_positions.resize(k);
+    initial_positions.resize(k);
+    pow_N.resize(k);
+    {
+        int64_t s = 1;
+        for (int i = 0; i < k; i++) { pow_N[i] = s; s *= N_; }  // pow_N[i] = N_^i, place value of entry i in a packed state
+    }
+
+    bool already_there = true;
+    double h0_sum = 0.0;
+    int64_t initial_packed = 0;
+    int64_t target_packed = 0;
+    for (int i = 0; i < k; i++) {
+        const int q = qbit_map_keys[i];
+        const int v = qbit_map_vals[i];
+        const int t = node_mapping_flat[P_route_inv[v]];
+        const int ip = pi[q];
+        target_positions[i] = t;
+        initial_positions[i] = ip;
+        if (ip != t) already_there = false;
+        h0_sum += dist(ip, t);
+        initial_packed += static_cast<int64_t>(ip) * pow_N[i];
+        target_packed  += static_cast<int64_t>(t)  * pow_N[i];
+    }
+    if (already_there) {
+        return {{}, pi};  // every qubit already sits on its target: no swaps, layout unchanged
+    }
+
+    const bool use_neighbor =
+        neighbor_info != nullptr && neighbor_info->uses_tiebreak();
+
+    auto mix64 = [](uint64_t h, uint64_t v) -> uint64_t {  // hash-combine step for the neighbor fingerprint below
+        h ^= v + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+        return h;
+    };
+
+    uint64_t neighbor_hash = 0;  // stays 0 when the tie-breaker is off
+    if (use_neighbor) {
+        neighbor_hash = 0xcbf29ce484222325ULL;
+        for (const auto& edge : neighbor_info->edges) {
+            const int lo = std::min(edge.u_idx, edge.v_idx);
+            const int hi = std::max(edge.u_idx, edge.v_idx);
+            uint64_t w_bits;
+            std::memcpy(&w_bits, &edge.weight, sizeof(w_bits));  // hash the raw bit pattern of the double
+            neighbor_hash = mix64(neighbor_hash, static_cast<uint64_t>(lo));
+            neighbor_hash = mix64(neighbor_hash, static_cast<uint64_t>(hi));
+            neighbor_hash = mix64(neighbor_hash, w_bits);
+        }
+        for (int p : neighbor_info->initial_pos) {
+            neighbor_hash = mix64(neighbor_hash, static_cast<uint64_t>(p));
+        }
+        uint64_t weight_bits;
+        const double weight_val = neighbor_info->weight;
+        std::memcpy(&weight_bits, &weight_val, sizeof(weight_bits));
+        neighbor_hash = mix64(neighbor_hash, weight_bits);
+    }
+
+    const SwapCacheKey cache_key{initial_packed, target_packed, k, neighbor_hash};
+
+    if (swap_cache) {
+        auto it = swap_cache->find(cache_key);
+        if (it != swap_cache->end()) {
+            auto result_pi = apply_swaps_to_pi(pi, it->second);  // replay the cached swap list on the caller's layout
+            return {it->second, result_pi};
+        }
+    }
+
+    // ---- Neighbor heuristic setup ----
+    double total_edge_weight = 0.0;
+    if (use_neighbor) {
+        for (const auto& edge : neighbor_info->edges) {
+            total_edge_weight += edge.weight;
+        }
+    }
+    const double neighbor_norm = std::max(
+        1.0, total_edge_weight * std::max(1.0, max_finite_distance_)
+    );
+    const double neighbor_scale =
+        use_neighbor ? (neighbor_info->weight / neighbor_norm) : 0.0;  // 0.0 removes the neighbor term from f
+
+    auto compute_nb_total = [&](const std::vector<int>& pos_nb) {  // weighted sum of neighbor-edge distances
+        double total = 0.0;
+        for (const auto& edge : neighbor_info->edges) {
+            total += edge.weight * dist(pos_nb[edge.u_idx], pos_nb[edge.v_idx]);
+        }
+        return total;
+    };
+
+    double initial_nb_total = 0.0;
+    if (use_neighbor) {
+        initial_nb_total = compute_nb_total(neighbor_info->initial_pos);
+    }
+
+    // ---- Arena + best-state table (replaces visited+parent maps) ----
+    struct Node {
+        int64_t packed;
+        int parent_idx;
+        int g;
+        int sw_lo, sw_hi;
+        double h_sum;       // sum(dist(pos[i], target[i])) — twice the admissible h
+        double nb_total;    // sum(edge.weight * dist(...)) — pre-scale
+        int nb_arena_idx;   // -1 if !use_neighbor; else slot in nb_pos_flat
+        uint64_t nb_hash;   // incremental XOR hash of neighbor VQ positions
+    };
+    thread_local std::vector<Node> arena;
+    // Flat storage for neighbor positions: slot s lives at
+    // [s * nb_stride, (s+1) * nb_stride). Slots are shared across nodes whose
+    // swap doesn't touch any neighbor virtual qubit.
+    thread_local std::vector<int> nb_pos_flat;
+    thread_local std::vector<std::vector<int>> vq_edges;
+    thread_local std::vector<int> nb_scratch;
+    arena.clear();
+    nb_pos_flat.clear();
+    arena.reserve(1024);
+    // key = mix(packed) ^ nb_hash; no heap allocation per lookup. NOTE(review): the map is keyed by this 64-bit hash alone, so two distinct states that collide would be merged — confirm that is acceptable.
+    thread_local std::unordered_map<uint64_t, int32_t> best_node;
+    best_node.clear();
+    best_node.reserve(2048);
+
+    const int nb_stride = use_neighbor
+        ? static_cast<int>(neighbor_info->neighbor_vqs.size())
+        : 0;
+    if (use_neighbor) {
+        // Per-vq edge index list: which edges touch each virtual qubit.
+        vq_edges.assign(nb_stride, {});
+        for (int e = 0; e < static_cast<int>(neighbor_info->edges.size()); e++) {
+            const auto& edge = neighbor_info->edges[e];
+            vq_edges[edge.u_idx].push_back(e);
+            if (edge.v_idx != edge.u_idx) {
+                vq_edges[edge.v_idx].push_back(e);
+            }
+        }
+        nb_pos_flat.reserve(static_cast<size_t>(nb_stride) * 1024);
+        nb_pos_flat.insert(nb_pos_flat.end(),
+                           neighbor_info->initial_pos.begin(),
+                           neighbor_info->initial_pos.end());
+        nb_scratch.resize(nb_stride);
+    }
+
+    // Per-(vq_idx, phys) contribution to nb_hash; XOR-based so removals are
+    // identical to additions (self-inverse), enabling incremental updates.
+    auto slot_hash = [](int vq_idx, int phys) -> uint64_t {
+        uint64_t h = static_cast<uint64_t>(vq_idx) * 0x9e3779b97f4a7c15ULL
+                   ^ static_cast<uint64_t>(phys)   * 0x6c62272e07bb0142ULL;
+        h ^= h >> 33; h *= 0xff51afd7ed558ccdULL; h ^= h >> 33;
+        return h;
+    };
+    auto make_key = [](int64_t packed, uint64_t nb_hash) -> uint64_t {  // best_node key for a (packed state, neighbor hash) pair
+        uint64_t h = static_cast<uint64_t>(packed);
+        h ^= h >> 33; h *= 0xff51afd7ed558ccdULL;
+        h ^= h >> 33; h *= 0xc4ceb9fe1a85ec53ULL;
+        h ^= h >> 33;
+        return h ^ nb_hash;
+    };
+
+    uint64_t initial_nb_hash = 0;
+    if (use_neighbor) {
+        for (int z = 0; z < nb_stride; z++) {
+            initial_nb_hash ^= slot_hash(z, neighbor_info->initial_pos[z]);
+        }
+    }
+
+    // ---- Push initial node ----
+    // Slot 0 of nb_pos_flat already holds neighbor_info->initial_pos.
+    {
+        Node n;
+        n.packed = initial_packed;
+        n.parent_idx = -1;  // -1 marks the root; path reconstruction stops here
+        n.g = 0;
+        n.sw_lo = -1; n.sw_hi = -1;
+        n.h_sum = h0_sum;
+        n.nb_total = initial_nb_total;
+        n.nb_arena_idx = use_neighbor ? 0 : -1;
+        n.nb_hash = initial_nb_hash;
+        arena.push_back(n);
+        best_node.emplace(make_key(initial_packed, initial_nb_hash), 0);
+    }
+
+    // PQ entry: (f, g, counter, arena_idx)
+    using PQEntry = std::tuple<double, int, uint64_t, int32_t>;
+    std::priority_queue<PQEntry, std::vector<PQEntry>, std::greater<PQEntry>> pq;  // min-heap: smallest f first, then g, then insertion order
+    uint64_t counter = 0;
+    pq.push({0.5 * h0_sum + neighbor_scale * initial_nb_total, 0, counter++, 0});
+
+    thread_local std::vector<int> positions;
+    positions.resize(k);
+
+    while (!pq.empty()) {
+        auto [f, g_e, ctr, idx] = pq.top();
+        pq.pop();
+        (void)f; (void)ctr;
+        const int g = g_e;
+        const int64_t packed = arena[idx].packed;
+        const uint64_t cur_nb_hash = arena[idx].nb_hash;
+
+        // A state can be reinserted with a lower g-cost after this queue entry
+        // was pushed. When the neighbor tie-breaker is active, future-qubit
+        // positions are part of the state so equal-length paths with different
+        // bystander layouts are not collapsed.
+        const uint64_t cur_key = make_key(packed, cur_nb_hash);
+        auto cur_best = best_node.find(cur_key);
+        if (cur_best == best_node.end() || cur_best->second != idx) {
+            continue;  // a newer node has replaced this one for the same key
+        }
+
+        if (packed == target_packed) {
+            // Reconstruct path
+            std::vector<std::pair<int,int>> path;
+            int cur = idx;
+            while (arena[cur].parent_idx != -1) {
+                path.push_back({arena[cur].sw_lo, arena[cur].sw_hi});
+                cur = arena[cur].parent_idx;
+            }
+            std::reverse(path.begin(), path.end());  // collected goal -> root; emit root -> goal
+
+            auto result_pi = apply_swaps_to_pi(pi, path);
+            if (swap_cache) {
+                (*swap_cache)[cache_key] = path;
+            }
+            return {path, result_pi};
+        }
+
+        // Stale entry? NOTE(review): g is pushed equal to arena[idx].g and arena nodes are not modified afterwards, so this looks unreachable — confirm.
+        if (arena[idx].g < g) continue;
+
+        // Unpack positions for this state
+        {
+            int64_t p = packed;
+            for (int i = 0; i < k; i++) {
+                positions[i] = static_cast<int>(p % N_);
+                p /= N_;
+            }
+        }
+        const double cur_h_sum = arena[idx].h_sum;
+        const double cur_nb_total = arena[idx].nb_total;
+        const int cur_nb_arena_idx = arena[idx].nb_arena_idx;
+        // cur_nb_hash already read above
+
+        // Expand: every SWAP that moves at least one partition qubit
+        for (int i = 0; i < k; i++) {
+            const int p = positions[i];
+            const int t_i = target_positions[i];
+            const int adj_lo = adj_offsets_[p];
+            const int adj_hi = adj_offsets_[p + 1];
+            for (int nb_idx = adj_lo; nb_idx < adj_hi; nb_idx++) {
+                const int nb = adj_flat_[nb_idx];
+                // Find j such that positions[j] == nb (if any)
+                int j_swap = -1;
+                for (int j = 0; j < k; j++) {
+                    if (positions[j] == nb) { j_swap = j; break; }
+                }
+
+                // Incremental packed
+                int64_t new_packed = packed + static_cast<int64_t>(nb - p) * pow_N[i];
+                if (j_swap >= 0) {
+                    new_packed += static_cast<int64_t>(p - nb) * pow_N[j_swap];  // the qubit that was on nb moves to p
+                }
+
+                // Incremental h_sum
+                double new_h_sum = cur_h_sum
+                    - dist(p, t_i) + dist(nb, t_i);
+                if (j_swap >= 0) {
+                    const int t_j = target_positions[j_swap];
+                    new_h_sum += -dist(nb, t_j) + dist(p, t_j);
+                }
+
+                const int new_g = g + 1;  // each SWAP costs one unit
+
+                // Neighbor heuristic: incremental delta. Only edges incident
+                // to the affected virtual qubits change; everything else
+                // contributes the same dist as in the parent state.
+                double new_nb_total = cur_nb_total;
+                int new_nb_arena_idx = -1;
+                uint64_t new_nb_hash = cur_nb_hash;
+                if (use_neighbor) {
+                    const size_t parent_base =
+                        static_cast<size_t>(cur_nb_arena_idx) * nb_stride;
+                    for (int z = 0; z < nb_stride; z++) {
+                        nb_scratch[z] = nb_pos_flat[parent_base + z];  // start from the parent's neighbor positions
+                    }
+                    int idx_nb_vq = -1, idx_p_vq = -1;  // neighbor qubits currently on nb / on p, if any
+                    for (int z = 0; z < nb_stride; z++) {
+                        const int phys = nb_scratch[z];
+                        if (phys == nb) idx_nb_vq = z;
+                        else if (phys == p) idx_p_vq = z;
+                        if (idx_nb_vq >= 0 && idx_p_vq >= 0) break;
+                    }
+                    if (idx_nb_vq >= 0 || idx_p_vq >= 0) {
+                        double delta = 0.0;
+                        auto accum = [&](int vq_idx, double sign) {
+                            if (vq_idx < 0) return;
+                            for (int e : vq_edges[vq_idx]) {
+                                const auto& edge = neighbor_info->edges[e];
+                                delta += sign * edge.weight * dist(
+                                    nb_scratch[edge.u_idx],
+                                    nb_scratch[edge.v_idx]);
+                            }
+                        };
+                        accum(idx_nb_vq, -1.0);  // subtract the old contributions ...
+                        accum(idx_p_vq, -1.0);
+                        if (idx_nb_vq >= 0) nb_scratch[idx_nb_vq] = p;
+                        if (idx_p_vq >= 0)  nb_scratch[idx_p_vq]  = nb;
+                        accum(idx_nb_vq, +1.0);  // ... then add them back at the swapped positions
+                        accum(idx_p_vq, +1.0);
+                        new_nb_total = cur_nb_total + delta;
+                        new_nb_arena_idx = static_cast<int>(
+                            nb_pos_flat.size() / nb_stride);
+                        nb_pos_flat.insert(nb_pos_flat.end(),
+                                           nb_scratch.begin(),
+                                           nb_scratch.end());
+                        // Incremental hash: XOR out old slots, XOR in new ones
+                        if (idx_nb_vq >= 0) {
+                            new_nb_hash ^= slot_hash(idx_nb_vq, nb)
+                                         ^ slot_hash(idx_nb_vq, p);
+                        }
+                        if (idx_p_vq >= 0) {
+                            new_nb_hash ^= slot_hash(idx_p_vq, p)
+                                         ^ slot_hash(idx_p_vq, nb);
+                        }
+                    } else {
+                        new_nb_arena_idx = cur_nb_arena_idx;  // no neighbor qubit moved: share the parent's slot
+                        // new_nb_hash unchanged
+                    }
+                }
+
+                const uint64_t new_key = make_key(new_packed, new_nb_hash);
+                auto existing = best_node.find(new_key);
+                if (existing != best_node.end()
+                    && arena[existing->second].g <= new_g
+                ) {
+                    continue;  // this key was already reached at no greater cost
+                }
+
+                // Insert/update node
+                Node n;
+                n.packed = new_packed;
+                n.parent_idx = idx;
+                n.g = new_g;
+                const int lo = std::min(p, nb);
+                const int hi = std::max(p, nb);
+                n.sw_lo = lo; n.sw_hi = hi;
+                n.h_sum = new_h_sum;
+                n.nb_total = new_nb_total;
+                n.nb_arena_idx = new_nb_arena_idx;
+                n.nb_hash = new_nb_hash;
+
+                int32_t new_idx = static_cast<int32_t>(arena.size());
+                arena.push_back(n);
+                best_node[new_key] = new_idx;
+
+                const double f_new = static_cast<double>(new_g)
+                                   + 0.5 * new_h_sum
+                                   + neighbor_scale * new_nb_total;
+                pq.push({f_new, new_g, counter++, new_idx});
+            }
+        }
+    }
+
+    // Failed to route (should not happen on a connected graph)
+    return {{}, pi};  // same shape as the "already there" result: no swaps, layout unchanged
+}
+
+// ---------------------------------------------------------------------------
+// transform_pi
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<std::pair<int,int>>, std::vector<int>>  // (swaps to insert, layout after the candidate)
+SabreRouter::transform_pi(  // routes the candidate's qubits, then applies its exit permutation to the layout
+    const CandidateData& cand,
+    const std::vector<int>& pi,        // layout before the candidate
+    bool reverse,                      // true: route with P_o_inv and exit through P_i
+    SwapCache* swap_cache,             // forwarded to find_constrained_swaps
+    const NeighborInfo* neighbor_info  // forwarded to find_constrained_swaps
+) const {
+    const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
+    const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
+
+    if (!cand.has_multi_qubit_body) {  // no routing search in this branch: the returned swap list is empty
+        std::vector<int> dynamic_node_mapping(P_route_inv.size(), -1);  // node slot -> position taken from pi, -1 if unset
+        for (size_t i = 0; i < cand.qbit_map_keys_sorted.size(); i++) {
+            const int logical_q = cand.qbit_map_keys_sorted[i];
+            const int qstar = cand.qbit_map_vals_sorted[i];
+            dynamic_node_mapping[P_route_inv[qstar]] = pi[logical_q];
+        }
+
+        std::vector<int> pi_output = pi;
+        for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
+            if (q_star < cand.qstar_to_q.size()) {
+                const int logical_q = cand.qstar_to_q[q_star];
+                if (logical_q < 0) continue;  // no qubit is mapped to this q*
+                pi_output[logical_q] = dynamic_node_mapping[P_exit[q_star]];
+            }
+        }
+        return {{}, std::move(pi_output)};
+    }
+
+    // Route qubits to input positions
+    auto [swaps, pi_routed] = find_constrained_swaps(
+        pi,
+        cand.qbit_map_keys_sorted,
+        cand.qbit_map_vals_sorted,
+        cand.node_mapping_flat,
+        P_route_inv,
+        swap_cache,
+        neighbor_info
+    );
+
+    // Update output positions using P_exit
+    std::vector<int> pi_output = pi_routed;
+
+    for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
+        if (q_star < cand.qstar_to_q.size()) {
+            int k = cand.qstar_to_q[q_star];
+            if (k < 0) continue;  // no qubit is mapped to this q*
+            pi_output[k] = cand.node_mapping_flat[P_exit[q_star]];
+        }
+    }
+
+    return {swaps, pi_output};
+}
+
+// ---------------------------------------------------------------------------
+// generate_extended_set (BFS lookahead)
+// ---------------------------------------------------------------------------
+
+std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
+    const std::vector<int>& F,
+    const std::vector<uint8_t>& resolved,
+    const std::vector<std::vector<int>>& children_graph,
+    const std::vector<std::vector<int>>& parents_graph
+) const {
+    // Breadth-first lookahead from every front partition. The result holds
+    // (partition, depth) pairs and is capped at config_.max_E_size entries.
+    const int size_cap = config_.max_E_size;
+    const int depth_cap = config_.max_lookahead;
+
+    std::vector<std::pair<int,int>> extended;
+    std::vector<uint8_t> is_extended(num_partitions_, 0);
+    std::vector<uint8_t> is_front(num_partitions_, 0);
+    for (int front : F) is_front[front] = 1;
+
+    struct Visit {
+        int partition;
+        int depth;
+    };
+
+    // A partition qualifies once each parent is either resolved or in F.
+    auto parents_ready = [&](int part) {
+        for (int parent : parents_graph[part]) {
+            if (!(resolved[parent] || is_front[parent])) return false;
+        }
+        return true;
+    };
+
+    for (int front : F) {
+        if (static_cast<int>(extended.size()) >= size_cap) break;
+
+        // Children are enqueued unconditionally; eligibility is decided on
+        // pop so that single-qubit partitions can be walked through.
+        std::deque<Visit> pending;
+        for (int child : children_graph[front]) {
+            pending.push_back(Visit{child, 1});
+        }
+
+        while (static_cast<int>(extended.size()) < size_cap && !pending.empty()) {
+            const Visit cur = pending.front();
+            pending.pop_front();
+
+            const bool skip = cur.depth > depth_cap || is_extended[cur.partition]
+                || is_front[cur.partition] || resolved[cur.partition];
+            if (skip || !parents_ready(cur.partition)) continue;
+
+            // Single-qubit partitions are transparent: their children are
+            // forwarded at the same depth and nothing is recorded. Any other
+            // partition is recorded and its children go one level deeper.
+            const bool transit = layout_partitions_[cur.partition].is_single;
+            if (!transit) {
+                extended.push_back({cur.partition, cur.depth});
+                is_extended[cur.partition] = 1;
+                if (cur.depth >= depth_cap) continue;
+            }
+            const int child_depth = transit ? cur.depth : cur.depth + 1;
+            for (int child : children_graph[cur.partition]) {
+                pending.push_back(Visit{child, child_depth});
+            }
+        }
+    }
+    return extended;
+}
+
+// ---------------------------------------------------------------------------
+// Routing cost helpers
+// ---------------------------------------------------------------------------
+
+double SabreRouter::entry_future_cost(
+    const CanonicalEntry& entry, const std::vector<int>& pi
+) const {
+    // Adds up, over the entry's edges, the mapped distance in excess of 1.
+    double excess = 0.0;
+    for (size_t e = 0, n = entry.edges_u.size(); e < n; ++e) {
+        const double gap = dist(pi[entry.edges_u[e]], pi[entry.edges_v[e]]);
+        if (gap > 1.0) excess += gap - 1.0;
+    }
+    return excess;
+}
+
+double SabreRouter::future_partition_cost(
+    int partition_idx,
+    const std::vector<int>& pi,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    constexpr double kInf = std::numeric_limits<double>::infinity();
+    // Partitions whose first cached candidate involves 3+ qubits are costed
+    // by the smallest estimated swap count over all of their candidates.
+    const bool has_cache_slot = partition_idx >= 0
+        && partition_idx < static_cast<int>(candidate_cache_.size());
+    if (has_cache_slot) {
+        const auto& cached = candidate_cache_[partition_idx];
+        if (!cached.empty() && cached.front().involved_qbits.size() >= 3) {
+            double cheapest = kInf;
+            for (const auto& cand : cached) {
+                const double swaps_est =
+                    static_cast<double>(estimate_swap_count(cand, pi, reverse));
+                if (swaps_est < cheapest) cheapest = swaps_est;
+            }
+            return cheapest;
+        }
+    }
+    // Everything else uses its canonical entry; unknown partitions cost inf.
+    const auto found = canonical_data.find(partition_idx);
+    if (found == canonical_data.end()) return kInf;
+    return entry_future_cost(found->second, pi);
+}
+
+double SabreRouter::future_context_cost(
+    int exclude_partition_idx,
+    const std::vector<int>& pi,
+    const std::vector<int>& F_snapshot,
+    const std::vector<std::pair<int,int>>& E,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    // Score = mean cost of the other front partitions
+    //       + config_.E_weight * mean depth-weighted cost of the extended set.
+    // The excluded partition and non-finite costs never enter either mean.
+    auto cost_of = [&](int p_idx) {
+        return future_partition_cost(p_idx, pi, reverse, canonical_data);
+    };
+    double front_total = 0.0;
+    int front_terms = 0;
+    for (const int p_idx : F_snapshot) {
+        if (p_idx == exclude_partition_idx) continue;
+        const double cost = cost_of(p_idx);
+        if (std::isfinite(cost)) {
+            front_total += cost;
+            ++front_terms;
+        }
+    }
+    double score = (front_terms > 0)
+        ? front_total / static_cast<double>(front_terms) : 0.0;
+
+    double ext_total = 0.0;
+    int ext_terms = 0;
+    const int n_alpha = static_cast<int>(alpha_weights_.size());
+    for (const auto& [p_idx, depth] : E) {
+        if (p_idx == exclude_partition_idx) continue;
+        const double cost = cost_of(p_idx);
+        if (!std::isfinite(cost)) continue;
+        // Use the alpha_weights_ table when depth indexes into it; otherwise
+        // compute config_.E_alpha ^ depth directly.
+        const bool tabulated = depth >= 0 && depth < n_alpha;
+        const double alpha = tabulated
+            ? alpha_weights_[depth]
+            : std::pow(config_.E_alpha, depth);
+        ext_total += alpha * cost;
+        ++ext_terms;
+    }
+    if (ext_terms > 0) {
+        score += config_.E_weight * ext_total / static_cast<double>(ext_terms);
+    }
+    return score;
+}
+
+std::vector<int> SabreRouter::estimate_candidate_output_layout(
+    const CandidateData& cand,
+    const std::vector<int>& pi,
+    bool reverse
+) const {
+    // Predicts the layout after `cand` without running any routing: a copy of
+    // `pi` in which each logical qubit listed in qstar_to_q is moved to the
+    // physical position associated with its exit node.
+    const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
+    const size_t n_exit = std::min<size_t>(P_exit.size(), cand.qstar_to_q.size());
+    std::vector<int> pi_output = pi;
+    // Negative qstar_to_q entries are skipped.
+    auto apply_exit = [&](const auto& node_to_phys) {
+        for (size_t q_star = 0; q_star < n_exit; ++q_star) {
+            const int logical_q = cand.qstar_to_q[q_star];
+            if (logical_q >= 0) {
+                pi_output[logical_q] = node_to_phys[P_exit[q_star]];
+            }
+        }
+    };
+
+    if (cand.has_multi_qubit_body) {
+        // Multi-qubit bodies use the candidate's stored node mapping.
+        apply_exit(cand.node_mapping_flat);
+        return pi_output;
+    }
+
+    // Otherwise the node -> physical map is derived from the current layout:
+    // node P_route_inv[q*] gets the position its logical qubit occupies now.
+    const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
+    std::vector<int> dynamic_node_mapping(P_route_inv.size(), -1);
+    const size_t n_mapped = cand.qbit_map_keys_sorted.size();
+    for (size_t i = 0; i < n_mapped; ++i) {
+        const int qstar = cand.qbit_map_vals_sorted[i];
+        dynamic_node_mapping[P_route_inv[qstar]] = pi[cand.qbit_map_keys_sorted[i]];
+    }
+    apply_exit(dynamic_node_mapping);
+    return pi_output;
+}
+
+// ---------------------------------------------------------------------------
+// score_candidate (LightSABRE scoring)
+// ---------------------------------------------------------------------------
+
+// Scores `cand` against the current layout `pi`: its own routing cost plus a
+// lookahead term for the remaining front set and the extended set E.
+// `cached_neighbor_info` may be null, in which case it is built here; `decay`
+// may be null to score without decay. When non-null, `out_swaps` and
+// `out_pi_new` receive the swaps and output layout produced by transform_pi.
+double SabreRouter::score_candidate(
+    const CandidateData& cand,
+    const std::vector<int>& F_snapshot,
+    const std::vector<int>& pi,
+    const std::vector<std::pair<int,int>>& E,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data,
+    SwapCache* swap_cache,
+    const std::vector<double>* decay,
+    std::vector<std::pair<int,int>>* out_swaps,
+    std::vector<int>* out_pi_new,
+    const NeighborInfo* cached_neighbor_info
+) const {
+    // Reuse the caller's neighbor info when given, otherwise build it here.
+    // Either way it is only forwarded to routing when uses_tiebreak() holds.
+    NeighborInfo built_here;
+    const NeighborInfo* info = cached_neighbor_info;
+    if (info == nullptr) {
+        built_here = build_neighbor_info(
+            cand.partition_idx, F_snapshot, E, pi, canonical_data);
+        info = &built_here;
+    }
+    const NeighborInfo* tiebreak_info = info->uses_tiebreak() ? info : nullptr;
+
+    auto routed = transform_pi(cand, pi, reverse, swap_cache, tiebreak_info);
+    std::vector<std::pair<int,int>>& swaps = routed.first;
+    std::vector<int>& output_perm = routed.second;
+
+    // Immediate cost: swap count and the candidate's CNOT count, with the
+    // decay factor applied only if a decay vector is given and swaps exist.
+    const bool use_decay = decay != nullptr && !swaps.empty();
+    const double decay_factor =
+        use_decay ? decay_factor_for_swaps(swaps, *decay) : 1.0;
+    const double immediate = routing_objective(
+        static_cast<double>(swaps.size()),
+        cand.cnot_count,
+        1.0,
+        decay_factor
+    );
+
+    // Lookahead cost under the layout this candidate leaves behind; it is
+    // scaled by config_.three_qubit_exit_weight for 3+ qubit candidates.
+    double lookahead = future_context_cost(
+        cand.partition_idx, output_perm, F_snapshot, E, reverse,
+        canonical_data);
+    if (cand.involved_qbits.size() >= 3) {
+        lookahead *= config_.three_qubit_exit_weight;
+    }
+    const double score = immediate + lookahead;
+
+    if (out_swaps != nullptr) *out_swaps = std::move(swaps);
+    if (out_pi_new != nullptr) *out_pi_new = std::move(output_perm);
+    return score;
+}
+
+// ---------------------------------------------------------------------------
+// obtain_partition_candidates
+// ---------------------------------------------------------------------------
+
+std::vector<const CandidateData*> SabreRouter::obtain_partition_candidates(
+    const std::vector<int>& F
+) const {
+    // Gathers, in F order, every cached candidate of in-range partitions.
+    std::vector<const CandidateData*> collected;
+    for (const int p_idx : F) {
+        const bool in_range = 0 <= p_idx && p_idx < num_partitions_;
+        if (!in_range) continue;
+        for (const auto& cand : candidate_cache_[p_idx]) collected.push_back(&cand);
+    }
+    return collected;
+}
+
+// ---------------------------------------------------------------------------
+// prefilter_candidates
+// ---------------------------------------------------------------------------
+
+std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
+    const std::vector<const CandidateData*>& candidates,
+    const std::vector<int>& pi,
+    int top_k,
+    const std::vector<int>& F_snapshot,
+    const std::vector<std::pair<int,int>>& E,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    // Shortlists candidates by a cheap cost estimate. Inputs that already
+    // fit in top_k are returned untouched; a non-positive top_k yields none.
+    const int n_candidates = static_cast<int>(candidates.size());
+    if (n_candidates <= top_k) return candidates;
+    if (top_k <= 0) return {};
+
+    struct Ranked { double estimate; const CandidateData* cand; };
+    // Estimate = routing objective of the estimated swap count plus the
+    // lookahead cost evaluated on the approximate output layout.
+    std::vector<Ranked> ranked;
+    ranked.reserve(candidates.size());
+    for (const CandidateData* cand : candidates) {
+        const auto approx_output = estimate_candidate_output_layout(*cand, pi, reverse);
+        const double routing_est = routing_objective(
+            static_cast<double>(estimate_swap_count(*cand, pi, reverse)),
+            cand->cnot_count);
+        const double lookahead_est = future_context_cost(
+            cand->partition_idx, approx_output, F_snapshot, E, reverse, canonical_data);
+        ranked.push_back(Ranked{routing_est + lookahead_est, cand});
+    }
+
+    // Ascending estimate; ties go to partition index, then candidate index.
+    auto ranks_before = [](const Ranked& lhs, const Ranked& rhs) {
+        if (lhs.estimate != rhs.estimate) return lhs.estimate < rhs.estimate;
+        const CandidateData& a = *lhs.cand;
+        const CandidateData& b = *rhs.cand;
+        if (a.partition_idx != b.partition_idx) return a.partition_idx < b.partition_idx;
+        return a.candidate_idx < b.candidate_idx;
+    };
+    std::stable_sort(ranked.begin(), ranked.end(), ranks_before);
+
+    const int min_per_partition = std::max(0, config_.prefilter_min_per_partition);
+    const int min_3q = std::max(0, config_.prefilter_min_3q);
+
+    std::vector<const CandidateData*> result;
+    result.reserve(std::min(n_candidates, top_k));
+    std::unordered_set<const CandidateData*> selected;
+    auto take = [&](const CandidateData* cand) {
+        result.push_back(cand);
+        selected.insert(cand);
+    };
+
+    if (min_per_partition > 0 || min_3q > 0) {
+        // A partition's quota is the largest quota any of its candidates
+        // asks for; 3+ qubit candidates ask for at least min_3q.
+        std::unordered_map<int, int> quota_by_partition;
+        for (const Ranked& item : ranked) {
+            const bool is_3q = item.cand->involved_qbits.size() >= 3;
+            const int quota =
+                is_3q ? std::max(min_per_partition, min_3q) : min_per_partition;
+            if (quota <= 0) continue;
+            const auto slot = quota_by_partition.find(item.cand->partition_idx);
+            if (slot == quota_by_partition.end() || quota > slot->second) {
+                quota_by_partition[item.cand->partition_idx] = quota;
+            }
+        }
+        // Best-ranked candidates fill each quota first (not capped by top_k).
+        std::unordered_map<int, int> taken_by_partition;
+        for (const Ranked& item : ranked) {
+            const auto quota_it = quota_by_partition.find(item.cand->partition_idx);
+            if (quota_it == quota_by_partition.end()) continue;
+            int& taken = taken_by_partition[item.cand->partition_idx];
+            if (taken >= quota_it->second) continue;
+            take(item.cand);
+            ++taken;
+        }
+    }
+
+    // Remaining slots are filled in rank order until top_k is reached.
+    for (const Ranked& item : ranked) {
+        if (static_cast<int>(result.size()) >= top_k) break;
+        if (selected.count(item.cand) == 0) take(item.cand);
+    }
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// select_best_candidate
+// ---------------------------------------------------------------------------
+
+const CandidateData& SabreRouter::select_best_candidate(
+    const std::vector<const CandidateData*>& candidates,
+    const std::vector<double>& scores,
+    std::mt19937* rng
+) const {
+    // Deterministic argmin: the earliest candidate holding the lowest score
+    // wins. `rng` is accepted but never consulted.
+    //
+    // Precondition (not checked here): `scores` is non-empty and has a
+    // matching entry in `candidates` for every index.
+    (void)rng;
+
+    // std::min_element keeps the first of equal minima, matching a strict
+    // `<` scan from index 0.
+    const auto lowest = std::min_element(scores.begin(), scores.end());
+    const size_t winner = static_cast<size_t>(lowest - scores.begin());
+
+    return *candidates[winner];
+}
+
+// ---------------------------------------------------------------------------
+// Boundary beam search helpers
+// ---------------------------------------------------------------------------
+
+// Simulates executing `selected_partition_idx` on copies of F / resolved and
+// returns {F_next, resolved_next}. The partition leaves F and is marked
+// resolved; descendants whose parents are all resolved then either join
+// F_next or, if single-qubit, are resolved on the spot and scanned through.
+std::pair<std::vector<int>, std::vector<uint8_t>>
+SabreRouter::advance_layout_frontier(
+    int selected_partition_idx,
+    const std::vector<int>& F,
+    const std::vector<uint8_t>& resolved,
+    const std::vector<std::vector<int>>& children_graph,
+    const std::vector<std::vector<int>>& parents_graph
+) const {
+    std::vector<int> F_next(F);
+    std::vector<uint8_t> resolved_next(resolved);
+
+    F_next.erase(std::remove(F_next.begin(), F_next.end(), selected_partition_idx),
+                 F_next.end());
+
+    // The index addresses both `resolved_next` and `children_graph`, so each is
+    // range-checked before use; an index with no graph entry expands nothing.
+    const bool non_negative = selected_partition_idx >= 0;
+    if (non_negative
+        && selected_partition_idx < static_cast<int>(resolved_next.size())) {
+        resolved_next[selected_partition_idx] = 1;
+    }
+    if (!non_negative
+        || selected_partition_idx >= static_cast<int>(children_graph.size())) {
+        return {std::move(F_next), std::move(resolved_next)};
+    }
+
+    // FIFO worklist of partitions that may have just become ready.
+    const std::vector<int>& first_children = children_graph[selected_partition_idx];
+    std::deque<int> pending(first_children.begin(), first_children.end());
+
+    while (!pending.empty()) {
+        const int child = pending.front();
+        pending.pop_front();
+
+        if (resolved_next[child]) continue;
+        if (std::find(F_next.begin(), F_next.end(), child) != F_next.end()) continue;
+
+        const std::vector<int>& parents = parents_graph[child];
+        const bool parents_ok = std::all_of(parents.begin(), parents.end(),
+            [&](int parent) { return resolved_next[parent] != 0; });
+        if (!parents_ok) continue;
+
+        if (layout_partitions_[child].is_single) {
+            resolved_next[child] = 1;
+            const std::vector<int>& kids = children_graph[child];
+            pending.insert(pending.end(), kids.begin(), kids.end());
+        } else {
+            F_next.push_back(child);
+        }
+    }
+
+    return {std::move(F_next), std::move(resolved_next)};
+}
+
+// Chooses which scored candidate to execute next and returns its index.
+// The greedy answer is the first index with the lowest score. If beam search
+// is enabled (width and depth both > 1) and some candidate involves 3+ qubits,
+// a bounded rollout is run instead: every state remembers the first-step
+// candidate it grew from, and the first step of the best final state wins.
+// Candidates whose cached_pi entry is empty do not seed the beam.
+size_t SabreRouter::boundary_beam_select_index(
+    const std::vector<const CandidateData*>& candidates,
+    const std::vector<double>& scores,
+    const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
+    const std::vector<std::vector<int>>& cached_pi,
+    const std::vector<int>& F_snapshot,
+    const std::vector<uint8_t>& resolved,
+    const std::vector<std::vector<int>>& children_graph,
+    const std::vector<std::vector<int>>& parents_graph,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data,
+    SwapCache* swap_cache
+) const {
+    // Greedy fallback: first index holding the minimum score.
+    size_t fallback_idx = 0;
+    for (size_t i = 1; i < scores.size(); i++) {
+        if (scores[i] < scores[fallback_idx]) {
+            fallback_idx = i;
+        }
+    }
+
+    const int beam_width = std::max(1, config_.boundary_beam_width);
+    const int beam_depth = std::max(1, config_.boundary_beam_depth);
+    if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) {
+        return fallback_idx;
+    }
+
+    // The rollout is only run when a 3+ qubit candidate is present.
+    const bool has_three_qubit_candidate = std::any_of(
+        candidates.begin(), candidates.end(),
+        [](const CandidateData* cand) { return cand->involved_qbits.size() >= 3; });
+    if (!has_three_qubit_candidate) {
+        return fallback_idx;
+    }
+
+    struct BeamState {
+        double rank_cost;               // sort key of the state
+        double total_cost;              // transition cost accumulated so far
+        std::vector<int> pi;            // layout after the last step
+        std::vector<int> F;             // front set after the last step
+        std::vector<uint8_t> resolved;  // resolved flags after the last step
+        size_t first_idx;               // first-step candidate of this state
+    };
+
+    auto transition_cost = [&](const CandidateData& cand, size_t idx) {
+        return routing_objective(
+            static_cast<double>(cached_swaps[idx].size()), cand.cnot_count);
+    };
+
+    auto sort_states = [](const BeamState& a, const BeamState& b) {
+        if (a.rank_cost != b.rank_cost) return a.rank_cost < b.rank_cost;
+        return a.first_idx < b.first_idx;
+    };
+
+    // Orders a beam and cuts it to beam_width. Rollout states sharing a
+    // first_idx can tie on the whole key, so a stable sort is used: which tied
+    // states survive the cut then depends only on generation order, not on
+    // how a particular standard library implements std::sort.
+    auto prune = [&](std::vector<BeamState>& beam) {
+        std::stable_sort(beam.begin(), beam.end(), sort_states);
+        if (static_cast<int>(beam.size()) > beam_width) {
+            beam.resize(beam_width);
+        }
+    };
+
+    std::vector<BeamState> states;
+    states.reserve(candidates.size());
+    for (size_t idx = 0; idx < candidates.size(); idx++) {
+        if (cached_pi[idx].empty()) continue;
+        const auto& cand = *candidates[idx];
+        auto [F_next, resolved_next] = advance_layout_frontier(
+            cand.partition_idx, F_snapshot, resolved,
+            children_graph, parents_graph
+        );
+        const double trans_cost = transition_cost(cand, idx);
+        states.push_back(BeamState{
+            scores[idx],
+            trans_cost,
+            cached_pi[idx],
+            std::move(F_next),
+            std::move(resolved_next),
+            idx
+        });
+    }
+
+    if (states.empty()) {
+        return fallback_idx;
+    }
+    prune(states);
+
+    for (int depth = 1; depth < beam_depth; depth++) {
+        std::vector<BeamState> expanded;
+
+        // A state that cannot be extended stays in the beam, ranked by the
+        // transition cost it has accumulated.
+        auto carry_forward = [&expanded](const BeamState& state) {
+            expanded.push_back(BeamState{
+                state.total_cost, state.total_cost,
+                state.pi, state.F, state.resolved,
+                state.first_idx
+            });
+        };
+
+        for (const auto& state : states) {
+            if (state.F.empty()) {
+                carry_forward(state);
+                continue;
+            }
+
+            auto E = generate_extended_set(
+                state.F,
+                state.resolved,
+                children_graph,
+                parents_graph
+            );
+
+            auto rollout_candidates = obtain_partition_candidates(state.F);
+            if (rollout_candidates.empty()) {
+                carry_forward(state);
+                continue;
+            }
+
+            rollout_candidates = prefilter_candidates(
+                rollout_candidates,
+                state.pi,
+                config_.prefilter_top_k,
+                state.F,
+                E,
+                reverse,
+                canonical_data
+            );
+
+            for (const CandidateData* cand : rollout_candidates) {
+                NeighborInfo neighbor_info = build_neighbor_info(
+                    cand->partition_idx,
+                    state.F,
+                    E,
+                    state.pi,
+                    canonical_data
+                );
+                std::vector<std::pair<int,int>> swaps;
+                std::vector<int> output_perm;
+                const double score = score_candidate(
+                    *cand,
+                    state.F,
+                    state.pi,
+                    E,
+                    reverse,
+                    canonical_data,
+                    swap_cache,
+                    nullptr,  // rollout steps are scored without a decay vector
+                    &swaps,
+                    &output_perm,
+                    &neighbor_info
+                );
+                const double trans_cost = routing_objective(
+                    static_cast<double>(swaps.size()),
+                    cand->cnot_count
+                );
+                const double future_cost = score - trans_cost;
+                const double new_total = state.total_cost + trans_cost;
+                const double rank_cost = new_total + future_cost;
+
+                auto [F_next, resolved_next] = advance_layout_frontier(
+                    cand->partition_idx,
+                    state.F,
+                    state.resolved,
+                    children_graph,
+                    parents_graph
+                );
+                expanded.push_back(BeamState{
+                    rank_cost,
+                    new_total,
+                    std::move(output_perm),
+                    std::move(F_next),
+                    std::move(resolved_next),
+                    state.first_idx
+                });
+            }
+        }
+
+        if (expanded.empty()) {
+            break;
+        }
+        prune(expanded);
+        states = std::move(expanded);
+    }
+
+    if (states.empty()) {
+        return fallback_idx;
+    }
+    return std::min_element(states.begin(), states.end(), sort_states)->first_idx;
+}
+
+// ---------------------------------------------------------------------------
+// heuristic_search (main loop)
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
+    const std::vector<int>& F_init,
+    std::vector<int> pi,
+    bool reverse,
+    std::mt19937* rng,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data,
+    const std::vector<std::vector<int>>& cg,
+    const std::vector<std::vector<int>>& pg,
+    ForwardRouteResult* route_trace
+) const {
+    (void)rng;
+
+    std::vector<int> F;
+    std::vector<int> queue;
+    std::vector<uint8_t> resolved(num_partitions_, 0);
+    std::vector<uint8_t> in_F(num_partitions_, 0);
+    double total_cost = 0.0;
+
+    // Split F_init into F (multi-qubit) and queue (single-qubit)
+    for (int p : F_init) {
+        if (layout_partitions_[p].is_single) {
+            queue.push_back(p);
+        } else {
+            F.push_back(p);
+            in_F[p] = 1;
+        }
+    }
+
+    // Flush initial single-qubit partitions
+    while (!queue.empty()) {
+        int p = queue.back();
+        queue.pop_back();
+
+        if (resolved[p]) continue;
+        resolved[p] = 1;
+        if (route_trace) {
+            RouteStep step;
+            step.type = 2;
+            step.partition_idx = p;
+            if (!layout_partitions_[p].involved_qbits.empty()) {
+                step.physical_qubit = pi[layout_partitions_[p].involved_qbits[0]];
+            }
+            route_trace->steps.push_back(std::move(step));
+        }
+
+        for (int child : cg[p]) {
+            if (!resolved[child] && !in_F[child]) {
+                bool parents_ok = true;
+                for (int par : pg[child]) {
+                    if (!resolved[par]) { parents_ok = false; break; }
+                }
+                if (parents_ok) {
+                    if (layout_partitions_[child].is_single) {
+                        queue.push_back(child);
+                    } else {
+                        F.push_back(child);
+                        in_F[child] = 1;
+                    }
+                }
+            }
+        }
+    }
+
+    // Swap cache for this search call (thread-local, on stack)
+    SwapCache swap_cache;
+    std::vector<double> decay(N_, 1.0);
+    int swap_heavy_partitions = 0;
+
+    // Main search loop
+    while (!F.empty()) {
+        if (
+            config_.swap_burst_budget > 0
+            && swap_heavy_partitions >= config_.swap_burst_budget
+        ) {
+            auto [valve_swaps, pi_bridged] = release_valve(
+                F,
+                pi,
+                canonical_data
+            );
+            if (!valve_swaps.empty()) {
+                total_cost += routing_objective(
+                    static_cast<double>(valve_swaps.size()),
+                    0,
+                    1.0,
+                    decay_factor_for_swaps(valve_swaps, decay)
+                );
+                if (route_trace) {
+                    RouteStep step;
+                    step.type = 0;
+                    step.swaps = valve_swaps;
+                    route_trace->cnot_count += static_cast<int>(valve_swaps.size()) * 3;
+                    route_trace->steps.push_back(std::move(step));
+                }
+                apply_decay_for_swaps(valve_swaps, decay);
+                pi = std::move(pi_bridged);
+                swap_heavy_partitions = 0;
+                continue;
+            }
+            reset_decay(decay);
+            swap_heavy_partitions = 0;
+        }
+
+        auto all_candidates = obtain_partition_candidates(F);
+        if (all_candidates.empty()) break;
+
+        // Generate extended set
+        auto E = generate_extended_set(F, resolved, cg, pg);
+
+        // Prefilter with a cheap estimate of the candidate's future context.
+        auto candidates = prefilter_candidates(
+            all_candidates, pi, config_.prefilter_top_k, F, E, reverse,
+            canonical_data);
+
+        // Group candidates by partition_idx so build_neighbor_info is shared
+        std::vector<size_t> order(candidates.size());
+        std::iota(order.begin(), order.end(), 0);
+        std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
+            return candidates[a]->partition_idx < candidates[b]->partition_idx;
+        });
+
+        // Score all candidates and cache each one's transform output
+        std::vector<double> scores(candidates.size());
+        std::vector<std::vector<std::pair<int,int>>> cached_swaps(candidates.size());
+        std::vector<std::vector<int>> cached_pi(candidates.size());
+        int prev_partition_idx = -1;
+        NeighborInfo cached_ni;
+        for (size_t k_ord = 0; k_ord < order.size(); k_ord++) {
+            const size_t ci = order[k_ord];
+            const int p_idx = candidates[ci]->partition_idx;
+            if (p_idx != prev_partition_idx) {
+                cached_ni = build_neighbor_info(p_idx, F, E, pi, canonical_data);
+                prev_partition_idx = p_idx;
+            }
+            scores[ci] = score_candidate(
+                *candidates[ci],
+                F, pi, E, reverse, canonical_data,
+                &swap_cache, &decay,
+                &cached_swaps[ci], &cached_pi[ci],
+                &cached_ni
+            );
+        }
+
+        // Select best, optionally using boundary-layout beam rollout
+        const size_t best_ci = boundary_beam_select_index(
+            candidates,
+            scores,
+            cached_swaps,
+            cached_pi,
+            F,
+            resolved,
+            cg,
+            pg,
+            reverse,
+            canonical_data,
+            &swap_cache
+        );
+        const auto& best = *candidates[best_ci];
+
+        // Remove from F and mark resolved
+        F.erase(std::remove(F.begin(), F.end(), best.partition_idx), F.end());
+        in_F[best.partition_idx] = 0;
+        resolved[best.partition_idx] = 1;
+
+        // Reuse cached transform from scoring (F_snapshot \ {best} == F_after_erase
+        // because exclude_partition_idx == best.partition_idx in both cases)
+        std::vector<std::pair<int,int>> swaps = std::move(cached_swaps[best_ci]);
+        std::vector<int> pi_new = std::move(cached_pi[best_ci]);
+        const double decay_factor = swaps.empty()
+            ? 1.0
+            : decay_factor_for_swaps(swaps, decay);
+        total_cost += routing_objective(
+            static_cast<double>(swaps.size()),
+            best.cnot_count,
+            1.0,
+            decay_factor
+        );
+        if (route_trace) {
+            if (!swaps.empty()) {
+                RouteStep swap_step;
+                swap_step.type = 0;
+                swap_step.swaps = swaps;
+                route_trace->cnot_count += static_cast<int>(swaps.size()) * 3;
+                route_trace->steps.push_back(std::move(swap_step));
+            }
+            RouteStep part_step;
+            part_step.type = 1;
+            part_step.partition_idx = best.partition_idx;
+            part_step.candidate_idx = best.candidate_idx;
+            route_trace->cnot_count += best.cnot_count;
+            route_trace->steps.push_back(std::move(part_step));
+        }
+        pi = std::move(pi_new);
+        apply_decay_for_swaps(swaps, decay);
+        if (swaps.empty()) {
+            swap_heavy_partitions = 0;
+            reset_decay(decay);
+        } else {
+            swap_heavy_partitions++;
+        }
+
+        // Update F with newly eligible children
+        for (int child : cg[best.partition_idx]) {
+            if (!resolved[child] && !in_F[child]) {
+                bool parents_ok = true;
+                for (int par : pg[child]) {
+                    if (!resolved[par]) { parents_ok = false; break; }
+                }
+
+                if (parents_ok) {
+                    if (layout_partitions_[child].is_single) {
+                        resolved[child] = 1;
+                        if (route_trace) {
+                            RouteStep step;
+                            step.type = 2;
+                            step.partition_idx = child;
+                            if (!layout_partitions_[child].involved_qbits.empty()) {
+                                step.physical_qubit = pi[layout_partitions_[child].involved_qbits[0]];
+                            }
+                            route_trace->steps.push_back(std::move(step));
+                        }
+                        std::vector<int> stack;
+                        for (int gc : cg[child]) stack.push_back(gc);
+
+                        while (!stack.empty()) {
+                            int gc = stack.back();
+                            stack.pop_back();
+
+                            if (!resolved[gc] && !in_F[gc]) {
+                                bool gc_parents_ok = true;
+                                for (int p_gc : pg[gc]) {
+                                    if (!resolved[p_gc]) { gc_parents_ok = false; break; }
+                                }
+                                if (gc_parents_ok) {
+                                    if (layout_partitions_[gc].is_single) {
+                                        resolved[gc] = 1;
+                                        if (route_trace) {
+                                            RouteStep step;
+                                            step.type = 2;
+                                            step.partition_idx = gc;
+                                            if (!layout_partitions_[gc].involved_qbits.empty()) {
+                                                step.physical_qubit = pi[layout_partitions_[gc].involved_qbits[0]];
+                                            }
+                                            route_trace->steps.push_back(std::move(step));
+                                        }
+                                        for (int ggc : cg[gc]) stack.push_back(ggc);
+                                    } else {
+                                        F.push_back(gc);
+                                        in_F[gc] = 1;
+                                    }
+                                }
+                            }
+                        }
+                    } else {
+                        F.push_back(child);
+                        in_F[child] = 1;
+                    }
+                }
+            }
+        }
+    }
+
+    return {pi, total_cost};
+}
+
+// Route the circuit once in the forward direction, starting from layout `pi`.
+// The returned result stores the starting layout in `pi_initial` and the
+// layout produced by the search in `pi`.
+ForwardRouteResult SabreRouter::route_forward(
+    const std::vector<int>& pi
+) const {
+    ForwardRouteResult trace;
+    trace.pi_initial = pi;
+
+    // No RNG is supplied (nullptr).  The result object itself is handed to the
+    // search through the trailing pointer argument, presumably so that the
+    // search can record its steps into it.
+    auto front_layer = get_initial_layer();
+    auto search_outcome = heuristic_search(
+        front_layer, pi, false, nullptr,
+        canonical_data_fwd_, DAG_, IDAG_,
+        &trace
+    );
+
+    // Only the layout half of the returned pair is kept.
+    trace.pi = std::move(search_outcome.first);
+    return trace;
+}
+
+
+// ---------------------------------------------------------------------------
+// run_trial (full implementation)
+// ---------------------------------------------------------------------------
+
+// Run one layout trial.
+//
+// Samples a starting layout, refines it with `n_iterations` rounds of
+// heuristic_search (a backward pass, followed by a forward pass on every round
+// except the last), then scores the refined layout with one more forward pass
+// that receives no RNG.
+//
+// Returns the refined layout together with the cost reported by the scoring
+// pass.  When `n_iterations` is 0 the sampled layout is returned unrefined.
+TrialResult SabreRouter::run_trial(
+    int trial_idx,
+    const std::vector<int>& seeded_pi,
+    int n_iterations,
+    int n_trials
+) const {
+    // RNG setup: the generator is seeded per trial.  The search passes only
+    // receive it when more than one trial is requested; layout sampling below
+    // always receives it.
+    std::mt19937 rng_gen(config_.random_seed + trial_idx);
+    std::mt19937* rng = (n_trials > 1) ? &rng_gen : nullptr;
+
+    std::vector<int> pi = sample_initial_layout(
+        trial_idx, n_trials, seeded_pi, rng_gen
+    );
+
+    auto F_rev = get_final_layer();
+    auto F_fwd = get_initial_layer();
+
+    // Alternate backward and forward passes; each pass replaces pi with the
+    // layout it returns.
+    for (int iteration = 0; iteration < n_iterations; iteration++) {
+        // Backward pass: reverse canonical data, with DAG_/IDAG_ swapped
+        // relative to the forward call.
+        auto bwd_result = heuristic_search(F_rev, pi, true, rng, canonical_data_rev_, IDAG_, DAG_);
+        pi = std::move(bwd_result.first);
+
+        // Forward pass (skipped on the last iteration so the loop always ends
+        // on a backward pass).
+        if (iteration < n_iterations - 1) {
+            auto fwd_result = heuristic_search(F_fwd, pi, false, rng, canonical_data_fwd_, DAG_, IDAG_);
+            pi = std::move(fwd_result.first);
+        }
+    }
+
+    // Evaluation pass without an RNG to score the trial.  Only the cost is
+    // taken from its result; pi is not reassigned here.
+    auto eval_result = heuristic_search(F_fwd, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_);
+    double cost = eval_result.second;
+
+    // Return the layout from AFTER the backward pass, BEFORE the eval pass.
+    return TrialResult{std::move(pi), cost};
+}
+
+} // namespace squander::routing
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
new file mode 100644
index 000000000..24156b791
--- /dev/null
+++ b/squander/synthesis/PartAM.py
@@ -0,0 +1,3935 @@
+"""
+This is an implementation of Partition Aware Mapping.
+"""
+import csv
+import logging
+import multiprocessing as mp
+import os
+import time
+from collections import deque, defaultdict
+from itertools import combinations, permutations
+from multiprocessing import Pool
+from typing import List, Optional
+
+import numpy as np
+from tqdm import tqdm
+
+from squander.decomposition.qgd_N_Qubit_Decompositions_Wrapper import (
+    qgd_N_Qubit_Decomposition_adaptive as N_Qubit_Decomposition_adaptive,
+    qgd_N_Qubit_Decomposition_Tree_Search as N_Qubit_Decomposition_Tree_Search,
+    qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
+)
+from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+from squander.partitioning.ilp import (
+    get_all_partitions,
+    _get_topo_order,
+    topo_sort_partitions,
+    ilp_global_optimal,
+)
+# Module-level globals for pool workers (set via Pool initializer)
+_worker_config = None
+
+def _init_decompose_worker(config):
+    """Pool initializer: remember ``config`` in this process' module globals.
+
+    The value is stored under the module-level name ``_worker_config``.
+    """
+    globals()["_worker_config"] = config
+
+def _decompose_one(Umtx, mini_topology):
+    """Decompose one unitary inside a pool worker.
+
+    The decomposition config is taken from the module-level ``_worker_config``
+    stored by the pool initializer, so it is not pickled with every task.
+    """
+    # Imported at call time rather than at module import time.
+    from squander.synthesis.PartAM import (
+        qgd_Partition_Aware_Mapping as _mapper_cls,
+    )
+    decompose = _mapper_cls.DecomposePartition_and_Perm
+    return decompose(Umtx, _worker_config, mini_topology)
+
+def _available_cpus():
+    """Count the CPUs this process is allowed to run on.
+
+    Uses the scheduler affinity mask when the platform exposes it, so limits
+    imposed by taskset, cgroups, SLURM, etc. are honoured.  Where
+    ``os.sched_getaffinity`` is missing or fails, ``mp.cpu_count()`` is
+    returned instead.
+    """
+    try:
+        allowed = os.sched_getaffinity(0)
+    except (AttributeError, OSError):
+        return mp.cpu_count()
+    return len(allowed)
+
+
+from squander.synthesis.PartAM_utils import (
+    get_subtopologies_of_type,
+    get_unique_subtopologies,
+    get_canonical_form,
+    get_node_mapping,
+    compute_automorphisms,
+    derive_result_from_automorphism,
+    SingleQubitPartitionResult,
+    PartitionSynthesisResult,
+    PartitionCandidate,
+    PartitionScoreData,
+    check_circuit_compatibility,
+    construct_swap_circuit,
+)
+
+_routing_worker_state = None
+
+
+class _DynamicMappedPartitionCandidate:
+    """Wrap a partition candidate together with a route-time qubit mapping.
+
+    The identifying fields of the wrapped candidate are mirrored on the
+    wrapper, and ``node_mapping`` is copied so later changes to the caller's
+    mapping do not leak in.
+    """
+
+    # Fields copied verbatim from the wrapped candidate.
+    _MIRRORED_FIELDS = (
+        "partition_idx",
+        "topology_idx",
+        "permutation_idx",
+        "cnot_count",
+    )
+
+    def __init__(self, candidate, node_mapping):
+        self.candidate = candidate
+        for field in self._MIRRORED_FIELDS:
+            setattr(self, field, getattr(candidate, field))
+        self.node_mapping = dict(node_mapping)
+
+    def get_final_circuit(self, optimized_partitions, N):
+        """Return ``(remapped flat circuit, parameters)`` for this candidate.
+
+        The circuit and parameters are looked up in the candidate's partition
+        by ``topology_idx`` / ``permutation_idx``; the flattened circuit is
+        then remapped through ``node_mapping`` onto ``N`` qubits.
+        """
+        topo, perm = self.topology_idx, self.permutation_idx
+        partition = optimized_partitions[self.partition_idx]
+        params = partition.synthesised_parameters[topo][perm]
+        flat = partition.synthesised_circuits[topo][perm].get_Flat_Circuit()
+        return flat.Remap_Qbits(self.node_mapping, N), params
+
+
+def _init_layout_trial_worker(state):
+    """Pool initializer for layout trials.
+
+    Builds a mapper for this worker from a copy of ``state["config"]`` with
+    the progress bar disabled, gives it the adjacency lists and an empty swap
+    cache, and stores the mapper plus the trial inputs in the module-level
+    ``_routing_worker_state``.
+    """
+    global _routing_worker_state
+    from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping
+
+    # Copy so that turning the progress bar off stays local to the worker.
+    worker_config = dict(state["config"], progressbar=False)
+
+    mapper = qgd_Partition_Aware_Mapping(worker_config)
+    mapper._adj = [list(neighbors) for neighbors in state["adj"]]
+    mapper._swap_cache = {}
+
+    shared = {
+        "mapper": mapper,
+        "seeded_pi": np.asarray(state["seeded_pi"]),
+    }
+    for key in ("DAG", "IDAG", "layout_partitions", "scoring_partitions"):
+        shared[key] = state[key]
+    shared["D"] = np.asarray(state["D"])
+    for key in ("candidate_cache", "n_iterations", "n_trials", "random_seed"):
+        shared[key] = state[key]
+
+    _routing_worker_state = shared
+
+
+def _run_layout_trial_worker(trial_idx):
+    """Run layout trial ``trial_idx`` with the state cached in this worker.
+
+    All inputs other than ``trial_idx`` come from the module-level
+    ``_routing_worker_state`` and are forwarded by keyword to the mapper's
+    ``_run_single_layout_trial``.
+    """
+    state = _routing_worker_state
+    mapper = state["mapper"]
+    forwarded = (
+        "seeded_pi",
+        "DAG",
+        "IDAG",
+        "layout_partitions",
+        "scoring_partitions",
+        "D",
+        "candidate_cache",
+        "n_iterations",
+        "n_trials",
+        "random_seed",
+    )
+    trial_kwargs = {key: state[key] for key in forwarded}
+    return mapper._run_single_layout_trial(trial_idx=trial_idx, **trial_kwargs)
+# ============================================================================
+# Main Class: qgd_Partition_Aware_Mapping
+# ============================================================================
+
+class qgd_Partition_Aware_Mapping:
+
+    # ------------------------------------------------------------------------
+    # Initialization & Configuration
+    # ------------------------------------------------------------------------
+
+    def __init__(self, config):
+        """Store ``config`` and fill every unset option with its default.
+
+        ``config`` is kept by reference (``self.config is config``) and is
+        mutated in place through ``setdefault``, so the caller's dict ends up
+        holding the filled-in defaults as well.
+
+        Args:
+            config: dict of options.  ``'topology'`` may be omitted, in which
+                case it defaults to ``None``.
+
+        Raises:
+            Exception: if ``config['strategy']`` is not one of
+                ``'TreeSearch'``, ``'TabuSearch'`` or ``'Adaptive'``.
+        """
+        self.config = config
+        # Apply the 'topology' default before reading it, so a config without
+        # that key yields None instead of raising KeyError.
+        self.config.setdefault('topology', None)
+        self.topology = self.config['topology']
+        self.config.setdefault('strategy', 'TreeSearch')
+        self.config.setdefault('parallel', 0)
+        self.config.setdefault('verbosity', 0)
+        self.config.setdefault('tolerance', 1e-8)
+        self.config.setdefault('test_subcircuits', False)
+        self.config.setdefault('test_final_circuit', True)
+        self.config.setdefault('max_partition_size', 3)
+        self.config.setdefault('pack_credit_weight', 0.0)
+        self.config.setdefault('routed', False)
+        self.config.setdefault('optimizer', 'BFGS')
+        self.config.setdefault('use_osr', 0)
+        self.config.setdefault("use_graph_search", 0)
+        self.config.setdefault('n_layout_trials', 1)
+        self.config.setdefault('random_seed', 42)
+        self.config.setdefault('cleanup', True)
+        self.config.setdefault('prefilter_top_k', 50)
+        self.config.setdefault('prefilter_min_per_partition', 2)
+        self.config.setdefault('prefilter_min_3q', 12)
+        self.config.setdefault('cleanup_top_k', 3)
+        self.config.setdefault('decay_delta', 0.001)  # Qiskit LightSABRE DECAY_RATE
+        self.config.setdefault('swap_burst_budget', 5)  # Qiskit LightSABRE DECAY_RESET_INTERVAL
+        self.config.setdefault('path_tiebreak_weight', 0.2)
+        # Neighbor tie-breaker is added to A* f-values normalised to [0, 1];
+        # must stay < 0.5 to preserve swap-count optimality.
+        if self.config['path_tiebreak_weight'] >= 0.5:
+            logging.warning(
+                "path_tiebreak_weight=%.3f ≥ 0.5 may override SWAP-count "
+                "optimality; clamping to 0.49.",
+                self.config['path_tiebreak_weight'],
+            )
+            self.config['path_tiebreak_weight'] = 0.49
+        self.config.setdefault('cnot_cost', 1.0 / 3.0)  # 1 SWAP = 3 CNOTs
+        self.config.setdefault('three_qubit_exit_weight', 1.0)
+        self.config.setdefault('boundary_beam_width', 1)
+        self.config.setdefault('boundary_beam_depth', 1)
+        self.config.setdefault('layout_boundary_beam_width', None)
+        self.config.setdefault('layout_boundary_beam_depth', None)
+        self.config.setdefault('routing_trace_path', None)
+        # NOTE(review): this assignment overrides any caller-supplied value,
+        # so the partition_weight_model check further down can never fail.
+        # Confirm whether setdefault was intended here.
+        self.config['partition_weight_model'] = 'window_turnover'
+        # ILP partition-selection weights. See _parts_to_window_turnover_weights
+        # for the full cost formula; defaults are calibrated against the
+        # synthesis-capacity / width-penalty pair below so saturation rewards
+        # match across widths.
+        self.config.setdefault('partition_density_weight', 1.0)
+        self.config.setdefault('partition_boundary_weight', 0.9)
+        self.config.setdefault('partition_depth_balance_weight', 0.25)
+        self.config.setdefault('partition_depth_balance_exponent', 2.0)
+        self.config.setdefault('partition_triangle_weight', 1.5)
+        self.config.setdefault('partition_triangle_threshold', 0.6)
+        self.config.setdefault('partition_triangle_window_radius', 8)
+        self.config.setdefault('partition_synthesis_cost_weight', 1.0)
+        self.config.setdefault('partition_routing_span_weight', 2.0)
+        self.config.setdefault('partition_turnover_weight', 0.5)
+        # Penalises chain-shaped width>=3 blocks; 0.0 disables it.
+        self.config.setdefault('partition_chain_penalty_weight', 2.0)
+        self.config.setdefault('partition_min_cost', 0.05)
+        # The dict defaults are built per call so instances never share them.
+        self.config.setdefault(
+            'partition_width_penalties',
+            {1: 0.25, 2: 1.0, 3: 4.0, 4: 16.0},
+        )
+        # CNOT lower-bound synthesis budgets (Vidal–Dawson w=2,
+        # Shende–Markov–Bullock w=3, QSD w=4).
+        self.config.setdefault(
+            'partition_synthesis_capacity',
+            {1: 1, 2: 3, 3: 14, 4: 61},
+        )
+        self.config.setdefault('parallel_layout_trials', False)
+        self.config.setdefault('layout_trial_workers', 0)
+
+        strategy = self.config['strategy']
+        allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
+        if strategy not in allowed_strategies:
+            raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
+        allowed_partition_weight_models = ['window_turnover', 'ilp']
+        if self.config['partition_weight_model'] not in allowed_partition_weight_models:
+            raise Exception(
+                f"The partition_weight_model should be either of "
+                f"{allowed_partition_weight_models}, got "
+                f"{self.config['partition_weight_model']}."
+            )
+
+        # Initialize caches for performance optimization
+        self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
+        self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
+        self._adj = None          # Precomputed adjacency list (built by compute_distances_bfs)
+        self._decomp_cache = {}   # {(rounded unitary bytes, topology): synthesis result}
+
+    # ------------------------------------------------------------------------
+    # Caching Methods
+    # ------------------------------------------------------------------------
+
+    def _get_subtopologies_of_type_cached(self, mini_topology):
+        """Memoised front-end for ``get_subtopologies_of_type``.
+
+        Results are stored in ``self._topology_cache`` under the canonical
+        form of ``mini_topology``.  An edge list that mentions no qubits
+        yields ``[]`` without touching the cache.
+        """
+        endpoints = {q for u, v in mini_topology for q in (u, v)}
+        if not endpoints:
+            return []
+
+        # The canonical form is used as the cache key.
+        key = get_canonical_form(endpoints, mini_topology)
+        cache = self._topology_cache
+        if key not in cache:
+            cache[key] = get_subtopologies_of_type(self.topology, mini_topology)
+        return cache[key]
+
+    # ------------------------------------------------------------------------
+    # Static Synthesis Helpers (extracted from SynthesizeWideCircuit)
+    # ------------------------------------------------------------------------
+
+    @staticmethod
+    def _part_support_and_active_pairs(part, gate_dict):
+        """Collect the qubits and the interacting qubit pairs of one part.
+
+        Gate indices that are missing from ``gate_dict`` (or map to ``None``)
+        are ignored.  Every unordered pair of qubits sharing a gate is
+        reported once as ``(low, high)``.
+
+        Returns:
+            ``(support, active_pairs)`` as two frozensets.
+        """
+        touched = set()
+        coupled = set()
+        for gate_idx in part:
+            gate = gate_dict.get(gate_idx)
+            if gate is None:
+                continue
+            gate_qubits = list(gate.get_Involved_Qbits())
+            touched.update(gate_qubits)
+            # combinations() yields nothing for gates on fewer than two qubits.
+            coupled.update(
+                (min(x, y), max(x, y))
+                for x, y in combinations(gate_qubits, 2)
+            )
+        return frozenset(touched), frozenset(coupled)
+
+    @staticmethod
+    def _two_qubit_gate_pair(gate):
+        """Return the gate's qubits as ``(low, high)``, or ``None``.
+
+        ``None`` is returned for any gate that does not act on exactly two
+        qubits.
+        """
+        qubits = tuple(gate.get_Involved_Qbits())
+        if len(qubits) == 2:
+            first, second = qubits
+            return (min(first, second), max(first, second))
+        return None
+
+    @staticmethod
+    def _part_two_qubit_gate_count(part, gate_dict):
+        """Count the gates of ``part`` that act on exactly two qubits.
+
+        Gate indices that are missing from ``gate_dict`` (or map to ``None``)
+        do not contribute.
+        """
+        pair_of = qgd_Partition_Aware_Mapping._two_qubit_gate_pair
+        gates = (gate_dict.get(gate_idx) for gate_idx in part)
+        return sum(
+            1
+            for gate in gates
+            if gate is not None and pair_of(gate) is not None
+        )
+
+    @staticmethod
+    def _synthesis_capacity(width, capacities=None):
+        """CNOT budget assumed for a generic ``width``-qubit unitary.
+
+        Lookup order:
+          1. ``capacities[width]``, then ``capacities[str(width)]`` when
+             ``capacities`` is a dict (string keys cover configs whose keys
+             are strings); the configured value is floored at 1.
+          2. Built-in values: width <= 1 → 1, 2 → 3, 3 → 14, 4 → 61.
+          3. For wider blocks, ``61 · 4^(width − 4)``.
+
+        With ``capacities=None`` the built-in table is used for step 1 too.
+        The width-1 value of 1 keeps density ratios free of division by zero.
+        The result is always a float.
+        """
+        table = {1: 1, 2: 3, 3: 14, 4: 61} if capacities is None else capacities
+
+        if isinstance(table, dict):
+            configured = table.get(width)
+            if configured is None:
+                configured = table.get(str(width))
+            if configured is not None:
+                return float(max(configured, 1))
+
+        if width <= 1:
+            return 1.0
+        builtin = {2: 3.0, 3: 14.0, 4: 61.0}.get(width)
+        if builtin is not None:
+            return builtin
+        return 61.0 * (4.0 ** (width - 4))
+
+    @staticmethod
+    def _configured_width_penalty(width, penalties):
+        """Penalty charged for synthesising a block of ``width`` qubits.
+
+        Lookup order:
+          1. ``penalties[width]``, then ``penalties[str(width)]`` when
+             ``penalties`` is a dict.
+          2. Built-in values: width <= 1 → 0.25, 2 → 1, 3 → 4, 4 → 16.
+          3. For wider blocks, ``16 · 4^(width − 4)``.
+
+        With ``penalties=None`` the built-in table is used for step 1 too.
+        The result is always a float.
+        """
+        table = (
+            {1: 0.25, 2: 1.0, 3: 4.0, 4: 16.0}
+            if penalties is None
+            else penalties
+        )
+
+        if isinstance(table, dict):
+            configured = table.get(width)
+            if configured is None:
+                configured = table.get(str(width))
+            if configured is not None:
+                return float(configured)
+
+        if width <= 1:
+            return 0.25
+        builtin = {2: 1.0, 3: 4.0, 4: 16.0}.get(width)
+        if builtin is not None:
+            return builtin
+        return 16.0 * (4.0 ** (width - 4))
+
+    @staticmethod
+    def _restricted_longest_path_depth(nodes, g, rg, topo_order):
+        """Length, in gates, of the longest chain lying inside ``nodes``.
+
+        Gates are visited in ``topo_order``; a gate's depth is one more than
+        the deepest of its predecessors (from ``rg``) that also belong to
+        ``nodes``.  ``g`` is accepted but not consulted.  Returns 0 for an
+        empty node set.
+        """
+        members = set(nodes)
+        if not members:
+            return 0
+
+        level = {}
+        longest = 0
+        for gate_idx in topo_order:
+            if gate_idx not in members:
+                continue
+            deepest_pred = max(
+                (
+                    level.get(pred, 0)
+                    for pred in rg.get(gate_idx, ())
+                    if pred in members
+                ),
+                default=0,
+            )
+            here = deepest_pred + 1
+            level[gate_idx] = here
+            if here > longest:
+                longest = here
+        return longest
+
+    @staticmethod
+    def _boundary_two_qubit_gate_set(part, support, g, rg, gate_dict,
+                                     max_partition_size):
+        """Adjacent two-qubit gates that this candidate leaves across its boundary.
+
+        A gate qualifies when it is a DAG neighbour (successor via ``g`` or
+        predecessor via ``rg``) of some gate in ``part``, is not itself in
+        ``part``, acts on exactly two qubits, shares at least one qubit with
+        ``support``, and, when ``max_partition_size`` is not ``None``, the
+        combined support would still fit within that size.
+        """
+        support = set(support)
+        pair_of = qgd_Partition_Aware_Mapping._two_qubit_gate_pair
+
+        def crosses_boundary(other_idx):
+            if other_idx in part:
+                return False
+            gate = gate_dict.get(other_idx)
+            if gate is None or pair_of(gate) is None:
+                return False
+            other_support = set(gate.get_Involved_Qbits())
+            if support.isdisjoint(other_support):
+                return False
+            if max_partition_size is None:
+                return True
+            return len(support | other_support) <= max_partition_size
+
+        crossing = set()
+        for gate_idx in part:
+            adjacent = set(g.get(gate_idx, ())) | set(rg.get(gate_idx, ()))
+            crossing.update(filter(crosses_boundary, adjacent))
+        return crossing
+
+    @staticmethod
+    def _boundary_two_qubit_gate_count(part, support, g, rg, gate_dict,
+                                       max_partition_size):
+        """Number of adjacent two-qubit gates left across the candidate's boundary.
+
+        This is the size of the set returned by
+        ``_boundary_two_qubit_gate_set`` for the same arguments.
+        """
+        boundary_gates = qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_set(
+            part, support, g, rg, gate_dict, max_partition_size,
+        )
+        return len(boundary_gates)
+
+    @staticmethod
+    def _pair_counts_in_topological_window(part, topo_order, topo_index,
+                                           gate_dict, radius):
+        """Count two-qubit interactions per qubit pair near ``part``.
+
+        The window spans the topological positions of the part's gates,
+        widened by ``radius`` on both sides and clipped to ``topo_order``.
+        Every two-qubit gate inside the window (not only those in ``part``)
+        adds one to its ``(low, high)`` pair.
+
+        Returns:
+            ``defaultdict(int)`` mapping pair → count; empty when ``part`` is
+            empty or none of its gates appear in ``topo_index``.
+        """
+        tally = defaultdict(int)
+        if not part:
+            return tally
+        positions = [topo_index[idx] for idx in part if idx in topo_index]
+        if not positions:
+            return tally
+
+        first = max(0, min(positions) - radius)
+        last = min(len(topo_order) - 1, max(positions) + radius)
+        pair_of = qgd_Partition_Aware_Mapping._two_qubit_gate_pair
+        for pos in range(first, last + 1):
+            gate = gate_dict.get(topo_order[pos])
+            pair = None if gate is None else pair_of(gate)
+            if pair is not None:
+                tally[pair] += 1
+        return tally
+
+    @staticmethod
+    def _triangle_density_from_pair_counts(support, pair_counts):
+        """Score in [0, 1] for how balanced the best qubit triangle is.
+
+        For every triple of qubits in ``support`` the three pairwise counts
+        are read from ``pair_counts``.  A triple with any missing edge scores
+        nothing (so a chain scores 0); otherwise its score is
+        ``3 · weakest / total``, which is 1 for three equally used edges and
+        drops as the weakest edge's share shrinks.  The best triple's score is
+        returned; fewer than three qubits gives 0.
+        """
+        qubits = sorted(support)
+        if len(qubits) < 3:
+            return 0.0
+
+        def edge_count(x, y):
+            return pair_counts.get((min(x, y), max(x, y)), 0)
+
+        best = 0.0
+        for a, b, c in combinations(qubits, 3):
+            edges = (edge_count(a, b), edge_count(a, c), edge_count(b, c))
+            weakest = min(edges)
+            if weakest <= 0:
+                continue
+            total = sum(edges)
+            if total <= 0:
+                continue
+            score = min((3.0 * weakest) / float(total), 1.0)
+            if score > best:
+                best = score
+        return best
+
+    @staticmethod
+    def _turnover_between_supports(support_a, support_b):
+        """Qubit turnover between two supports, or ``None`` if undefined.
+
+        Turnover is the size of the smaller support minus the number of
+        qubits the two supports share.  It is only defined when both supports
+        contain at least two qubits.
+        """
+        size_a = len(support_a)
+        if size_a < 2:
+            return None
+        size_b = len(support_b)
+        if size_b < 2:
+            return None
+        shared = len(support_a & support_b)
+        return min(size_a, size_b) - shared
+
+    @staticmethod
+    def _average_turnover(part_idx, part, neighbor_gate_sets,
+                          gate_to_parts, allparts, supports):
+        """Mean turnover between one part and the parts covering its neighbours.
+
+        For every neighbouring gate outside ``part``, each other candidate
+        part containing that gate is considered, provided it shares no gate
+        with ``part``.  Pairs whose turnover is undefined (a support with
+        fewer than two qubits) are left out.
+
+        Returns:
+            The arithmetic mean of the collected turnovers, or ``None`` when
+            none were collected.
+        """
+        turnover_of = qgd_Partition_Aware_Mapping._turnover_between_supports
+
+        def collected():
+            for gate_set in neighbor_gate_sets:
+                for gate_idx in gate_set - part:
+                    for other_idx in gate_to_parts.get(gate_idx, ()):
+                        if other_idx == part_idx:
+                            continue
+                        # Overlapping parts are skipped.
+                        if part & allparts[other_idx]:
+                            continue
+                        value = turnover_of(
+                            supports[part_idx], supports[other_idx]
+                        )
+                        if value is not None:
+                            yield value
+
+        values = list(collected())
+        if not values:
+            return None
+        return sum(values) / len(values)
+
+    @staticmethod
+    def _parts_to_window_turnover_weights(allparts, gate_dict, g,
+                                          pack_credit_weight=0.0,
+                                          config=None,
+                                          max_partition_size=None,
+                                          topology_distances=None,
+                                          seed_layout=None):
+        """Linear ILP weights for local block quality.
+
+        The ILP accepts one linear cost per candidate part, so pairwise
+        interactions are approximated locally.  Lower cost is better.
+
+        Core cost terms:
+          * ``synthesis_cost_weight · width_penalty[width]`` — non-linear
+            penalty for synthesising a wider unitary block.
+          * ``− density_weight · (k_2q / synthesis_capacity[width])`` —
+            capacity-normalised density reward.  Each width has the same
+            saturation level (1.0), implicitly pricing that wider partitions
+            don't compress to zero body CNOTs.
+          * ``boundary_weight · boundary_crossings`` — penalises adjacent 2q
+            gates left across this candidate's boundary.
+          * Triangle bonus (only above a density threshold), depth-balance
+            penalty, optional routing-span penalty (heavily weighted to make
+            topology-spread wide partitions visibly expensive), and optional
+            turnover penalty as documented per knob below.
+
+        When ``topology_distances`` is supplied, also adds
+        ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over the part's active 2q
+        pairs.  When ``seed_layout`` is also supplied, ``D`` is permuted
+        through the layout so the span penalty reflects *physical* qubit
+        distance under the routing layer's chosen placement.  When
+        ``turnover_weight`` is non-zero, also adds
+        ``turnover_weight · avg_turnover`` averaging
+        ``min(|supp_p|, |supp_q|) − |supp_p ∩ supp_q|`` over candidate
+        partitions ``q`` immediately downstream of ``p`` in the gate DAG.
+        """
+        cfg = {} if config is None else config
+        if max_partition_size is None:
+            max_partition_size = cfg.get("max_partition_size")
+
+        density_weight = float(cfg.get("partition_density_weight", 4.0))
+        boundary_weight = float(cfg.get("partition_boundary_weight", 0.9))
+        depth_balance_weight = float(
+            cfg.get("partition_depth_balance_weight", 0.25)
+        )
+        depth_balance_exponent = float(
+            cfg.get("partition_depth_balance_exponent", 2.0)
+        )
+        triangle_weight = float(cfg.get("partition_triangle_weight", 2.5))
+        triangle_threshold = float(
+            cfg.get("partition_triangle_threshold", 0.6)
+        )
+        triangle_threshold = min(max(triangle_threshold, 0.0), 1.0)
+        triangle_window_radius = max(
+            int(cfg.get("partition_triangle_window_radius", 8)),
+            0,
+        )
+        synthesis_cost_weight = float(
+            cfg.get("partition_synthesis_cost_weight", 1.0)
+        )
+        routing_span_weight = float(
+            cfg.get("partition_routing_span_weight", 0.0)
+        )
+        turnover_weight = float(cfg.get("partition_turnover_weight", 0.0))
+        chain_penalty_weight = float(
+            cfg.get("partition_chain_penalty_weight", 0.0)
+        )
+        min_cost = float(cfg.get("partition_min_cost", 0.05))
+        width_penalties = cfg.get("partition_width_penalties")
+        synthesis_capacities = cfg.get("partition_synthesis_capacity")
+
+        use_routing_span = (
+            topology_distances is not None and routing_span_weight
+        )
+        if topology_distances is not None and seed_layout is not None:
+            pi_arr = np.asarray(seed_layout, dtype=int)
+            layout_distances = topology_distances[np.ix_(pi_arr, pi_arr)]
+        else:
+            layout_distances = topology_distances
+        inf_distance_cap = float(
+            max(len(layout_distances) - 1, 1)
+        ) if layout_distances is not None else 0.0
+
+        N = max(len(allparts), 1)
+        supports = []
+        active_pairs_list = []
+        for part in allparts:
+            support, active_pairs = (
+                qgd_Partition_Aware_Mapping._part_support_and_active_pairs(
+                    part,
+                    gate_dict,
+                )
+            )
+            supports.append(support)
+            active_pairs_list.append(active_pairs)
+
+        rg = {gate_idx: set() for gate_idx in g}
+        for src, dsts in g.items():
+            for dst in dsts:
+                rg.setdefault(dst, set()).add(src)
+
+        use_turnover = turnover_weight != 0.0
+        if use_turnover:
+            gate_to_parts = defaultdict(list)
+            for idx, part in enumerate(allparts):
+                for gate_idx in part:
+                    gate_to_parts[gate_idx].append(idx)
+            successor_gate_sets = []
+            for part in allparts:
+                downstream = set()
+                for gate_idx in part:
+                    downstream.update(g.get(gate_idx, ()))
+                successor_gate_sets.append(downstream)
+        else:
+            gate_to_parts = None
+            successor_gate_sets = None
+
+        gate_to_qubit = {
+            gate_idx: set(gate.get_Involved_Qbits())
+            for gate_idx, gate in gate_dict.items()
+            if gate is not None
+        }
+        topo_order = _get_topo_order(g, rg, gate_to_qubit) if g else []
+        topo_index = {gate_idx: idx for idx, gate_idx in enumerate(topo_order)}
+        global_depth = max(
+            qgd_Partition_Aware_Mapping._restricted_longest_path_depth(
+                set(g), g, rg, topo_order
+            ),
+            1,
+        )
+
+
+        weights = []
+        for part_idx, part in enumerate(allparts):
+            support = supports[part_idx]
+            width = len(support)
+            width_penalty = (
+                qgd_Partition_Aware_Mapping._configured_width_penalty(
+                    width, width_penalties
+                )
+            )
+            two_qubit_gate_count = (
+                qgd_Partition_Aware_Mapping._part_two_qubit_gate_count(
+                    part, gate_dict
+                )
+            )
+            block_density = (
+                two_qubit_gate_count
+                / qgd_Partition_Aware_Mapping._synthesis_capacity(
+                    width, synthesis_capacities
+                )
+            )
+            boundary_crossings = (
+                qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_count(
+                    part,
+                    support,
+                    g,
+                    rg,
+                    gate_dict,
+                    max_partition_size,
+                )
+            )
+            if use_routing_span:
+                span_cost = 0.0
+                for u, v in active_pairs_list[part_idx]:
+                    d = layout_distances[u][v]
+                    if not np.isfinite(d):
+                        d = inf_distance_cap
+                    span_cost += max(float(d) - 1.0, 0.0)
+            else:
+                span_cost = 0.0
+            if use_turnover:
+                avg_turnover = (
+                    qgd_Partition_Aware_Mapping._average_turnover(
+                        part_idx,
+                        part,
+                        [successor_gate_sets[part_idx]],
+                        gate_to_parts,
+                        allparts,
+                        supports,
+                    )
+                )
+                turnover_cost = 0.0 if avg_turnover is None else float(avg_turnover)
+            else:
+                turnover_cost = 0.0
+            pair_counts = (
+                qgd_Partition_Aware_Mapping._pair_counts_in_topological_window(
+                    part,
+                    topo_order,
+                    topo_index,
+                    gate_dict,
+                    triangle_window_radius,
+                )
+            )
+            triangle_density = (
+                qgd_Partition_Aware_Mapping._triangle_density_from_pair_counts(
+                    support,
+                    pair_counts,
+                )
+            )
+            if triangle_threshold >= 1.0:
+                triangle_bonus = 0.0
+            else:
+                triangle_bonus = triangle_weight * max(
+                    triangle_density - triangle_threshold,
+                    0.0,
+                ) / (1.0 - triangle_threshold)
+
+            if (
+                chain_penalty_weight
+                and width >= 3
+                and triangle_threshold > 0.0
+            ):
+                chain_deficit = max(
+                    triangle_threshold - triangle_density, 0.0
+                ) / triangle_threshold
+                chain_penalty = (
+                    chain_penalty_weight * chain_deficit * (width - 2)
+                )
+            else:
+                chain_penalty = 0.0
+
+            internal_depth = (
+                qgd_Partition_Aware_Mapping._restricted_longest_path_depth(
+                    part, g, rg, topo_order
+                )
+            )
+            depth_fraction = internal_depth / float(global_depth)
+            depth_penalty = (
+                depth_balance_weight
+                * (depth_fraction ** depth_balance_exponent)
+                * max(width_penalty, 1.0)
+            )
+
+            density_bonus = density_weight * block_density
+            if pack_credit_weight:
+                density_bonus += (
+                    pack_credit_weight
+                    * block_density
+                    * max(two_qubit_gate_count - 1, 0)
+                )
+
+            conceptual_cost = (
+                synthesis_cost_weight * width_penalty
+                + boundary_weight * boundary_crossings
+                + routing_span_weight * span_cost
+                + turnover_weight * turnover_cost
+                + chain_penalty
+                + depth_penalty
+                - density_bonus
+                - triangle_bonus
+            )
+            conceptual_cost = max(conceptual_cost, min_cost)
+            weights.append((conceptual_cost - 1.0) / N)
+        return weights
+
+    @staticmethod
+    def _topo_key(mini_topology):
+        return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
+
+    @staticmethod
+    def _cache_key(Umtx, mini_topology):
+        topo_key = tuple(sorted(tuple(sorted(e)) for e in mini_topology))
+        return (np.round(Umtx, decimals=10).tobytes(), topo_key)
+
+    @staticmethod
+    def _get_auts(mini_topo, aut_cache):
+        key = tuple(sorted(tuple(sorted(e)) for e in mini_topo))
+        if key not in aut_cache:
+            aut_cache[key] = compute_automorphisms(mini_topo)
+        return aut_cache[key]
+
+    @staticmethod
+    def _build_permuted_unitary(meta, P_i, P_o):
+        N = meta['N']
+        circ_tmp = Circuit(N)
+        circ_tmp.add_Permutation(list(P_i))
+        circ_tmp.add_Circuit(meta['circuit'])
+        circ_tmp.add_Permutation(list(P_o))
+        return circ_tmp.get_Matrix(meta['params'])
+
+    @staticmethod
+    def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
+                              topology_idx, N, mini_topology, known_pairs, pair_key,
+                              use_auts, aut_cache):
+        """Add a synthesis result and derive automorphism equivalents."""
+        result.add_result(perm_pair, synth_circuit, synth_params, topology_idx)
+        if use_auts:
+            if pair_key not in known_pairs:
+                known_pairs[pair_key] = set()
+            known_pairs[pair_key].add(perm_pair)
+            P_i, P_o = perm_pair
+            auts = qgd_Partition_Aware_Mapping._get_auts(mini_topology, aut_cache)
+            identity = tuple(range(N))
+            for sigma in auts:
+                if sigma == identity:
+                    continue
+                new_P_i, new_P_o, new_circ, new_params = derive_result_from_automorphism(
+                    sigma, P_i, P_o, synth_circuit, synth_params, N
+                )
+                if (new_P_i, new_P_o) not in known_pairs[pair_key]:
+                    result.add_result((new_P_i, new_P_o), new_circ, new_params, topology_idx)
+                    known_pairs[pair_key].add((new_P_i, new_P_o))
+
+    @staticmethod
+    def _qiskit_routing_fallback(meta, mini_topology):
+        """Route original partition circuit on mini_topology using Qiskit transpiler.
+
+        Called when unitary synthesis fails to reach tolerance.  Routes the
+        original (un-permuted) circuit and returns it with identity P_i/P_o.
+        Returns (circuit, params) or (None, None) if Qiskit is unavailable or
+        routing fails.
+        """
+        try:
+            from squander.IO_interfaces.Qiskit_IO import get_Qiskit_Circuit, convert_Qiskit_to_Squander
+            from qiskit.compiler import transpile
+            from qiskit.transpiler import CouplingMap
+        except ImportError:
+            return None, None
+
+        try:
+            qk_circ = get_Qiskit_Circuit(meta['circuit'], meta['params'])
+            edges = []
+            for u, v in mini_topology:
+                edges.append([u, v])
+                edges.append([v, u])
+            coupling_map = CouplingMap(couplinglist=edges)
+            qk_routed = transpile(
+                qk_circ,
+                coupling_map=coupling_map,
+                optimization_level=1,
+                basis_gates=['cx', 'u3'],
+            )
+            return convert_Qiskit_to_Squander(qk_routed)
+        except Exception as exc:
+            logging.warning("Qiskit routing fallback failed: %s", exc)
+            return None, None
+
+    def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[PartitionScoreData]]:
+        """
+        Create lightweight, picklable views of partitions that contain only the
+        data required during heuristic scoring.
+        """
+        scoring_partitions: List[Optional[PartitionScoreData]] = []
+        for partition in optimized_partitions:
+            if isinstance(partition, SingleQubitPartitionResult):
+                scoring_partitions.append(None)
+                continue
+
+            mini_topologies = tuple(
+                tuple(tuple(edge) for edge in mini_topology)
+                for mini_topology in partition.mini_topologies
+            )
+
+            topology_candidates = []
+            for tdx, mini_topology in enumerate(partition.mini_topologies):
+                if hasattr(partition, "get_topology_candidates"):
+                    candidates = partition.get_topology_candidates(tdx)
+                else:
+                    candidates = self._get_subtopologies_of_type_cached(mini_topology)
+                topology_candidates.append(
+                    tuple(tuple(edge) for edge in candidates)
+                )
+
+            permutations_pairs = tuple(
+                tuple((tuple(P_i), tuple(P_o)) for (P_i, P_o) in partition.permutations_pairs[tdx])
+                for tdx in range(len(partition.mini_topologies))
+            )
+
+            circuit_structures = tuple(
+                tuple(tuple(struct) for struct in partition.circuit_structures[tdx])
+                for tdx in range(len(partition.mini_topologies))
+            )
+            cnot_counts = tuple(
+                tuple(int(cnot) for cnot in partition.cnot_counts[tdx])
+                for tdx in range(len(partition.mini_topologies))
+            )
+
+            scoring_partitions.append(
+                PartitionScoreData(
+                    mini_topologies=mini_topologies,
+                    topology_candidates=tuple(topology_candidates),
+                    permutations_pairs=permutations_pairs,
+                    circuit_structures=circuit_structures,
+                    cnot_counts=cnot_counts,
+                    qubit_map=dict(partition.qubit_map),
+                    involved_qbits=tuple(partition.involved_qbits),
+                )
+            )
+        return scoring_partitions
+    @staticmethod
+    def _partition_is_single(partition):
+        if isinstance(partition, dict):
+            return partition.get("is_single", False)
+        return isinstance(partition, SingleQubitPartitionResult)
+
+
+    @staticmethod
+    def _partition_involved_qbits(partition):
+        if isinstance(partition, dict):
+            return partition["involved_qbits"]
+        return partition.involved_qbits
+
+
+    @staticmethod
+    def _build_layout_partition_info(optimized_partitions):
+        return [
+            {
+                "is_single": isinstance(
+                    partition, SingleQubitPartitionResult
+                ),
+                "involved_qbits": tuple(partition.involved_qbits),
+            }
+            for partition in optimized_partitions
+        ]
+    def _build_partition_candidate_cache(self, scoring_partitions):
+        """
+        Precompute all PartitionCandidate objects once, grouped by partition_idx.
+
+        Returns:
+            tuple where candidate_cache[partition_idx] is a tuple of
+            PartitionCandidate objects for that partition. Single-qubit
+            partitions get an empty tuple.
+        """
+        candidate_cache = []
+
+        for partition_idx, partition in enumerate(scoring_partitions):
+            if partition is None:
+                candidate_cache.append(())
+                continue
+
+            cached_candidates = []
+            for tdx, mini_topology in enumerate(partition.mini_topologies):
+                topology_candidates = partition.topology_candidates[tdx]
+                permutation_pairs = partition.permutations_pairs[tdx]
+                circuit_structures = partition.circuit_structures[tdx]
+                cnot_counts = partition.cnot_counts[tdx]
+
+                for topology_candidate in topology_candidates:
+                    for pdx, permutation_pair in enumerate(permutation_pairs):
+                        circuit_structure = circuit_structures[pdx]
+                        cached_candidates.append(
+                            PartitionCandidate(
+                                partition_idx,
+                                tdx,
+                                pdx,
+                                circuit_structure,
+                                permutation_pair[0],
+                                permutation_pair[1],
+                                topology_candidate,
+                                mini_topology,
+                                partition.qubit_map,
+                                partition.involved_qbits,
+                                cnot_count=cnot_counts[pdx],
+                            )
+                        )
+
+            candidate_cache.append(tuple(cached_candidates))
+
+        return tuple(candidate_cache)
+    # ------------------------------------------------------------------------
+    # Partition Decomposition Methods
+    # ------------------------------------------------------------------------
+
+    @staticmethod
+    def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None, max_retries: int = 5) -> Circuit:
+        """
+        Call to decompose a partition. Retries up to max_retries times if the
+        decomposition error exceeds the configured tolerance.  Returns the
+        best-error attempt across all retries and logs a warning when no
+        attempt reaches ``config["tolerance"]``.
+        """
+        tolerance = config["tolerance"]
+        strategy = config["strategy"]
+
+        best_err = float('inf')
+        best_circuit = None
+        best_params = None
+
+        for attempt in range(max_retries):
+            if strategy == "TreeSearch":
+                cDecompose = N_Qubit_Decomposition_Tree_Search(Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+            elif strategy == "TabuSearch":
+                cDecompose = N_Qubit_Decomposition_Tabu_Search(Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+            elif strategy == "Adaptive":
+                cDecompose = N_Qubit_Decomposition_adaptive(Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology)
+            else:
+                raise Exception(f"Unsupported decomposition type: {strategy}")
+            cDecompose.set_Verbose(config["verbosity"])
+            cDecompose.set_Cost_Function_Variant(3)
+            cDecompose.set_Optimization_Tolerance(tolerance)
+            cDecompose.set_Optimizer(config["optimizer"])
+            cDecompose.Start_Decomposition()
+
+            err = cDecompose.get_Decomposition_Error()
+            if err < best_err:
+                best_err = err
+                best_circuit = cDecompose.get_Circuit()
+                best_params = cDecompose.get_Optimized_Parameters()
+
+            if best_err <= tolerance:
+                break
+
+        return best_circuit, best_params, best_err
+
+    # ------------------------------------------------------------------------
+    # Circuit Synthesis
+    # ------------------------------------------------------------------------
+
+    def SynthesizeWideCircuit(self, circ, orig_parameters):
+        """
+        Partition and synthesize a full circuit.
+
+        Flow:
+            1) Enumerate candidate partitions.
+            2) ILP-select the minimum-count non-overlapping cover (uniform weights).
+            3) Synthesize only the selected partitions via SeqPAM (two-stage P_i/P_o
+               sweep over mini_topologies, executed by _run_parallel_synthesis).
+
+        Args:
+            circ: The full quantum circuit (must be flat — no subcircuit blocks)
+            orig_parameters: Parameters for circ
+
+        Returns:
+            optimized_partitions: List of PartitionSynthesisResult /
+                SingleQubitPartitionResult, in topological order.
+        """
+        working_circ = circ
+        working_parameters = orig_parameters
+        qbit_num = circ.get_Qbit_Num()
+
+        # ---- Phase 0: Compute distance matrix ----
+        D = self.compute_distances_bfs(qbit_num)
+
+        # ---- Phase 0b: Compute seed layout for layout-aware scoring ----
+        # Empty partitions list makes _compute_seeded_layout skip the
+        # partition-weighted greedy fallback; it returns identity if VF2
+        # and SabrePreLayout-augmented VF2 both fail (safe no-op).
+        seed_layout = self._compute_seeded_layout([], D, qbit_num, working_circ)
+
+        # ---- Phase 1: Partition enumeration ----
+        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
+        gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
+
+        single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
+        single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
+        single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
+
+        # ---- Phase 2: ILP partition selection ----
+        # PartAM keeps one partitioning strategy: window_turnover.
+        ilp_weights = self._parts_to_window_turnover_weights(
+            allparts,
+            gate_dict,
+            g,
+            pack_credit_weight=self.config['pack_credit_weight'],
+            config=self.config,
+            max_partition_size=self.config["max_partition_size"],
+            topology_distances=D,
+            seed_layout=seed_layout,
+        )
+        partition_weight_model = self.config['partition_weight_model']
+        if partition_weight_model == 'ilp':
+            ilp_weights = None
+        L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
+
+        # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----
+        selected_surrounded_starts = set()
+        selected_parts_gates = []
+        for i in L_parts:
+            part = allparts[i]
+            surrounded = {t for s in part for t in go[s]
+                          if t in single_qubit_chains_prepost
+                          and go[single_qubit_chains_prepost[t][-1]]
+                          and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
+            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded))
+            selected_parts_gates.append(gates)
+            selected_surrounded_starts.update(surrounded)
+
+        standalone_chains = []
+        for chain in single_qubit_chains:
+            if chain[0] not in selected_surrounded_starts:
+                selected_parts_gates.append(frozenset(chain))
+                standalone_chains.append(chain)
+
+        n_multi = len(L_parts)
+
+        size_counts = {}
+        for gates in selected_parts_gates:
+            involved = set()
+            for g in gates:
+                involved.update(gate_dict[g].get_Involved_Qbits())
+            size = len(involved)
+            size_counts[size] = size_counts.get(size, 0) + 1
+        self._selected_partition_counts = dict(size_counts)
+        if self.config.get('verbosity', 0) > 0:
+            selected_multi = sum(
+                count for size, count in size_counts.items() if size > 1
+            )
+            logging.info(
+                "Selected partitions: 2-qubit=%d, 3-qubit=%d, total_multi=%d",
+                size_counts.get(2, 0),
+                size_counts.get(3, 0),
+                selected_multi,
+            )
+
+        # ---- Phase 4: Assemble partitioned circuit from selected partitions only ----
+        partitioned_circuit = Circuit(qbit_num)
+        params = []
+
+        for gates in selected_parts_gates[:n_multi]:
+            c = Circuit(qbit_num)
+            for gate_idx in _get_topo_order({x: go[x] & gates for x in gates},
+                                            {x: rgo[x] & gates for x in gates},
+                                            gate_to_qubit):
+                c.add_Gate(gate_dict[gate_idx])
+                start = gate_dict[gate_idx].get_Parameter_Start_Index()
+                params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+            partitioned_circuit.add_Circuit(c)
+
+        for chain in standalone_chains:
+            c = Circuit(qbit_num)
+            for gate_idx in chain:
+                c.add_Gate(gate_dict[gate_idx])
+                start = gate_dict[gate_idx].get_Parameter_Start_Index()
+                params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+            partitioned_circuit.add_Circuit(c)
+
+        parameters = np.concatenate(params, axis=0)
+
+        # ---- Phase 5: SeqPAM synthesis on selected partitions only ----
+        subcircuits = partitioned_circuit.get_Gates()
+        optimized_results = [None] * len(subcircuits)
+        partition_meta = []
+        for partition_idx, subcircuit in enumerate(subcircuits):
+            start_idx = subcircuit.get_Parameter_Start_Index()
+            end_idx = start_idx + subcircuit.get_Parameter_Num()
+            subcircuit_parameters = parameters[start_idx:end_idx]
+            involved_qbits = subcircuit.get_Qbits()
+            qbit_num_sub = len(involved_qbits)
+            qbit_map = {involved_qbits[idx]: idx for idx in range(len(involved_qbits))}
+            remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num_sub)
+
+            if qbit_num_sub == 1:
+                optimized_results[partition_idx] = SingleQubitPartitionResult(
+                    remapped_subcircuit, subcircuit_parameters,
+                    original_qubits=list(involved_qbits)
+                )
+                partition_meta.append(None)
+            else:
+                mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
+                partition_meta.append({
+                    'N': qbit_num_sub,
+                    'circuit': remapped_subcircuit,
+                    'params': subcircuit_parameters,
+                    'mini_topologies': mini_topologies,
+                    'involved_qbits': involved_qbits,
+                    'qbit_map': qbit_map,
+                })
+
+        results_map = self._run_parallel_synthesis(partition_meta)
+        for partition_idx, result in results_map.items():
+            optimized_results[partition_idx] = result
+
+        # ---- Phase 6: Topologically order selected partitions ----
+        L = topo_sort_partitions(working_circ, selected_parts_gates)
+        return [optimized_results[idx] for idx in L]
+
+    def _run_parallel_synthesis(self, partition_meta):
+        """Phase 2: Run parallel synthesis for all multi-qubit partitions.
+
+        Args:
+            partition_meta: List of per-partition dicts (None for single-qubit partitions).
+
+        Returns:
+            results_map: Dict mapping partition_idx to PartitionSynthesisResult.
+        """
+        n_cpus = _available_cpus()
+        use_auts = self.config.get('use_automorphisms', True)
+        disable_pbar = self.config.get('progressbar', 0) == False
+        aut_cache = {}
+        decomp_cache = self._decomp_cache
+
+        with Pool(processes=n_cpus, initializer=_init_decompose_worker,
+                  initargs=(self.config,)) as pool:
+            # Initialize PartitionSynthesisResult for each multi-qubit partition
+            results_map = {}
+            for partition_idx, meta in enumerate(partition_meta):
+                if meta is None:
+                    continue
+                results_map[partition_idx] = PartitionSynthesisResult(
+                    meta['N'], meta['mini_topologies'], meta['involved_qbits'],
+                    meta['qbit_map'],
+                )
+
+            # ---- Stage 1: sweep all boundary permutations for small partitions ----
+            # For N<=3 the full (P_i, P_o) space is at most 36 pairs.  Routing
+            # needs that complete boundary-state set; otherwise 3q partitions
+            # expose less layout freedom than 2q partitions.
+            stage1_futures = []
+            stage1_cached = []
+            known_pairs = {}
+            full_enum_keys = set()  # (partition_idx, topology_idx) fully covered in S1
+
+            for partition_idx, meta in enumerate(partition_meta):
+                if meta is None:
+                    continue
+                N = meta['N']
+                perms_all = list(permutations(range(N)))
+                for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
+                    if N <= 3:
+                        full_enum_keys.add((partition_idx, topology_idx))
+                        po_sweep = perms_all
+                    else:
+                        po_sweep = [perms_all[np.random.choice(len(perms_all))]]
+                    for P_o in po_sweep:
+                        for P_i in perms_all:
+                            Umtx = self._build_permuted_unitary(meta, P_i, P_o)
+                            ck = self._cache_key(Umtx, mini_topology)
+                            if ck in decomp_cache:
+                                stage1_cached.append((partition_idx, topology_idx, P_i, P_o, ck))
+                            else:
+                                future = pool.apply_async(
+                                    _decompose_one, (Umtx, mini_topology)
+                                )
+                                stage1_futures.append((partition_idx, topology_idx, P_i, P_o, ck, future))
+
+            # Process Stage 1 cache hits immediately
+            for partition_idx, topology_idx, P_i, P_o, ck in stage1_cached:
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                mini_topology = meta['mini_topologies'][topology_idx]
+                synth_circuit, synth_params, synth_err = decomp_cache[ck]
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (P_i, P_o),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
+
+            # Collect Stage 1 pool results
+            cache_hits_s1 = len(stage1_cached)
+            for partition_idx, topology_idx, P_i, P_o, ck, future in tqdm(
+                stage1_futures, desc=f"Stage 1 Synthesis ({cache_hits_s1} cached)",
+                disable=disable_pbar
+            ):
+                synth_circuit, synth_params, synth_err = future.get()
+                decomp_cache[ck] = (synth_circuit, synth_params, synth_err)
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                mini_topology = meta['mini_topologies'][topology_idx]
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (P_i, P_o),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
+
+            # ---- Stage 2: fix top-k P_i from Stage 1, sweep all P_o ----
+            # Skipped for partitions already fully enumerated in Stage 1
+            # (currently all N<=3 partitions).
+            top_k_pi = self.config.get('top_k_pi', 1)
+            stage2_futures = []
+            stage2_cached = []
+
+            for partition_idx, meta in enumerate(partition_meta):
+                if meta is None:
+                    continue
+                N = meta['N']
+                perms_all = list(permutations(range(N)))
+                result = results_map[partition_idx]
+                for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
+                    if (partition_idx, topology_idx) in full_enum_keys:
+                        continue
+                    pair_key = (partition_idx, topology_idx)
+                    kp = known_pairs.get(pair_key, set()) if use_auts else set()
+                    for P_i_cand in result.get_top_k_results(topology_idx, top_k_pi):
+                        for P_o in perms_all:
+                            if use_auts and (tuple(P_i_cand), P_o) in kp:
+                                continue
+                            Umtx = self._build_permuted_unitary(meta, P_i_cand, P_o)
+                            ck = self._cache_key(Umtx, mini_topology)
+                            if ck in decomp_cache:
+                                stage2_cached.append((partition_idx, topology_idx, P_i_cand, P_o, ck))
+                            else:
+                                future = pool.apply_async(
+                                    _decompose_one, (Umtx, mini_topology)
+                                )
+                                stage2_futures.append((partition_idx, topology_idx, P_i_cand, P_o, ck, future))
+
+            # Process Stage 2 cache hits
+            for partition_idx, topology_idx, P_i_cand, P_o, ck in stage2_cached:
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                mini_topology = meta['mini_topologies'][topology_idx]
+                synth_circuit, synth_params, synth_err = decomp_cache[ck]
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (tuple(P_i_cand), P_o),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
+
+            # Collect Stage 2 pool results
+            cache_hits_s2 = len(stage2_cached)
+            for partition_idx, topology_idx, P_i_cand, P_o, ck, future in tqdm(
+                stage2_futures, desc=f"Stage 2 Synthesis ({cache_hits_s2} cached)",
+                disable=disable_pbar
+            ):
+                synth_circuit, synth_params, synth_err = future.get()
+                decomp_cache[ck] = (synth_circuit, synth_params, synth_err)
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                mini_topology = meta['mini_topologies'][topology_idx]
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (tuple(P_i_cand), P_o),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
+
+        # Qiskit routing fallback: for any (partition, topology) pair where all
+        # synthesis attempts failed (no results stored), route the original circuit
+        # with Qiskit and add the result with identity P_i/P_o permutations.
+        qiskit_fallback_cache = {}
+        for partition_idx, meta in enumerate(partition_meta):
+            if meta is None:
+                continue
+            N = meta['N']
+            for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
+                if results_map[partition_idx].permutations_pairs[topology_idx]:
+                    continue
+                fkey = (partition_idx, topology_idx)
+                if fkey not in qiskit_fallback_cache:
+                    fb_circuit, fb_params = self._qiskit_routing_fallback(meta, mini_topology)
+                    qiskit_fallback_cache[fkey] = (fb_circuit, fb_params)
+                fb_circuit, fb_params = qiskit_fallback_cache[fkey]
+                if fb_circuit is None:
+                    logging.warning(
+                        "Partition %d topology_idx %d: synthesis failed and Qiskit "
+                        "fallback unavailable; no result for this combination.",
+                        partition_idx, topology_idx,
+                    )
+                    continue
+                identity = tuple(range(N))
+                results_map[partition_idx].add_result(
+                    (identity, identity), fb_circuit, fb_params, topology_idx
+                )
+
+        return results_map
+
+    # ------------------------------------------------------------------------
+    # Main Public API
+    # ------------------------------------------------------------------------
+    def _run_single_layout_trial(
+        self,
+        trial_idx,
+        seeded_pi,
+        DAG,
+        IDAG,
+        layout_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        n_iterations,
+        n_trials,
+        random_seed,
+    ):
+        """Run one layout trial and return ``(cost, pi)``.
+
+        The trial starts from ``_sample_initial_layout`` and then performs
+        ``n_iterations`` rounds of layout-only heuristic search.  Every round
+        does a reverse pass (starting from the final layer); every round except
+        the last one also does a forward pass (starting from the initial
+        layer).  A final forward pass on a copy of the layout produces the
+        cost that is returned next to the layout itself.
+
+        Args:
+            trial_idx: Index of this trial; added to ``random_seed`` to seed
+                the per-trial random state.
+            seeded_pi: Seed layout; its length is used as the qubit count.
+            DAG, IDAG: Partition dependency graphs, handed to the search.
+            layout_partitions, scoring_partitions, D, candidate_cache:
+                Forwarded unchanged to ``_heuristic_search_layout_only``.
+            n_iterations: Number of reverse(/forward) rounds.
+            n_trials: Total trial count; no random state is created unless it
+                is greater than one.
+            random_seed: Base seed for the per-trial random state.
+
+        Returns:
+            Tuple ``(cost, pi)``: the cost reported by the evaluation pass and
+            the layout obtained after the last round.
+        """
+        qubit_count = len(seeded_pi)
+
+        # Only multi-trial runs get a random state; a lone trial passes None.
+        rng = None
+        if n_trials > 1:
+            rng = np.random.RandomState(random_seed + trial_idx)
+
+        pi = self._sample_initial_layout(trial_idx, n_trials, seeded_pi, rng)
+
+        last_round = n_iterations - 1
+        for round_idx in range(n_iterations):
+            # Reverse pass: note that DAG and IDAG swap roles.
+            reverse_front = self.get_final_layer(
+                DAG, qubit_count, layout_partitions
+            )
+            pi, _ = self._heuristic_search_layout_only(
+                reverse_front,
+                pi,
+                IDAG,
+                DAG,
+                layout_partitions,
+                scoring_partitions,
+                D,
+                rng=rng,
+                reverse=True,
+                candidate_cache=candidate_cache,
+            )
+
+            # The last round stops after its reverse pass.
+            if round_idx >= last_round:
+                continue
+
+            forward_front = self.get_initial_layer(
+                IDAG, qubit_count, layout_partitions
+            )
+            pi, _ = self._heuristic_search_layout_only(
+                forward_front,
+                pi,
+                DAG,
+                IDAG,
+                layout_partitions,
+                scoring_partitions,
+                D,
+                rng=rng,
+                candidate_cache=candidate_cache,
+            )
+
+        # Evaluation pass: runs on a copy of the layout and without a random
+        # state; only its cost is kept.
+        eval_front = self.get_initial_layer(IDAG, qubit_count, layout_partitions)
+        _, cost = self._heuristic_search_layout_only(
+            eval_front,
+            pi.copy(),
+            DAG,
+            IDAG,
+            layout_partitions,
+            scoring_partitions,
+            D,
+            rng=None,
+            candidate_cache=candidate_cache,
+        )
+        return cost, pi
+
+
+    def _run_layout_trials(
+        self,
+        seeded_pi,
+        DAG,
+        IDAG,
+        layout_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        n_iterations,
+        n_trials,
+        random_seed,
+    ):
+        """Run all layout trials and return their results as a list.
+
+        Dispatch order:
+
+        1. If ``config['use_cpp_router']`` is truthy (default ``True``) the
+           work is delegated to ``_run_layout_trials_cpp`` and its result is
+           returned as-is.
+        2. Otherwise the trials run through ``_run_single_layout_trial``,
+           either serially or, when ``config['parallel_layout_trials']`` is
+           set and there is more than one trial, in a process pool.
+
+        At least one trial is always run, even if ``n_trials`` is below one.
+
+        Returns:
+            For the Python paths, a list with one ``(cost, pi)`` entry per
+            trial in trial-index order; for the C++ path, whatever
+            ``_run_layout_trials_cpp`` returns.
+        """
+        if self.config.get('use_cpp_router', True):
+            return self._run_layout_trials_cpp(
+                seeded_pi, DAG, IDAG, layout_partitions,
+                scoring_partitions, D, candidate_cache,
+                n_iterations, n_trials, random_seed,
+            )
+
+        trial_count = max(1, n_trials)
+        trial_ids = list(range(trial_count))
+
+        run_serially = (
+            trial_count <= 1
+            or not self.config.get("parallel_layout_trials", False)
+        )
+        if run_serially:
+            # Arguments shared by every trial; only trial_idx varies.
+            shared_kwargs = {
+                "seeded_pi": seeded_pi,
+                "DAG": DAG,
+                "IDAG": IDAG,
+                "layout_partitions": layout_partitions,
+                "scoring_partitions": scoring_partitions,
+                "D": D,
+                "candidate_cache": candidate_cache,
+                "n_iterations": n_iterations,
+                "n_trials": n_trials,
+                "random_seed": random_seed,
+            }
+            outcomes = []
+            for trial_id in trial_ids:
+                outcomes.append(
+                    self._run_single_layout_trial(
+                        trial_idx=trial_id, **shared_kwargs
+                    )
+                )
+            return outcomes
+
+        # A non-positive worker setting means "pick automatically".
+        pool_size = self.config.get("layout_trial_workers", 0)
+        if pool_size <= 0:
+            pool_size = min(trial_count, _available_cpus())
+
+        # State handed once to each worker process through the initializer.
+        state_for_workers = {
+            "config": dict(self.config),
+            "adj": tuple(tuple(neighbors) for neighbors in self._adj),
+            "seeded_pi": np.asarray(seeded_pi),
+            "DAG": DAG,
+            "IDAG": IDAG,
+            "layout_partitions": layout_partitions,
+            "scoring_partitions": scoring_partitions,
+            "D": np.asarray(D),
+            "candidate_cache": candidate_cache,
+            "n_iterations": n_iterations,
+            "n_trials": n_trials,
+            "random_seed": random_seed,
+        }
+
+        with Pool(
+            processes=pool_size,
+            initializer=_init_layout_trial_worker,
+            initargs=(state_for_workers,),
+        ) as worker_pool:
+            return worker_pool.map(_run_layout_trial_worker, trial_ids)
+
+    def _run_layout_trials_cpp(
+        self,
+        seeded_pi,
+        DAG,
+        IDAG,
+        layout_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        n_iterations,
+        n_trials,
+        random_seed,
+    ):
+        """Run the layout trials with the C++ ``SabreRouter`` and rank them.
+
+        Steps performed:
+
+        1. Build two ``SabreConfig`` objects from ``self.config``: one for the
+           layout trials and one for the actual routing.  They differ only in
+           the boundary beam width/depth; the layout values fall back to the
+           routing values when missing or ``None``.
+        2. Call ``run_trial`` once per trial index (serially, or in a thread
+           pool when ``config['parallel_layout_trials']`` is set and there is
+           more than one trial).
+        3. Sort the trials by their first element (the heuristic cost) and
+           route the best ``actual_routing_rank_top_k`` of them with
+           ``route_forward``.
+
+        Side effects: sets ``_routing_layout_boundary_beam``,
+        ``_routing_boundary_beam``, ``_routing_layout_trial_workers``,
+        ``_routing_layout_trials_time``, ``_routing_actual_rank_count``,
+        ``_routing_actual_rank_workers`` and ``_routing_actual_rank_time``
+        on ``self``.
+
+        Returns:
+            A list of 5-tuples
+            ``(actual_cnot, pi_out, heuristic_cost, pi_init, steps)``.  Routed
+            trials come first, sorted by ``(actual_cnot, heuristic_cost)``;
+            trials that were not routed follow as
+            ``(inf, pi, heuristic_cost, None, None)``.
+        """
+        from squander.synthesis._sabre_router import SabreRouter, SabreConfig
+
+        route_beam_width = self.config.get('boundary_beam_width', 1)
+        route_beam_depth = self.config.get('boundary_beam_depth', 1)
+        layout_beam_width = self.config.get(
+            'layout_boundary_beam_width', route_beam_width
+        )
+        layout_beam_depth = self.config.get(
+            'layout_boundary_beam_depth', route_beam_depth
+        )
+        # An explicit None in the config also means "same as routing".
+        if layout_beam_width is None:
+            layout_beam_width = route_beam_width
+        if layout_beam_depth is None:
+            layout_beam_depth = route_beam_depth
+
+        def make_cpp_config(beam_width, beam_depth):
+            # The hasattr guards skip fields that the SabreConfig object does
+            # not expose (presumably older builds of the extension -- confirm).
+            cfg = SabreConfig()
+            cfg.prefilter_top_k = self.config.get('prefilter_top_k', 50)
+            if hasattr(cfg, 'prefilter_min_per_partition'):
+                cfg.prefilter_min_per_partition = self.config.get(
+                    'prefilter_min_per_partition', 2
+                )
+            if hasattr(cfg, 'prefilter_min_3q'):
+                cfg.prefilter_min_3q = self.config.get('prefilter_min_3q', 12)
+            cfg.max_E_size = self.config.get('max_E_size', 20)
+            cfg.max_lookahead = self.config.get('max_lookahead', 4)
+            cfg.E_weight = self.config.get('E_weight', 0.5)
+            cfg.E_alpha = self.config.get('E_alpha', 1.0)
+            cfg.cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
+            cfg.sabre_iterations = n_iterations
+            cfg.n_layout_trials = max(1, n_trials)
+            cfg.random_seed = random_seed
+            cfg.decay_delta = self.config.get('decay_delta', 0.001)
+            cfg.swap_burst_budget = self.config.get('swap_burst_budget', 5)
+            cfg.path_tiebreak_weight = self.config.get(
+                'path_tiebreak_weight', 0.2
+            )
+            if hasattr(cfg, 'three_qubit_exit_weight'):
+                cfg.three_qubit_exit_weight = self.config.get(
+                    'three_qubit_exit_weight', 1.0
+                )
+            if hasattr(cfg, 'boundary_beam_width'):
+                cfg.boundary_beam_width = beam_width
+            if hasattr(cfg, 'boundary_beam_depth'):
+                cfg.boundary_beam_depth = beam_depth
+            return cfg
+
+        layout_cfg = make_cpp_config(layout_beam_width, layout_beam_depth)
+        route_cfg = make_cpp_config(route_beam_width, route_beam_depth)
+        # A second router is only needed when the two beam settings differ.
+        use_distinct_route_router = (
+            layout_beam_width != route_beam_width
+            or layout_beam_depth != route_beam_depth
+        )
+        self._routing_layout_boundary_beam = (
+            int(layout_beam_width),
+            int(layout_beam_depth),
+        )
+        self._routing_boundary_beam = (
+            int(route_beam_width),
+            int(route_beam_depth),
+        )
+        canonical_fwd = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=False
+        )
+        canonical_rev = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=True
+        )
+
+        # Convert candidate_cache: list of tuples -> list of lists
+        candidate_cache_lists = [list(cands) for cands in candidate_cache]
+
+        # Convert layout_partitions: list of dicts with tuple involved_qbits
+        layout_partitions_lists = [
+            {'is_single': lp['is_single'], 'involved_qbits': list(lp['involved_qbits'])}
+            for lp in layout_partitions
+        ]
+
+        trial_router = SabreRouter(
+            layout_cfg, D, self._adj, DAG, IDAG,
+            candidate_cache_lists, layout_partitions_lists,
+            canonical_fwd, canonical_rev,
+        )
+        # `router` performs the final routing; it is the trial router itself
+        # unless the routing beam settings differ from the layout ones.
+        router = trial_router
+        if use_distinct_route_router:
+            router = SabreRouter(
+                route_cfg, D, self._adj, DAG, IDAG,
+                candidate_cache_lists, layout_partitions_lists,
+                canonical_fwd, canonical_rev,
+            )
+
+        seeded_pi_list = [int(x) for x in seeded_pi]
+        n_trials_actual = max(1, n_trials)
+        trial_indices = list(range(n_trials_actual))
+
+        use_parallel = (
+            self.config.get("parallel_layout_trials", False)
+            and n_trials_actual > 1
+        )
+
+        if not use_parallel:
+            self._routing_layout_trial_workers = 1
+            layout_trials_t0 = time.time()
+            trial_results = [
+                trial_router.run_trial(
+                    idx, seeded_pi_list, n_iterations, n_trials_actual
+                )
+                for idx in trial_indices
+            ]
+        else:
+            # NOTE(review): one router object is shared by all threads; this
+            # assumes run_trial is safe to call concurrently -- confirm.
+            from concurrent.futures import ThreadPoolExecutor
+            workers = self.config.get("layout_trial_workers", 0)
+            if workers <= 0:
+                workers = min(n_trials_actual, _available_cpus())
+
+            self._routing_layout_trial_workers = workers
+            layout_trials_t0 = time.time()
+            with ThreadPoolExecutor(max_workers=workers) as pool:
+                futures = [
+                    pool.submit(
+                        trial_router.run_trial,
+                        idx,
+                        seeded_pi_list,
+                        n_iterations,
+                        n_trials_actual,
+                    )
+                    for idx in trial_indices
+                ]
+                # Collect in submission order so results follow trial index.
+                trial_results = [f.result() for f in futures]
+        self._routing_layout_trials_time = time.time() - layout_trials_t0
+
+        # Each trial result is unpacked below as (heuristic_cost, pi).
+        heuristic_ranked = sorted(trial_results, key=lambda x: x[0])
+        actual_rank_default = min(
+            max(1, self.config.get("cleanup_top_k", 3) * 2),
+            n_trials_actual,
+        )
+        actual_rank_top_k = self.config.get(
+            "actual_routing_rank_top_k", actual_rank_default
+        )
+        # None or a non-positive value means "route every trial".
+        if actual_rank_top_k is None or actual_rank_top_k <= 0:
+            actual_rank_top_k = len(heuristic_ranked)
+        actual_rank_top_k = min(int(actual_rank_top_k), len(heuristic_ranked))
+
+        actual_rank_inputs = heuristic_ranked[:actual_rank_top_k]
+        self._routing_actual_rank_count = len(actual_rank_inputs)
+
+        def route_rank_input(item):
+            # Route one trial layout and keep its heuristic cost alongside.
+            heuristic_cost, trial_pi = item
+            actual_cnot, pi_out, pi_init, steps = router.route_forward(
+                [int(x) for x in trial_pi]
+            )
+            return (actual_cnot, pi_out, heuristic_cost, pi_init, steps)
+
+        use_parallel_actual_routing = (
+            self.config.get("parallel_layout_trials", False)
+            and len(actual_rank_inputs) > 1
+        )
+        actual_rank_t0 = time.time()
+        if use_parallel_actual_routing:
+            from concurrent.futures import ThreadPoolExecutor
+            workers = self.config.get("layout_trial_workers", 0)
+            if workers <= 0:
+                workers = min(len(actual_rank_inputs), _available_cpus())
+
+            self._routing_actual_rank_workers = workers
+            with ThreadPoolExecutor(max_workers=workers) as pool:
+                futures = [
+                    pool.submit(route_rank_input, item)
+                    for item in actual_rank_inputs
+                ]
+                ranked = [f.result() for f in futures]
+        else:
+            self._routing_actual_rank_workers = 1
+            ranked = [
+                route_rank_input(item)
+                for item in actual_rank_inputs
+            ]
+        self._routing_actual_rank_time = time.time() - actual_rank_t0
+
+        # Primary key: actual CNOT count; tie-break: heuristic cost.
+        ranked.sort(key=lambda x: (x[0], x[2]))
+        # Trials that were not routed are appended with an infinite CNOT count
+        # and no routing steps, keeping their heuristic order.
+        ranked.extend(
+            (float("inf"), pi, cost, None, None)
+            for cost, pi in heuristic_ranked[actual_rank_top_k:]
+        )
+        return ranked
+
+    @staticmethod
+    def _snapshot_single_qubit_circuits(optimized_partitions):
+        """Return copies of the circuits of all single-qubit partitions.
+
+        The result maps the position of each ``SingleQubitPartitionResult``
+        in ``optimized_partitions`` to ``partition.circuit.copy()``; every
+        other entry is ignored.
+        """
+        snapshot = {}
+        for position, partition in enumerate(optimized_partitions):
+            if not isinstance(partition, SingleQubitPartitionResult):
+                continue
+            snapshot[position] = partition.circuit.copy()
+        return snapshot
+
+    @staticmethod
+    def _restore_single_qubit_circuits(optimized_partitions, saved_circuits):
+        """Put previously snapshotted circuits back onto their partitions.
+
+        For every ``index -> circuit`` pair in ``saved_circuits`` a fresh copy
+        of the circuit is assigned to ``optimized_partitions[index].circuit``,
+        so the snapshot itself stays untouched and can be reused.
+        """
+        for position in saved_circuits:
+            pristine = saved_circuits[position]
+            optimized_partitions[position].circuit = pristine.copy()
+
+    @staticmethod
+    def _partition_order_cnot_breakdown(partition_order):
+        """Split the CNOT count of a partition order by origin.
+
+        ``Circuit`` entries contribute the ``'CNOT'`` value of their
+        ``get_Gate_Nums()`` to the routing total.  ``SingleQubitPartitionResult``
+        entries contribute nothing.  Every other entry contributes its
+        ``cnot_count`` attribute (0 when absent) to the partition total.
+
+        Returns:
+            Tuple ``(routing_cnot, partition_cnot)``.
+        """
+        routing_total = 0
+        body_total = 0
+        for entry in partition_order:
+            if isinstance(entry, Circuit):
+                gate_counts = entry.get_Gate_Nums()
+                routing_total += gate_counts.get('CNOT', 0)
+            elif not isinstance(entry, SingleQubitPartitionResult):
+                body_total += int(getattr(entry, 'cnot_count', 0))
+        return routing_total, body_total
+
+    def _partition_order_from_cpp_steps(
+        self, steps, optimized_partitions, candidate_cache, N, pi_initial=None
+    ):
+        """Translate C++ router steps into a Python partition order.
+
+        Each step is a sequence whose first element names its kind:
+
+        * ``("swap", swaps)`` -- ``swaps`` is an iterable of ``(u, v)`` pairs;
+          a non-empty list becomes one swap circuit in the output.
+        * ``("partition", partition_idx, candidate_idx)`` -- selects
+          ``candidate_cache[partition_idx][candidate_idx]``.
+        * ``("single", partition_idx, physical_qubit)`` -- a single-qubit
+          partition that is remapped onto ``physical_qubit``.
+
+        Steps of any other kind are ignored.
+
+        When ``pi_initial`` is given, the layout is tracked through the steps
+        so that layout-transparent candidates can be wrapped with a node
+        mapping derived from the current layout.  Without it every candidate
+        is appended unchanged.
+
+        Note:
+            "single" steps replace ``part.circuit`` on the object stored in
+            ``optimized_partitions`` (in-place mutation of the caller's data).
+
+        Returns:
+            List of partition-order entries in step order.
+        """
+        partition_order = []
+        # Tracked layout; stays None (tracking disabled) without pi_initial.
+        pi = [int(x) for x in pi_initial] if pi_initial is not None else None
+        for step in steps:
+            kind = step[0]
+            if kind == "swap":
+                swaps = [(int(u), int(v)) for u, v in step[1]]
+                # Empty swap lists produce no circuit at all.
+                if swaps:
+                    partition_order.append(construct_swap_circuit(swaps, N))
+                    if pi is not None:
+                        pi = self._apply_swaps_to_pi(pi, swaps)
+            elif kind == "partition":
+                partition_idx = int(step[1])
+                candidate_idx = int(step[2])
+                candidate = candidate_cache[partition_idx][candidate_idx]
+                if (
+                    pi is not None
+                    and self._candidate_is_layout_transparent(candidate)
+                ):
+                    # Layout-transparent candidate: its node mapping is derived
+                    # from the current layout instead of the candidate's own.
+                    node_mapping = self._zero_cnot_dynamic_node_mapping(
+                        pi, candidate
+                    )
+                    partition_order.append(
+                        _DynamicMappedPartitionCandidate(
+                            candidate, node_mapping
+                        )
+                    )
+                    pi = self._apply_zero_cnot_candidate_exit_to_pi(
+                        pi, candidate, node_mapping
+                    )
+                else:
+                    partition_order.append(candidate)
+                    if pi is not None:
+                        pi = self._apply_candidate_exit_to_pi(pi, candidate)
+            elif kind == "single":
+                partition_idx = int(step[1])
+                physical_qubit = int(step[2])
+                part = optimized_partitions[partition_idx]
+                # Move the circuit from its first qubit onto the physical
+                # qubit chosen by the router; this overwrites part.circuit.
+                circuit_qubit = int(part.circuit.get_Qbits()[0])
+                part.circuit = part.circuit.Remap_Qbits(
+                    {circuit_qubit: physical_qubit}, N
+                )
+                partition_order.append(part)
+        return partition_order
+
+    @staticmethod
+    def _csv_list(values):
+        """Format ``values`` as integers joined by single spaces.
+
+        Used for list-valued CSV cells; an empty iterable yields ``""``.
+        """
+        as_ints = map(int, values)
+        return " ".join(map(str, as_ints))
+
+    @staticmethod
+    def _csv_edges(edges):
+        """Format ``(u, v)`` pairs as ``"u-v"`` tokens joined by spaces.
+
+        Both endpoints are converted with ``int``; an empty iterable yields
+        ``""``.
+        """
+        tokens = []
+        for u, v in edges:
+            tokens.append(f"{int(u)}-{int(v)}")
+        return " ".join(tokens)
+
+    @staticmethod
+    def _candidate_physical_nodes(candidate):
+        """Return the sorted physical nodes used by ``candidate``.
+
+        The nodes are the endpoints of ``candidate.topology``.  When the
+        topology has no edges, the values of ``candidate.node_mapping`` are
+        used instead.
+        """
+        touched = {
+            int(endpoint)
+            for u, v in candidate.topology
+            for endpoint in (u, v)
+        }
+        if not touched:
+            # Edge-less candidate: fall back to the mapped nodes.
+            touched = {int(node) for node in candidate.node_mapping.values()}
+        return sorted(touched)
+
+    @staticmethod
+    def _candidate_has_multi_qubit_body(candidate):
+        """Tell whether ``candidate.circuit_structure`` exists and is non-empty."""
+        structure = getattr(candidate, "circuit_structure", ())
+        return True if structure else False
+
+    @staticmethod
+    def _candidate_is_layout_transparent(candidate):
+        """Tell whether ``candidate`` has no multi-qubit body.
+
+        This is the exact negation of ``_candidate_has_multi_qubit_body``.
+        """
+        has_body = qgd_Partition_Aware_Mapping._candidate_has_multi_qubit_body(
+            candidate
+        )
+        return not has_body
+
+    @staticmethod
+    def _apply_candidate_exit_to_pi(pi, candidate):
+        """Return a copy of ``pi`` updated with the exit layout of ``candidate``.
+
+        For every position ``q_star`` of ``candidate.P_o`` that is the image
+        of a logical qubit under ``candidate.qbit_map``, that logical qubit is
+        placed on ``candidate.node_mapping[candidate.P_o[q_star]]``.  All
+        other entries keep their (int-converted) value.  ``pi`` itself is not
+        modified.
+        """
+        updated = [int(node) for node in pi]
+        # q* -> logical qubit (inverse of qbit_map).
+        logical_of = dict(
+            (q_star, logical_q)
+            for logical_q, q_star in candidate.qbit_map.items()
+        )
+        for q_star, exit_slot in enumerate(candidate.P_o):
+            if q_star not in logical_of:
+                continue
+            updated[logical_of[q_star]] = candidate.node_mapping[exit_slot]
+        return updated
+
+    @staticmethod
+    def _zero_cnot_dynamic_node_mapping(pi, candidate):
+        """Derive a node mapping for ``candidate`` from the layout ``pi``.
+
+        For each ``logical_q -> q_star`` pair of ``candidate.qbit_map`` the
+        key is the position of ``q_star`` inside ``candidate.P_i`` and the
+        value is ``int(pi[logical_q])``.
+
+        Returns:
+            Dict from those positions to the nodes currently holding the
+            corresponding logical qubits.
+        """
+        entry_perm = candidate.P_i
+        # Position of each value inside P_i, i.e. the inverse permutation.
+        position_of = [entry_perm.index(value) for value in range(len(entry_perm))]
+        return {
+            position_of[q_star]: int(pi[int(logical_q)])
+            for logical_q, q_star in candidate.qbit_map.items()
+        }
+
+    @staticmethod
+    def _apply_zero_cnot_candidate_exit_to_pi(pi, candidate, node_mapping):
+        """Return a copy of ``pi`` updated through an explicit ``node_mapping``.
+
+        Works like ``_apply_candidate_exit_to_pi`` but looks the target node
+        up in the supplied ``node_mapping`` rather than in
+        ``candidate.node_mapping``.  ``pi`` itself is not modified.
+        """
+        updated = [int(node) for node in pi]
+        # q* -> logical qubit (inverse of qbit_map).
+        logical_of = dict(
+            (q_star, logical_q)
+            for logical_q, q_star in candidate.qbit_map.items()
+        )
+        for q_star, exit_slot in enumerate(candidate.P_o):
+            if q_star not in logical_of:
+                continue
+            updated[logical_of[q_star]] = node_mapping[exit_slot]
+        return updated
+
+    @staticmethod
+    def _immediate_multi_successors(partition_idx, DAG, layout_partitions):
+        """Find the nearest non-single descendants of a partition.
+
+        Walks ``DAG`` breadth-first from the children of ``partition_idx``.
+        Partitions flagged ``"is_single"`` are passed through (their children
+        are explored); any other partition is reported and not expanded.
+
+        Returns:
+            List of partition indices in breadth-first discovery order,
+            without duplicates.
+        """
+        worklist = list(DAG[partition_idx])
+        visited = set()
+        found = []
+        cursor = 0
+        # The worklist grows while it is scanned; the cursor keeps FIFO order.
+        while cursor < len(worklist):
+            node = worklist[cursor]
+            cursor += 1
+            if node in visited:
+                continue
+            visited.add(node)
+            if not layout_partitions[node]["is_single"]:
+                found.append(node)
+                continue
+            worklist.extend(DAG[node])
+        return found
+
+    @staticmethod
+    def _support_overlap_summary(partition_idx, successors, layout_partitions):
+        """Summarize how a partition's qubits overlap with its successors.
+
+        For each successor, *overlap* is the number of shared
+        ``"involved_qbits"`` and *turnover* is the size of the smaller of the
+        two qubit sets minus the overlap.
+
+        Returns:
+            Tuple ``(max_overlap, min_turnover, summary)`` where ``summary``
+            is a space-separated list of ``"child:overlap/turnover"`` tokens.
+            With no successors the result is ``(0, 0, "")``.
+        """
+        own_qubits = set(layout_partitions[partition_idx]["involved_qbits"])
+        overlaps = []
+        turnovers = []
+        tokens = []
+        for successor in successors:
+            other_qubits = set(layout_partitions[successor]["involved_qbits"])
+            shared = len(own_qubits & other_qubits)
+            changed = min(len(own_qubits), len(other_qubits)) - shared
+            overlaps.append(shared)
+            turnovers.append(changed)
+            tokens.append(f"{successor}:{shared}/{changed}")
+        best_overlap = max(overlaps, default=0)
+        least_turnover = min(turnovers, default=0)
+        return best_overlap, least_turnover, " ".join(tokens)
+
+    @staticmethod
+    def _eligible_multi_frontier(resolved, IDAG, layout_partitions):
+        """List the non-single partitions that are ready to be scheduled.
+
+        A partition is ready when it is not yet resolved, is not flagged
+        ``"is_single"``, and every parent listed in ``IDAG`` is resolved.
+
+        Returns:
+            Partition indices in ascending order.
+        """
+        return [
+            idx
+            for idx, info in enumerate(layout_partitions)
+            if not (resolved[idx] or info["is_single"])
+            and all(resolved[parent] for parent in IDAG[idx])
+        ]
+
+    def _write_cpp_routing_trace(
+        self,
+        trace_path,
+        steps,
+        pi_initial,
+        candidate_cache,
+        layout_partitions,
+        DAG,
+        IDAG,
+        N,
+    ):
+        """Write a CSV trace for the final selected C++ route.
+
+        Replays ``steps`` starting from the layout ``pi_initial`` and writes
+        one CSV row per "single" and per "partition" step.  "swap" steps do
+        not get rows of their own: their swaps are collected and reported on
+        the next "partition" row.  Steps of any other kind are ignored.
+
+        Nothing happens when ``trace_path`` is empty or ``None``.  Otherwise
+        the parent directory is created if needed, the file is overwritten,
+        and ``self._routing_trace_path`` is set to ``trace_path``.
+
+        Args:
+            trace_path: Destination CSV file.
+            steps: Router steps; the first element of each step is its kind
+                ("swap", "single" or "partition").
+            pi_initial: Layout before the first step, indexed by logical qubit.
+            candidate_cache: Indexed as ``[partition_idx][candidate_idx]``.
+            layout_partitions: Per-partition dicts with ``"is_single"`` and
+                ``"involved_qbits"``.
+            DAG, IDAG: Partition dependency graphs used for the successor and
+                frontier columns.
+            N: Not used by this method.
+        """
+        if not trace_path:
+            return
+
+        trace_dir = os.path.dirname(os.path.abspath(trace_path))
+        if trace_dir:
+            os.makedirs(trace_dir, exist_ok=True)
+
+        pi = [int(x) for x in pi_initial]
+        resolved = [False] * len(layout_partitions)
+        # Swaps seen since the last "partition" row.
+        pending_swaps = []
+        cumulative_swaps = 0
+        cumulative_body_cnot = 0
+        rows = []
+
+        for route_step_idx, step in enumerate(steps):
+            kind = step[0]
+            if kind == "swap":
+                swaps = [(int(u), int(v)) for u, v in step[1]]
+                if swaps:
+                    pending_swaps.extend(swaps)
+                    pi = self._apply_swaps_to_pi(pi, swaps)
+                continue
+
+            if kind == "single":
+                partition_idx = int(step[1])
+                logical_qubits = tuple(
+                    layout_partitions[partition_idx]["involved_qbits"]
+                )
+                physical_qubit = int(step[2])
+                # Marked resolved before the frontier is measured, so this
+                # row's frontier_size already reflects the step.
+                resolved[partition_idx] = True
+                # Single rows never consume pending swaps: they report zero
+                # swaps and leave the cumulative counters unchanged.
+                rows.append({
+                    "row": len(rows),
+                    "route_step": route_step_idx,
+                    "kind": "single",
+                    "partition_idx": partition_idx,
+                    "candidate_idx": "",
+                    "topology_idx": "",
+                    "permutation_idx": "",
+                    "logical_qubits": self._csv_list(logical_qubits),
+                    "physical_nodes": str(physical_qubit),
+                    "topology_edges": "",
+                    "entry_layout": self._csv_list(
+                        pi[q] for q in logical_qubits
+                    ),
+                    "exit_layout": self._csv_list(
+                        pi[q] for q in logical_qubits
+                    ),
+                    "swap_count": 0,
+                    "routing_cnot": 0,
+                    "body_cnot": 0,
+                    "cumulative_swap_count": cumulative_swaps,
+                    "cumulative_routing_cnot": 3 * cumulative_swaps,
+                    "cumulative_body_cnot": cumulative_body_cnot,
+                    "frontier_size": len(
+                        self._eligible_multi_frontier(
+                            resolved, IDAG, layout_partitions
+                        )
+                    ),
+                    "successor_count": 0,
+                    "max_successor_overlap": 0,
+                    "min_successor_turnover": 0,
+                    "successor_overlap": "",
+                    "swaps": "",
+                })
+                continue
+
+            if kind != "partition":
+                continue
+
+            partition_idx = int(step[1])
+            candidate_idx = int(step[2])
+            candidate = candidate_cache[partition_idx][candidate_idx]
+            logical_qubits = tuple(int(q) for q in candidate.involved_qbits)
+            entry_layout = [int(pi[q]) for q in logical_qubits]
+            if self._candidate_is_layout_transparent(candidate):
+                # Layout-transparent candidate: nodes come from the current
+                # layout and no topology edges are reported.
+                dynamic_node_mapping = self._zero_cnot_dynamic_node_mapping(
+                    pi, candidate
+                )
+                exit_pi = self._apply_zero_cnot_candidate_exit_to_pi(
+                    pi, candidate, dynamic_node_mapping
+                )
+                physical_nodes = sorted(dynamic_node_mapping.values())
+                topology_edges = ""
+            else:
+                exit_pi = self._apply_candidate_exit_to_pi(pi, candidate)
+                physical_nodes = self._candidate_physical_nodes(candidate)
+                topology_edges = self._csv_edges(candidate.topology)
+            exit_layout = [int(exit_pi[q]) for q in logical_qubits]
+            successors = self._immediate_multi_successors(
+                partition_idx, DAG, layout_partitions
+            )
+            max_overlap, min_turnover, overlap_summary = (
+                self._support_overlap_summary(
+                    partition_idx, successors, layout_partitions
+                )
+            )
+            # Measured before this partition is marked resolved, so the
+            # frontier still includes it (unlike in "single" rows).
+            frontier_size = len(
+                self._eligible_multi_frontier(
+                    resolved, IDAG, layout_partitions
+                )
+            )
+            swap_count = len(pending_swaps)
+            cumulative_swaps += swap_count
+            cumulative_body_cnot += int(candidate.cnot_count)
+            # Routing CNOTs are reported as three per swap.
+            rows.append({
+                "row": len(rows),
+                "route_step": route_step_idx,
+                "kind": "partition",
+                "partition_idx": partition_idx,
+                "candidate_idx": candidate_idx,
+                "topology_idx": int(candidate.topology_idx),
+                "permutation_idx": int(candidate.permutation_idx),
+                "logical_qubits": self._csv_list(logical_qubits),
+                "physical_nodes": self._csv_list(physical_nodes),
+                "topology_edges": topology_edges,
+                "entry_layout": self._csv_list(entry_layout),
+                "exit_layout": self._csv_list(exit_layout),
+                "swap_count": swap_count,
+                "routing_cnot": 3 * swap_count,
+                "body_cnot": int(candidate.cnot_count),
+                "cumulative_swap_count": cumulative_swaps,
+                "cumulative_routing_cnot": 3 * cumulative_swaps,
+                "cumulative_body_cnot": cumulative_body_cnot,
+                "frontier_size": frontier_size,
+                "successor_count": len(successors),
+                "max_successor_overlap": max_overlap,
+                "min_successor_turnover": min_turnover,
+                "successor_overlap": overlap_summary,
+                "swaps": self._csv_edges(pending_swaps),
+            })
+            resolved[partition_idx] = True
+            pi = exit_pi
+            pending_swaps = []
+
+        # NOTE(review): swaps that follow the last "partition" step are still
+        # in pending_swaps here and never appear in any row -- confirm that
+        # this is intended.
+        fieldnames = [
+            "row",
+            "route_step",
+            "kind",
+            "partition_idx",
+            "candidate_idx",
+            "topology_idx",
+            "permutation_idx",
+            "logical_qubits",
+            "physical_nodes",
+            "topology_edges",
+            "entry_layout",
+            "exit_layout",
+            "swap_count",
+            "routing_cnot",
+            "body_cnot",
+            "cumulative_swap_count",
+            "cumulative_routing_cnot",
+            "cumulative_body_cnot",
+            "frontier_size",
+            "successor_count",
+            "max_successor_overlap",
+            "min_successor_turnover",
+            "successor_overlap",
+            "swaps",
+        ]
+        # newline="" lets the csv module control line endings itself.
+        with open(trace_path, "w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(rows)
+        self._routing_trace_path = trace_path
+
+
+    def _rank_layout_trials_by_actual_routing(
+        self,
+        trial_results,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        rank_top_k=None,
+    ):
+        """Reroute a bounded candidate set and rank it by actual CNOT count.
+
+        ``trial_results`` comes in one of two shapes:
+
+        * Entries with five or more elements are treated as already routed;
+          they are only sorted by ``(entry[0], entry[2])`` and returned.
+        * ``(heuristic_cost, pi)`` pairs are sorted by heuristic cost.  The
+          best ``rank_top_k`` of them are routed with ``Heuristic_Search``
+          and measured by the CNOT count of the constructed circuit.
+
+        While routing, ``config["progressbar"]`` is forced to ``False`` and
+        the circuits of single-qubit partitions are restored from a snapshot
+        before every candidate and once more at the end.  The previous
+        ``progressbar`` state is restored exactly afterwards: a key that was
+        absent is removed again, and any existing value (including ``None``)
+        is put back unchanged.
+
+        Args:
+            trial_results: Trial results in one of the shapes above.
+            DAG, IDAG, scoring_partitions, D, candidate_cache: Forwarded to
+                ``Heuristic_Search``.
+            optimized_partitions: Partition list used for routing and circuit
+                construction.
+            rank_top_k: Number of candidates to route; ``None`` or a
+                non-positive value routes all of them.
+
+        Returns:
+            List of ``(actual_cnot, pi, heuristic_cost, None, None)`` tuples
+            sorted by ``(actual_cnot, heuristic_cost)``, followed by the
+            unrouted candidates with ``actual_cnot`` set to ``inf``.
+        """
+        if trial_results and len(trial_results[0]) >= 5:
+            return sorted(trial_results, key=lambda x: (x[0], x[2]))
+        heuristic_ranked = sorted(trial_results, key=lambda x: x[0])
+        if rank_top_k is None or rank_top_k <= 0:
+            rank_top_k = len(heuristic_ranked)
+        rank_top_k = min(int(rank_top_k), len(heuristic_ranked))
+        actual_candidates = heuristic_ranked[:rank_top_k]
+        heuristic_tail = heuristic_ranked[rank_top_k:]
+
+        saved_sq_circuits = self._snapshot_single_qubit_circuits(
+            optimized_partitions
+        )
+        ranked_results = []
+        # A private sentinel distinguishes "key absent" from any stored value
+        # (including None), so the config can be restored exactly.
+        missing = object()
+        old_progressbar = self.config.get("progressbar", missing)
+        self.config["progressbar"] = False
+        try:
+            for heuristic_cost, trial_pi in actual_candidates:
+                # Routing remaps single-qubit circuits in place; start every
+                # candidate from the pristine circuits.
+                self._restore_single_qubit_circuits(
+                    optimized_partitions, saved_sq_circuits
+                )
+                F_trial = self.get_initial_layer(
+                    IDAG, len(trial_pi), optimized_partitions
+                )
+                partition_order, _, _ = self.Heuristic_Search(
+                    F_trial,
+                    np.asarray(trial_pi, dtype=np.int64).copy(),
+                    DAG,
+                    IDAG,
+                    optimized_partitions,
+                    scoring_partitions,
+                    D,
+                    candidate_cache=candidate_cache,
+                )
+                trial_circuit, _ = self.Construct_circuit_from_HS(
+                    partition_order, optimized_partitions, len(trial_pi)
+                )
+                actual_cnot = trial_circuit.get_Gate_Nums().get("CNOT", 0)
+                ranked_results.append((actual_cnot, trial_pi, heuristic_cost, None, None))
+        finally:
+            if old_progressbar is missing:
+                self.config.pop("progressbar", None)
+            else:
+                self.config["progressbar"] = old_progressbar
+            self._restore_single_qubit_circuits(
+                optimized_partitions, saved_sq_circuits
+            )
+
+        ranked_results.sort(key=lambda x: (x[0], x[2]))
+        ranked_results.extend(
+            (float("inf"), pi, cost, None, None) for cost, pi in heuristic_tail
+        )
+        return ranked_results
+
+    def Partition_Aware_Mapping(
+        self, circ: Circuit, orig_parameters: np.ndarray
+    ):
+        N = circ.get_Qbit_Num()
+
+        optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
+
+        for partition in optimized_partitions:
+            if isinstance(partition, PartitionSynthesisResult):
+                partition._topology = self.topology
+                partition._topology_cache = self._topology_cache
+
+        DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
+
+        D = self.compute_distances_bfs(N)
+        scoring_partitions = self._build_scoring_partitions(optimized_partitions)
+        candidate_cache = self._build_partition_candidate_cache(
+            scoring_partitions
+        )
+        layout_partitions = self._build_layout_partition_info(
+            optimized_partitions
+        )
+        seeded_pi = self._compute_seeded_layout(
+            optimized_partitions, D, N, circ
+        )
+
+        n_iterations = self.config.get('sabre_iterations', 1)
+        n_trials = self.config.get('n_layout_trials', 1)
+        random_seed = self.config.get('random_seed', 42)
+        do_cleanup = self.config.get('cleanup', True)
+
+        routing_start = time.time()
+        routing_swap_cnot = 0
+        partition_body_cnot = 0
+        routing_elapsed_before_cleanup = None
+        cleanup_total = 0.0
+        final_route_steps = None
+        final_route_pi_initial = None
+
+        if n_iterations == 0:
+            F = self.get_initial_layer(IDAG, N, optimized_partitions)
+            partition_order, pi, pi_initial = self.Heuristic_Search(
+                F,
+                pi=seeded_pi.copy(),
+                DAG=DAG,
+                IDAG=IDAG,
+                optimized_partitions=optimized_partitions,
+                scoring_partitions=scoring_partitions,
+                D=D,
+                candidate_cache=candidate_cache,
+            )
+            final_circuit, final_parameters = self.Construct_circuit_from_HS(
+                partition_order, optimized_partitions, N
+            )
+            routing_swap_cnot, partition_body_cnot = (
+                self._partition_order_cnot_breakdown(partition_order)
+            )
+
+        else:
+            trial_results = self._run_layout_trials(
+                seeded_pi=seeded_pi,
+                DAG=DAG,
+                IDAG=IDAG,
+                layout_partitions=layout_partitions,
+                scoring_partitions=scoring_partitions,
+                D=D,
+                candidate_cache=candidate_cache,
+                n_iterations=n_iterations,
+                n_trials=max(1, n_trials),
+                random_seed=random_seed,
+            )
+            actual_rank_default = min(
+                max(1, self.config.get("cleanup_top_k", 3) * 2),
+                max(1, n_trials),
+            )
+            actual_rank_top_k = self.config.get(
+                "actual_routing_rank_top_k", actual_rank_default
+            )
+            trial_results = self._rank_layout_trials_by_actual_routing(
+                trial_results,
+                DAG,
+                IDAG,
+                optimized_partitions,
+                scoring_partitions,
+                D,
+                candidate_cache,
+                rank_top_k=actual_rank_top_k,
+            )
+            routing_elapsed_before_cleanup = time.time() - routing_start
+
+            # Pick the best trial (already ranked by actual routing).
+            _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
+
+            if route_steps is not None:
+                partition_order = self._partition_order_from_cpp_steps(
+                    route_steps,
+                    optimized_partitions,
+                    candidate_cache,
+                    N,
+                    pi_initial=trace_pi_init,
+                )
+                pi = np.asarray(best_pi, dtype=np.int64)
+                pi_initial = np.asarray(trace_pi_init, dtype=np.int64)
+                final_route_steps = route_steps
+                final_route_pi_initial = pi_initial.copy()
+            else:
+                F = self.get_initial_layer(IDAG, N, optimized_partitions)
+                partition_order, pi, pi_initial = self.Heuristic_Search(
+                    F,
+                    best_pi.copy(),
+                    DAG,
+                    IDAG,
+                    optimized_partitions,
+                    scoring_partitions,
+                    D,
+                    candidate_cache=candidate_cache,
+                )
+
+            trial_circuit, trial_params = self.Construct_circuit_from_HS(
+                partition_order, optimized_partitions, N
+            )
+            routing_swap_cnot, partition_body_cnot = (
+                self._partition_order_cnot_breakdown(partition_order)
+            )
+            pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0)
+
+            if do_cleanup:
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+                    qgd_Wide_Circuit_Optimization,
+                )
+
+                cleanup_config = dict(self.config)
+                cleanup_config['topology'] = self.topology
+                cleanup_config['routed'] = True
+                cleanup_config['test_subcircuits'] = False
+                cleanup_config['test_final_circuit'] = False
+                cleanup_config['global_min'] = True
+                cleanup_config['use_osr'] = 1
+                cleanup_config['use_graph_search'] = 1
+                cleanup_config['max_partition_size'] = 4
+
+                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+
+                cleanup_t0 = time.time()
+                final_circuit, final_parameters = wco.OptimizeWideCircuit(
+                    trial_circuit.get_Flat_Circuit(),
+                    trial_params,
+                )
+                cleanup_total += time.time() - cleanup_t0
+            else:
+                final_circuit, final_parameters = trial_circuit, trial_params
+
+        if do_cleanup and n_iterations > 0:
+            self._routing_time = routing_elapsed_before_cleanup
+            self._cleanup_time = cleanup_total
+            self._cnot_pre_cleanup = pre_cleanup_cnots
+        else:
+            self._routing_time = time.time() - routing_start
+            self._cleanup_time = 0.0
+            self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get(
+                'CNOT', 0
+            )
+
+            if self.config.get('cleanup', True):
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+                    qgd_Wide_Circuit_Optimization,
+                )
+
+                cleanup_config = dict(self.config)
+                cleanup_config['topology'] = self.topology
+                cleanup_config['routed'] = True
+                cleanup_config['test_subcircuits'] = False
+                cleanup_config['test_final_circuit'] = False
+                cleanup_config['global_min'] = True
+                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+
+                final_circuit, final_parameters = wco.OptimizeWideCircuit(
+                    final_circuit.get_Flat_Circuit(), final_parameters
+                )
+
+        self._routing_swap_cnot = routing_swap_cnot
+        self._partition_body_cnot = partition_body_cnot
+
+        routing_trace_path = self.config.get("routing_trace_path", None)
+        if routing_trace_path:
+            if final_route_steps is not None and final_route_pi_initial is not None:
+                self._write_cpp_routing_trace(
+                    routing_trace_path,
+                    final_route_steps,
+                    final_route_pi_initial,
+                    candidate_cache,
+                    layout_partitions,
+                    DAG,
+                    IDAG,
+                    N,
+                )
+            else:
+                logging.warning(
+                    "routing_trace_path was set, but no C++ route steps were "
+                    "available for the selected route."
+                )
+
+        return final_circuit, final_parameters, pi_initial, pi
+
+    # ------------------------------------------------------------------------
+    # Heuristic Search
+    # ------------------------------------------------------------------------
+
+    def _select_best_candidate(self, partition_candidates, scores, rng=None):
+        """Return the candidate whose score is the smallest.
+
+        ``rng`` is accepted but discarded, so the result depends only on
+        ``scores``; ``np.argmin`` resolves ties by taking the first minimum.
+        """
+        del rng
+        lowest_position = np.argmin(np.array(scores))
+        return partition_candidates[lowest_position]
+
+    def _prefilter_candidates(
+        self,
+        partition_candidates,
+        pi,
+        D,
+        top_k,
+        F=None,
+        E=None,
+        candidate_cache=None,
+        layout_partitions=None,
+        reverse=False,
+        W=0.5,
+        alpha=1.0,
+        canonical_data=None,
+    ):
+        """Shortlist candidates with a cheap estimate before full scoring.
+
+        The estimate of a candidate is ``_routing_objective`` applied to its
+        ``estimate_swap_count`` and ``cnot_count``, plus
+        ``_future_context_cost`` evaluated on the layout predicted by
+        ``_estimate_candidate_output_layout``.
+
+        Selection runs in two stages:
+
+        1. Optional quotas.  When the config keys
+           ``prefilter_min_per_partition`` or ``prefilter_min_3q`` are
+           positive, the best-estimated candidates of every partition are
+           reserved first.  Partitions whose candidates involve three or
+           more qubits use the larger of the two quotas.
+        2. Global fill.  Candidates are added in ascending estimate order
+           until ``top_k`` distinct entries are selected.
+
+        Quotas are honoured even when they add up to more than ``top_k``,
+        so the result can be longer than ``top_k``.
+
+        Returns:
+            ``[]`` when ``top_k <= 0``; the input object itself when it
+            already holds at most ``top_k`` candidates; otherwise a new list
+            of the selected candidates sorted by ascending estimate.
+        """
+        if top_k <= 0:
+            return []
+        if len(partition_candidates) <= top_k:
+            return partition_candidates
+
+        cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
+        # Loop-invariant: the future-context helper expects iterables.
+        frontier = F or ()
+        lookahead = E or ()
+        estimates = np.array([
+            (
+                self._routing_objective(
+                    pc.estimate_swap_count(pi, D, reverse=reverse),
+                    pc.cnot_count,
+                    cnot_cost,
+                )
+                + self._future_context_cost(
+                    pc.partition_idx,
+                    self._estimate_candidate_output_layout(
+                        pc, pi, reverse=reverse
+                    ),
+                    frontier,
+                    lookahead,
+                    D,
+                    candidate_cache,
+                    reverse=reverse,
+                    cnot_cost=cnot_cost,
+                    W=W,
+                    alpha=alpha,
+                    layout_partitions=layout_partitions,
+                    canonical_data=canonical_data,
+                )
+            )
+            for pc in partition_candidates
+        ])
+
+        selected = set()
+        min_per_partition = int(
+            self.config.get('prefilter_min_per_partition', 0) or 0
+        )
+        min_3q = int(self.config.get('prefilter_min_3q', 0) or 0)
+        if min_per_partition > 0 or min_3q > 0:
+            by_partition = defaultdict(list)
+            for idx, pc in enumerate(partition_candidates):
+                by_partition[pc.partition_idx].append(idx)
+            for indices in by_partition.values():
+                # Partition width is read from its first candidate.
+                sample = partition_candidates[indices[0]]
+                quota = min_per_partition
+                if len(sample.involved_qbits) >= 3:
+                    quota = max(quota, min_3q)
+                if quota <= 0:
+                    continue
+                ranked = sorted(indices, key=lambda i: estimates[i])
+                selected.update(ranked[:quota])
+
+        # More than ``top_k`` candidates exist here (see the early return
+        # above), so this fill always ends with a non-empty selection and no
+        # further fallback is needed.
+        if len(selected) < top_k:
+            for idx in np.argsort(estimates):
+                selected.add(int(idx))
+                if len(selected) >= top_k:
+                    break
+
+        return [
+            partition_candidates[i]
+            for i in sorted(selected, key=lambda idx: estimates[idx])
+        ]
+
+    @staticmethod
+    def _decay_factor_for_swaps(swaps, decay):
+        """Return the largest decay value touched by ``swaps``.
+
+        ``swaps`` holds ``(u, v)`` pairs that index into ``decay``.  A falsy
+        ``swaps`` (empty or ``None``) yields the neutral factor ``1.0``.
+        """
+        if swaps:
+            return max(max(decay[a], decay[b]) for a, b in swaps)
+        return 1.0
+
+    @staticmethod
+    def _routing_objective(
+        route_cost,
+        cnot_count,
+        cnot_cost,
+        cnot_weight=1.0,
+        decay_factor=1.0,
+    ):
+        """Fold a routing cost and a CNOT count into one scalar.
+
+        Evaluates ``decay_factor * (route_cost + cnot_weight * cnot_cost *
+        cnot_count)`` with ``route_cost`` and ``cnot_count`` coerced to
+        ``float`` first.
+        """
+        routing_term = float(route_cost)
+        gate_term = cnot_weight * cnot_cost * float(cnot_count)
+        return decay_factor * (routing_term + gate_term)
+
+    def _apply_decay_for_swaps(self, swaps, decay):
+        """Increase ``decay`` in place for both endpoints of every swap.
+
+        The increment is the config value ``decay_delta`` (default
+        ``0.001``); a non-positive increment leaves ``decay`` untouched.
+        """
+        bump = self.config.get("decay_delta", 0.001)
+        if bump <= 0:
+            return
+        for first, second in swaps:
+            for endpoint in (first, second):
+                decay[endpoint] += bump
+
+    @staticmethod
+    def _reset_decay(decay):
+        """Set every entry of ``decay`` back to ``1.0`` in place."""
+        decay[:] = [1.0] * len(decay)
+
+    @staticmethod
+    def _apply_swaps_to_pi(pi, swaps):
+        """Return a copy of ``pi`` with ``swaps`` applied in order.
+
+        ``pi`` is treated as a full permutation (index -> value).  Each swap
+        names two *values* of ``pi``; the indices currently holding those
+        values exchange them.  The input is not modified and the result is a
+        list of Python ints.
+        """
+        layout = [int(value) for value in pi]
+        # Inverse lookup: which index currently holds each value.
+        holder = [0] * len(layout)
+        for index, value in enumerate(layout):
+            holder[value] = index
+        for pos_a, pos_b in swaps:
+            idx_a, idx_b = holder[pos_a], holder[pos_b]
+            holder[pos_a], holder[pos_b] = idx_b, idx_a
+            layout[idx_a], layout[idx_b] = pos_b, pos_a
+        return layout
+
+    def _sample_initial_layout(self, trial_idx, n_trials, seeded_pi, rng):
+        """Return the starting layout for one layout trial.
+
+        A copy of ``seeded_pi`` (as an ``int64`` array) is returned for the
+        first trial, when at most one trial is requested, or when no ``rng``
+        is supplied.  Every other trial gets ``rng.permutation`` over the
+        same number of entries.
+        """
+        base_layout = np.asarray(seeded_pi, dtype=np.int64)
+        keep_seed = n_trials <= 1 or rng is None or trial_idx == 0
+        if not keep_seed:
+            return rng.permutation(len(base_layout))
+        return base_layout.copy()
+
+    def _bfs_shortest_path(self, src, dst):
+        """Breadth-first search from ``src`` to ``dst`` over ``self._adj``.
+
+        Returns the node sequence including both endpoints, ``[src]`` when
+        the endpoints coincide, and ``[]`` when ``dst`` is never reached.
+        """
+        if src == dst:
+            return [src]
+
+        came_from = {src: None}
+        frontier = deque([src])
+        reached = False
+        while frontier and not reached:
+            current = frontier.popleft()
+            for neighbour in self._adj[current]:
+                if neighbour in came_from:
+                    continue
+                came_from[neighbour] = current
+                if neighbour == dst:
+                    # Stop as soon as the target is discovered.
+                    reached = True
+                    break
+                frontier.append(neighbour)
+
+        if not reached:
+            return []
+
+        # Walk the predecessor links back to ``src`` and flip the order.
+        route = []
+        step = dst
+        while step is not None:
+            route.append(step)
+            step = came_from[step]
+        route.reverse()
+        return route
+
+    @staticmethod
+    def _entry_future_cost(entry, output_perm_arr, D_arr):
+        """Sum the excess distance of an entry's edges under a layout.
+
+        ``entry`` supplies index arrays under ``"edges_u"`` and
+        ``"edges_v"``.  Each edge contributes ``max(0, D - 1)``, where ``D``
+        is read from ``D_arr`` at the ``output_perm_arr`` images of its two
+        endpoints.  Returns ``0.0`` when ``entry`` has no ``"edges_u"``.
+        """
+        edges_u = entry.get("edges_u")
+        if edges_u is None:
+            return 0.0
+        src_pos = output_perm_arr[edges_u]
+        dst_pos = output_perm_arr[entry["edges_v"]]
+        excess = np.maximum(0, D_arr[src_pos, dst_pos] - 1)
+        return float(excess.sum())
+
+    @staticmethod
+    def _estimate_candidate_output_layout(partition_candidate, pi, reverse=False):
+        """Predict the layout after ``partition_candidate`` is applied.
+
+        Starts from a copy of ``pi`` and, for every exit slot that appears
+        as a value in the candidate's ``qbit_map``, overwrites the mapped
+        key's entry with ``node_mapping[P_exit[slot]]``.  ``P_exit`` is
+        ``P_i`` when ``reverse`` is true and ``P_o`` otherwise.  Entries of
+        qubits outside the candidate keep their ``pi`` value.
+        """
+        if reverse:
+            exit_perm = partition_candidate.P_i
+        else:
+            exit_perm = partition_candidate.P_o
+        layout = [int(value) for value in pi]
+        # Invert qbit_map so exit slots can be traced back to its keys.
+        slot_to_key = {
+            slot: key for key, slot in partition_candidate.qbit_map.items()
+        }
+        for slot in range(len(exit_perm)):
+            if slot not in slot_to_key:
+                continue
+            layout[slot_to_key[slot]] = partition_candidate.node_mapping[
+                exit_perm[slot]
+            ]
+        return layout
+
+    @staticmethod
+    def _future_context_cost(
+        exclude_partition_idx,
+        pi,
+        F,
+        E,
+        D,
+        candidate_cache,
+        reverse=False,
+        cnot_cost=1.0 / 3.0,
+        W=0.5,
+        alpha=1.0,
+        layout_partitions=None,
+        canonical_data=None,
+    ):
+        """Estimate how costly the remaining partitions are under ``pi``.
+
+        Per-partition cost is a candidate-aware lower bound:
+
+        * if ``candidate_cache`` has candidates for the partition and they
+          involve three or more qubits, the minimum ``estimate_swap_count``
+          over those candidates is used, which lets 3q line blocks
+          distinguish which logical qubit should sit on the path center;
+        * otherwise the ``canonical_data`` entry is scored with
+          ``_entry_future_cost``;
+        * partitions with neither source are ignored.
+
+        The result is the mean cost over ``F`` plus ``W`` times the mean of
+        ``alpha ** depth``-weighted costs over ``E`` (pairs of
+        ``(partition index, depth)``).  ``exclude_partition_idx`` is left out
+        of both sets.  ``cnot_cost`` and ``layout_partitions`` are accepted
+        but unused.
+        """
+        del cnot_cost, layout_partitions
+
+        layout_arr = np.asarray(pi, dtype=np.intp)
+        dist_arr = np.asarray(D)
+
+        def partition_cost(p_idx):
+            has_cache_slot = (
+                candidate_cache is not None
+                and 0 <= p_idx < len(candidate_cache)
+            )
+            if has_cache_slot:
+                cached = candidate_cache[p_idx]
+                if cached and len(cached[0].involved_qbits) >= 3:
+                    return min(
+                        option.estimate_swap_count(pi, D, reverse=reverse)
+                        for option in cached
+                    )
+            entry = (
+                None if canonical_data is None
+                else canonical_data.get(p_idx)
+            )
+            if entry is None:
+                return None
+            return qgd_Partition_Aware_Mapping._entry_future_cost(
+                entry, layout_arr, dist_arr
+            )
+
+        # Front layer: plain average over the partitions that have a cost.
+        front_total = 0.0
+        front_hits = 0
+        for p_idx in F:
+            if p_idx != exclude_partition_idx:
+                cost = partition_cost(p_idx)
+                if cost is not None:
+                    front_total += cost
+                    front_hits += 1
+        if front_hits > 0:
+            score = front_total / front_hits
+        else:
+            score = 0.0
+
+        if not E:
+            return score
+
+        # Extended set: depth-discounted costs, averaged by entry count.
+        ext_total = 0.0
+        ext_hits = 0
+        for p_idx, depth in E:
+            if p_idx != exclude_partition_idx:
+                cost = partition_cost(p_idx)
+                if cost is not None:
+                    ext_total += (alpha ** depth) * cost
+                    ext_hits += 1
+        if ext_hits:
+            score += W * ext_total / ext_hits
+        return score
+
+    def _release_valve(self, F, pi, D, canonical_data):
+        """Build swaps that pull the farthest frontier edge together.
+
+        Among the ``canonical_data`` edges of the partitions in ``F``, the
+        edge with the largest ``D`` distance under ``pi`` is chosen (only
+        distances above 1 qualify; ties go to the lower partition index).
+        A BFS path between its two endpoints is then walked from both ends
+        towards the middle, one swap per step, which leaves the endpoints
+        on neighbouring path nodes.
+
+        Returns:
+            ``(swaps, new_pi)``: the swaps as pairs of path nodes and the
+            layout after applying them.  ``([], list(pi))`` when no edge
+            qualifies or no path of at least two nodes exists.
+        """
+        layout_arr = np.asarray(pi, dtype=np.intp)
+        dist_arr = np.asarray(D)
+
+        # (distance, partition index, endpoint u, endpoint v) of the pick.
+        worst = None
+        for p_idx in F:
+            entry = canonical_data.get(p_idx)
+            edges_u = None if entry is None else entry.get("edges_u")
+            if edges_u is None:
+                continue
+            edges_v = entry["edges_v"]
+            gaps = dist_arr[layout_arr[edges_u], layout_arr[edges_v]]
+            if gaps.size == 0:
+                continue
+            far_pos = int(np.argmax(gaps))
+            far_gap = float(gaps[far_pos])
+            if far_gap <= 1:
+                continue
+            replaces = (
+                worst is None
+                or far_gap > worst[0]
+                or (far_gap == worst[0] and p_idx < worst[1])
+            )
+            if replaces:
+                worst = (
+                    far_gap,
+                    p_idx,
+                    int(edges_u[far_pos]),
+                    int(edges_v[far_pos]),
+                )
+
+        if worst is None:
+            return [], list(pi)
+
+        end_u, end_v = worst[2], worst[3]
+        path = self._bfs_shortest_path(int(pi[end_u]), int(pi[end_v]))
+        if len(path) < 2:
+            return [], list(pi)
+
+        last = len(path) - 1
+        half = last // 2
+        # First endpoint advances ``half`` steps along the path ...
+        swaps = [(path[i], path[i + 1]) for i in range(half)]
+        # ... and the second retreats until it sits right after it.
+        swaps.extend(
+            (path[i], path[i - 1]) for i in range(last, half + 1, -1)
+        )
+
+        return swaps, self._apply_swaps_to_pi(pi, swaps)
+
+    @staticmethod
+    def _build_neighbor_info(
+        partition_idx,
+        F,
+        E,
+        pi,
+        canonical_data,
+        weight=0.2,
+        W=0.5,
+        alpha=0.9,
+        layout_partitions=None,
+    ):
+        """Collect weighted qubit pairs of the partitions around one partition.
+
+        Every partition in ``F`` contributes with weight ``1.0`` and every
+        ``(index, depth)`` pair in ``E`` with ``W * alpha ** depth``;
+        ``partition_idx`` itself, non-positive weights and indices at or
+        beyond ``len(layout_partitions)`` are skipped.  A partition's pairs
+        come from its ``canonical_data`` edges when present, otherwise from
+        all pairs of its involved qubits.  Weights of repeated pairs
+        (orientation ignored) are summed.
+
+        Returns:
+            ``None`` when ``weight <= 0``, ``layout_partitions`` is ``None``
+            or no pair was collected.  Otherwise a dict with
+            ``"neighbor_vqs"`` (sorted qubits), ``"initial_pos"`` (their
+            ``pi`` values), ``"edges"`` (triples of two indices into
+            ``neighbor_vqs`` and a weight) and ``"weight"``.
+        """
+        if weight <= 0 or layout_partitions is None:
+            return None
+
+        pair_weights = {}
+        seen_qubits = set()
+
+        def record(u, v, amount):
+            # Accumulate one undirected pair under a normalised key.
+            u = int(u)
+            v = int(v)
+            seen_qubits.add(u)
+            seen_qubits.add(v)
+            key = (u, v) if u <= v else (v, u)
+            pair_weights[key] = pair_weights.get(key, 0.0) + amount
+
+        def add_edges(target_idx, edge_weight):
+            if target_idx == partition_idx or edge_weight <= 0:
+                return
+            if target_idx >= len(layout_partitions):
+                return
+            entry = canonical_data.get(target_idx) if canonical_data else None
+            if entry is not None and entry.get("edges_u") is not None:
+                for u, v in zip(entry["edges_u"], entry["edges_v"]):
+                    record(u, v, edge_weight)
+                return
+
+            # No canonical edges: fall back to every pair of involved qubits.
+            involved = qgd_Partition_Aware_Mapping._partition_involved_qbits(
+                layout_partitions[target_idx]
+            )
+            for offset, u in enumerate(involved):
+                for v in involved[offset + 1:]:
+                    record(u, v, edge_weight)
+
+        for future_idx in F:
+            add_edges(future_idx, 1.0)
+        for future_idx, depth in E or ():
+            add_edges(future_idx, W * (alpha ** depth))
+
+        if not pair_weights:
+            return None
+
+        neighbor_vqs = sorted(seen_qubits)
+        position_of = {q: pos for pos, q in enumerate(neighbor_vqs)}
+        edges = [
+            (position_of[u], position_of[v], amount)
+            for (u, v), amount in pair_weights.items()
+        ]
+        return {
+            "neighbor_vqs": neighbor_vqs,
+            "initial_pos": tuple(int(pi[q]) for q in neighbor_vqs),
+            "edges": edges,
+            "weight": weight,
+        }
+
+    def _advance_layout_frontier(
+        self,
+        selected_partition_idx,
+        F,
+        resolved_partitions,
+        DAG,
+        IDAG,
+        optimized_partitions,
+    ):
+        """Return the frontier state after resolving one partition.
+
+        Works on copies, so ``F`` and ``resolved_partitions`` are left
+        untouched and no circuit is modified.  The selected partition is
+        marked resolved and removed from the frontier.  Its descendants are
+        then visited in FIFO order: a child whose parents (per ``IDAG``) are
+        all resolved is either marked resolved straight away when
+        ``_partition_is_single`` accepts it (its own children are queued in
+        turn) or appended to the frontier.
+
+        Used by the boundary beam rollout, which tracks only dependency
+        state and layout.
+
+        Returns:
+            ``(frontier, resolved)`` as two tuples.
+        """
+        frontier = list(F)
+        resolved = list(resolved_partitions)
+
+        if selected_partition_idx in frontier:
+            frontier.remove(selected_partition_idx)
+        resolved[selected_partition_idx] = True
+
+        pending = deque(DAG[selected_partition_idx])
+        while pending:
+            child = pending.popleft()
+            if resolved[child] or child in frontier:
+                continue
+            if any(not resolved[parent] for parent in IDAG[child]):
+                continue
+            if not self._partition_is_single(optimized_partitions[child]):
+                frontier.append(child)
+                continue
+            # Single-qubit partitions are elided and unlock their children.
+            resolved[child] = True
+            pending.extend(DAG[child])
+
+        return tuple(frontier), tuple(resolved)
+
+    def _boundary_beam_select_index(
+        self,
+        partition_candidates,
+        scores,
+        cached_swaps,
+        cached_pi,
+        F_snapshot,
+        resolved_partitions,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        canonical_data,
+        reverse=False,
+        W=0.5,
+        alpha=1.0,
+        cnot_cost=1.0 / 3.0,
+        adj=None,
+    ):
+        """Choose the next candidate by rolling out boundary-layout states.
+
+        The ordinary SABRE selector commits to the locally best candidate.
+        This keeps a small beam of possible boundary layouts across several
+        future partitions, then returns the first candidate from the best
+        rollout.
+
+        ``scores``, ``cached_swaps`` and ``cached_pi`` are parallel to
+        ``partition_candidates`` and hold the already computed score, swap
+        list and output layout of each candidate.  Beam size and depth come
+        from the config keys ``boundary_beam_width`` and
+        ``boundary_beam_depth``.
+
+        The plain argmin of ``scores`` is returned when the beam is disabled
+        (width or depth at most 1), when there is at most one candidate,
+        when no candidate involves three or more qubits, or when no
+        candidate has a cached output layout.
+
+        Returns:
+            Index into ``partition_candidates`` of the chosen candidate.
+        """
+        beam_width = int(self.config.get("boundary_beam_width", 1) or 1)
+        beam_depth = int(self.config.get("boundary_beam_depth", 1) or 1)
+        fallback_idx = int(np.argmin(np.asarray(scores)))
+        if beam_width <= 1 or beam_depth <= 1 or len(partition_candidates) <= 1:
+            return fallback_idx
+        if not any(len(cand.involved_qbits) >= 3 for cand in partition_candidates):
+            return fallback_idx
+
+        max_E_size = self.config.get("max_E_size", 20)
+        max_lookahead = self.config.get("max_lookahead", 4)
+        top_k = self.config.get("prefilter_top_k", 50)
+        path_weight = self.config.get("path_tiebreak_weight", 0.2)
+        three_q_weight = self.config.get("three_qubit_exit_weight", 1.0)
+
+        def transition_cost(cand, swaps):
+            # Cost actually paid for taking ``cand``: swaps plus its CNOTs.
+            return self._routing_objective(
+                len(swaps or ()),
+                cand.cnot_count,
+                cnot_cost,
+            )
+
+        # Beam state layout:
+        #   (rank cost, accumulated transition cost, layout, frontier,
+        #    resolved flags, index of the first-step candidate)
+        states = []
+        for idx, cand in enumerate(partition_candidates):
+            if cached_pi[idx] is None:
+                continue
+            trans_cost = transition_cost(cand, cached_swaps[idx])
+            F_next, resolved_next = self._advance_layout_frontier(
+                cand.partition_idx,
+                F_snapshot,
+                resolved_partitions,
+                DAG,
+                IDAG,
+                optimized_partitions,
+            )
+            states.append(
+                (
+                    float(scores[idx]),
+                    float(trans_cost),
+                    tuple(int(x) for x in cached_pi[idx]),
+                    F_next,
+                    resolved_next,
+                    idx,
+                )
+            )
+
+        if not states:
+            return fallback_idx
+
+        states.sort(key=lambda item: (item[0], item[5]))
+        states = states[:beam_width]
+
+        for _ in range(1, beam_depth):
+            expanded = []
+            for _, total_cost, pi_state, F_state, resolved_state, first_idx in states:
+                if not F_state:
+                    # Nothing left to route: carry the state over unchanged.
+                    expanded.append(
+                        (total_cost, total_cost, pi_state, F_state, resolved_state, first_idx)
+                    )
+                    continue
+
+                resolved_list = list(resolved_state)
+                F_list = list(F_state)
+                E = self.generate_extended_set(
+                    F_list,
+                    DAG,
+                    IDAG,
+                    resolved_list,
+                    optimized_partitions,
+                    max_E_size=max_E_size,
+                    max_lookahead=max_lookahead,
+                )
+                candidates = self.obtain_partition_candidates(
+                    F_list,
+                    optimized_partitions,
+                    candidate_cache=candidate_cache,
+                )
+                if not candidates:
+                    expanded.append(
+                        (total_cost, total_cost, pi_state, F_state, resolved_state, first_idx)
+                    )
+                    continue
+                candidates = self._prefilter_candidates(
+                    candidates,
+                    list(pi_state),
+                    D,
+                    top_k,
+                    F=F_state,
+                    E=E,
+                    candidate_cache=candidate_cache,
+                    layout_partitions=optimized_partitions,
+                    reverse=reverse,
+                    W=W,
+                    alpha=alpha,
+                    canonical_data=canonical_data,
+                )
+
+                # Within one state the neighbour info depends only on the
+                # partition index, so build it once per partition instead of
+                # once per candidate.
+                neighbor_info_by_partition = {}
+                for cand in candidates:
+                    if cand.partition_idx not in neighbor_info_by_partition:
+                        neighbor_info_by_partition[cand.partition_idx] = (
+                            self._build_neighbor_info(
+                                cand.partition_idx,
+                                F_state,
+                                E,
+                                pi_state,
+                                canonical_data,
+                                weight=path_weight,
+                                W=W,
+                                alpha=alpha,
+                                layout_partitions=optimized_partitions,
+                            )
+                        )
+                    neighbor_info = neighbor_info_by_partition[
+                        cand.partition_idx
+                    ]
+                    score, swaps, output_perm = self.score_partition_candidate(
+                        cand,
+                        F_state,
+                        list(pi_state),
+                        scoring_partitions,
+                        D,
+                        self._swap_cache,
+                        E=E,
+                        W=W,
+                        alpha=alpha,
+                        reverse=reverse,
+                        canonical_data=canonical_data,
+                        adj=adj,
+                        cnot_cost=cnot_cost,
+                        path_tiebreak_weight=path_weight,
+                        cached_neighbor_info=neighbor_info,
+                        candidate_cache=candidate_cache,
+                        layout_partitions=optimized_partitions,
+                        return_transforms=True,
+                        three_qubit_exit_weight=three_q_weight,
+                    )
+                    if output_perm is None:
+                        # Same rule as for the first step: a candidate
+                        # without an output layout cannot extend a rollout.
+                        continue
+                    trans_cost = transition_cost(cand, swaps)
+                    # ``score`` mixes the paid cost with a look-ahead term;
+                    # only the paid part accumulates along the rollout.
+                    future_cost = float(score) - trans_cost
+                    new_total = total_cost + trans_cost
+                    rank_cost = new_total + future_cost
+                    F_next, resolved_next = self._advance_layout_frontier(
+                        cand.partition_idx,
+                        F_state,
+                        resolved_state,
+                        DAG,
+                        IDAG,
+                        optimized_partitions,
+                    )
+                    expanded.append(
+                        (
+                            rank_cost,
+                            new_total,
+                            tuple(int(x) for x in output_perm),
+                            F_next,
+                            resolved_next,
+                            first_idx,
+                        )
+                    )
+
+            if not expanded:
+                break
+            expanded.sort(key=lambda item: (item[0], item[5]))
+            states = expanded[:beam_width]
+
+        if not states:
+            return fallback_idx
+        return int(min(states, key=lambda item: (item[0], item[5]))[5])
+
+    def Heuristic_Search(
+        self,
+        F,
+        pi,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache=None,
+    ):
+        pi_initial = pi.copy()
+        F = list(F)
+
+        resolved_partitions = [False] * len(DAG)
+        partition_order = []
+        resolved_count = 0
+
+        queue = deque(
+            p
+            for p in F
+            if isinstance(optimized_partitions[p], SingleQubitPartitionResult)
+        )
+        while queue:
+            partition_idx = queue.pop()
+            if resolved_partitions[partition_idx]:
+                continue
+            if partition_idx in F:
+                F.remove(partition_idx)
+
+            single_qubit_part = optimized_partitions[partition_idx]
+            original_qubit = int(single_qubit_part.involved_qbits[0])
+            circuit_qubit = int(single_qubit_part.circuit.get_Qbits()[0])
+            single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits(
+                {circuit_qubit: int(pi[original_qubit])},
+                max(D.shape),
+            )
+            partition_order.append(single_qubit_part)
+            resolved_partitions[partition_idx] = True
+            resolved_count += 1
+
+            for child in DAG[partition_idx]:
+                if not resolved_partitions[child] and child not in F:
+                    if all(resolved_partitions[p] for p in IDAG[child]):
+                        if isinstance(
+                            optimized_partitions[child],
+                            SingleQubitPartitionResult,
+                        ):
+                            queue.append(child)
+                        else:
+                            F.append(child)
+
+        total_partitions = len(DAG)
+        pbar = tqdm(
+            total=total_partitions,
+            desc="Heuristic Search",
+            bar_format=(
+                "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved"
+            ),
+            disable=self.config.get("progressbar", 0) is False,
+            mininterval=0.2,
+        )
+        if resolved_count:
+            pbar.update(resolved_count)
+
+        max_E_size = self.config.get("max_E_size", 20)
+        max_lookahead = self.config.get("max_lookahead", 4)
+        E_W = self.config.get("E_weight", 0.5)
+        E_alpha = self.config.get("E_alpha", 1.0)
+        swap_burst_budget = self.config.get("swap_burst_budget", 5)
+
+        canonical_data = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=False
+        )
+        decay = [1.0] * len(pi)
+        swap_heavy_partitions = 0
+
+        while F:
+            if (
+                swap_burst_budget > 0
+                and swap_heavy_partitions >= swap_burst_budget
+            ):
+                valve_swaps, pi_bridged = self._release_valve(
+                    F, pi, D, canonical_data
+                )
+                if valve_swaps:
+                    partition_order.append(
+                        construct_swap_circuit(valve_swaps, len(pi))
+                    )
+                    self._apply_decay_for_swaps(valve_swaps, decay)
+                    pi = np.asarray(pi_bridged)
+                    swap_heavy_partitions = 0
+                    continue
+                self._reset_decay(decay)
+                swap_heavy_partitions = 0
+
+            F_snapshot = tuple(F)
+            E = self.generate_extended_set(
+                F,
+                DAG,
+                IDAG,
+                resolved_partitions,
+                optimized_partitions,
+                max_E_size=max_E_size,
+                max_lookahead=max_lookahead,
+            )
+
+            partition_candidates = self.obtain_partition_candidates(
+                F,
+                optimized_partitions,
+                candidate_cache=candidate_cache,
+            )
+            if not partition_candidates:
+                break
+
+            top_k = self.config.get("prefilter_top_k", 50)
+            partition_candidates = self._prefilter_candidates(
+                partition_candidates,
+                pi,
+                D,
+                top_k,
+                F=F_snapshot,
+                E=E,
+                candidate_cache=candidate_cache,
+                layout_partitions=optimized_partitions,
+                W=E_W,
+                alpha=E_alpha,
+                canonical_data=canonical_data,
+            )
+
+            # Group candidates by partition_idx to reuse _build_neighbor_info
+            candidate_order = sorted(
+                range(len(partition_candidates)),
+                key=lambda i: partition_candidates[i].partition_idx
+            )
+            scores = [0.0] * len(partition_candidates)
+            cached_swaps = [None] * len(partition_candidates)
+            cached_pi = [None] * len(partition_candidates)
+            prev_partition_idx = None
+            cached_neighbor_info = None
+            for ci in candidate_order:
+                cand = partition_candidates[ci]
+                if cand.partition_idx != prev_partition_idx:
+                    cached_neighbor_info = self._build_neighbor_info(
+                        cand.partition_idx,
+                        F_snapshot,
+                        E,
+                        pi,
+                        canonical_data,
+                        weight=self.config.get("path_tiebreak_weight", 0.2),
+                        W=E_W,
+                        alpha=E_alpha,
+                        layout_partitions=optimized_partitions,
+                    )
+                    prev_partition_idx = cand.partition_idx
+                score, swaps, output_perm = self.score_partition_candidate(
+                    cand,
+                    F_snapshot,
+                    pi,
+                    scoring_partitions,
+                    D,
+                    self._swap_cache,
+                    E=E,
+                    W=E_W,
+                    alpha=E_alpha,
+                    canonical_data=canonical_data,
+                    adj=self._adj,
+                    cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0),
+                    path_tiebreak_weight=self.config.get(
+                        "path_tiebreak_weight", 0.2
+                    ),
+                    decay=decay,
+                    cached_neighbor_info=cached_neighbor_info,
+                    candidate_cache=candidate_cache,
+                    layout_partitions=optimized_partitions,
+                    return_transforms=True,
+                    three_qubit_exit_weight=self.config.get(
+                        "three_qubit_exit_weight", 1.0
+                    ),
+                )
+                scores[ci] = score
+                cached_swaps[ci] = swaps
+                cached_pi[ci] = output_perm
+
+            best_idx = self._boundary_beam_select_index(
+                partition_candidates,
+                scores,
+                cached_swaps,
+                cached_pi,
+                F_snapshot,
+                resolved_partitions,
+                DAG,
+                IDAG,
+                optimized_partitions,
+                scoring_partitions,
+                D,
+                candidate_cache,
+                canonical_data,
+                W=E_W,
+                alpha=E_alpha,
+                cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0),
+                adj=self._adj,
+            )
+            min_partition_candidate = partition_candidates[best_idx]
+
+            F.remove(min_partition_candidate.partition_idx)
+            resolved_partitions[min_partition_candidate.partition_idx] = True
+            resolved_count += 1
+            pbar.update(1)
+
+            swap_order, pi = cached_swaps[best_idx], cached_pi[best_idx]
+            if swap_order:
+                partition_order.append(construct_swap_circuit(swap_order, len(pi)))
+                self._apply_decay_for_swaps(swap_order, decay)
+                swap_heavy_partitions += 1
+            else:
+                swap_heavy_partitions = 0
+                self._reset_decay(decay)
+
+            partition_order.append(min_partition_candidate)
+
+            children = deque(DAG[min_partition_candidate.partition_idx])
+            while children:
+                child = children.popleft()
+                parents_resolved = all(
+                    resolved_partitions[parent] for parent in IDAG[child]
+                )
+                if (not resolved_partitions[child] and child not in F) and (
+                    parents_resolved
+                ):
+                    if isinstance(
+                        optimized_partitions[child], SingleQubitPartitionResult
+                    ):
+                        child_partition = optimized_partitions[child]
+                        original_qubit = int(child_partition.involved_qbits[0])
+                        circuit_qubit = int(child_partition.circuit.get_Qbits()[0])
+                        child_partition.circuit = child_partition.circuit.Remap_Qbits(
+                            {circuit_qubit: int(pi[original_qubit])},
+                            max(D.shape),
+                        )
+                        partition_order.append(child_partition)
+                        resolved_partitions[child] = True
+                        resolved_count += 1
+                        pbar.update(1)
+                        children.extend(DAG[child])
+                    else:
+                        F.append(child)
+
+        pbar.close()
+        return partition_order, pi, pi_initial
+
+    def _heuristic_search_layout_only(
+        self,
+        F,
+        pi,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        rng=None,
+        reverse=False,
+        candidate_cache=None,
+    ):
+        """Run heuristic search but only track layout (pi). No circuit modification.
+
+        Args:
+            F: Initial front layer of partition indices. A private list copy
+                is made, so the caller's container is left untouched.
+            pi: Starting layout; after every selection it is replaced by the
+                output layout of the chosen candidate.
+            DAG: ``DAG[i]`` is iterated as the children of partition ``i``.
+            IDAG: ``IDAG[i]`` is iterated as the parents of partition ``i``.
+            optimized_partitions: Partition results indexed by partition
+                index; used for the single-qubit test, candidate lookup and
+                as ``layout_partitions`` in the scoring helpers.
+            scoring_partitions: Partition data handed to the canonical
+                neighbor builder and to candidate scoring.
+            D: Distance matrix forwarded to the routing/scoring helpers.
+            rng: Accepted but not used inside this method.
+            reverse: When True, swap P_i/P_o roles in scoring and layout
+                    updates (used for backward passes in SABRE iterations).
+            candidate_cache: Optional precomputed candidates, forwarded to
+                candidate generation, prefiltering, scoring and selection.
+
+        Returns:
+            (pi, total_cost): final layout and layout-only heuristic score.
+            Trial ranking reroutes returned layouts and sorts by actual
+            constructed-circuit CNOT count; this score is only a tie-breaker.
+        """
+        F = list(F)
+        resolved_partitions = [False] * len(DAG)
+        total_cost = 0.0
+
+        # Pre-pass: single-qubit partitions in the front layer are marked
+        # resolved without any scoring; doing so may unlock their children.
+        queue = deque(
+            p for p in F if self._partition_is_single(optimized_partitions[p])
+        )
+        while queue:
+            # deque.pop() takes from the right end, so this walk is LIFO.
+            partition_idx = queue.pop()
+            if resolved_partitions[partition_idx]:
+                continue
+            if partition_idx in F:
+                F.remove(partition_idx)
+            resolved_partitions[partition_idx] = True
+
+            for child in DAG[partition_idx]:
+                if not resolved_partitions[child] and child not in F:
+                    # A child becomes available once every parent is resolved.
+                    if all(resolved_partitions[p] for p in IDAG[child]):
+                        if self._partition_is_single(optimized_partitions[child]):
+                            queue.append(child)
+                        else:
+                            F.append(child)
+
+        # Tunables are read once from the config, with the defaults below.
+        max_E_size = self.config.get("max_E_size", 20)
+        max_lookahead = self.config.get("max_lookahead", 4)
+        E_W = self.config.get("E_weight", 0.5)
+        E_alpha = self.config.get("E_alpha", 1.0)
+        cnot_cost = self.config.get("cnot_cost", 1.0 / 3.0)
+        swap_burst_budget = self.config.get("swap_burst_budget", 5)
+
+        canonical_data = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=reverse
+        )
+        # One decay entry per layout slot, all starting at 1.0.
+        decay = [1.0] * len(pi)
+        # Number of consecutive selections that required at least one swap.
+        swap_heavy_partitions = 0
+
+        while F:
+            # Release valve: once `swap_burst_budget` consecutive selections
+            # needed swaps, ask `_release_valve` for swaps before scoring again.
+            if (
+                swap_burst_budget > 0
+                and swap_heavy_partitions >= swap_burst_budget
+            ):
+                valve_swaps, pi = self._release_valve(F, pi, D, canonical_data)
+                if valve_swaps:
+                    # Valve swaps are charged with a zero CNOT count.
+                    total_cost += self._routing_objective(
+                        len(valve_swaps),
+                        0,
+                        cnot_cost,
+                        decay_factor=self._decay_factor_for_swaps(
+                            valve_swaps, decay
+                        ),
+                    )
+                    self._apply_decay_for_swaps(valve_swaps, decay)
+                    swap_heavy_partitions = 0
+                    # Restart the iteration with the updated layout.
+                    continue
+                # The valve produced no swaps: reset decay and the streak.
+                self._reset_decay(decay)
+                swap_heavy_partitions = 0
+
+            # Immutable snapshot of the front layer for the scoring helpers;
+            # F itself is mutated further down in this iteration.
+            F_snapshot = tuple(F)
+            E = self.generate_extended_set(
+                F,
+                DAG,
+                IDAG,
+                resolved_partitions,
+                optimized_partitions,
+                max_E_size=max_E_size,
+                max_lookahead=max_lookahead,
+            )
+
+            partition_candidates = self.obtain_partition_candidates(
+                F,
+                optimized_partitions,
+                candidate_cache=candidate_cache,
+            )
+            # No candidates for the current front layer: stop and return the
+            # layout reached so far (remaining entries of F stay unresolved).
+            if not partition_candidates:
+                break
+
+            top_k = self.config.get("prefilter_top_k", 50)
+            partition_candidates = self._prefilter_candidates(
+                partition_candidates,
+                pi,
+                D,
+                top_k,
+                F=F_snapshot,
+                E=E,
+                candidate_cache=candidate_cache,
+                layout_partitions=optimized_partitions,
+                reverse=reverse,
+                W=E_W,
+                alpha=E_alpha,
+                canonical_data=canonical_data,
+            )
+
+            # Group candidates by partition_idx to reuse _build_neighbor_info
+            candidate_order = sorted(
+                range(len(partition_candidates)),
+                key=lambda i: partition_candidates[i].partition_idx
+            )
+            # Results are stored at the candidate's original position `ci`,
+            # so they stay aligned with `partition_candidates`.
+            scores = [0.0] * len(partition_candidates)
+            cached_swaps = [None] * len(partition_candidates)
+            cached_pi = [None] * len(partition_candidates)
+            prev_partition_idx = None
+            cached_neighbor_info = None
+            for ci in candidate_order:
+                cand = partition_candidates[ci]
+                # Rebuild neighbor info only when the partition changes.
+                if cand.partition_idx != prev_partition_idx:
+                    cached_neighbor_info = self._build_neighbor_info(
+                        cand.partition_idx,
+                        F_snapshot,
+                        E,
+                        pi,
+                        canonical_data,
+                        weight=self.config.get("path_tiebreak_weight", 0.2),
+                        W=E_W,
+                        alpha=E_alpha,
+                        layout_partitions=optimized_partitions,
+                    )
+                    prev_partition_idx = cand.partition_idx
+                score, swaps, output_perm = self.score_partition_candidate(
+                    cand,
+                    F_snapshot,
+                    pi,
+                    scoring_partitions,
+                    D,
+                    self._swap_cache,
+                    E=E,
+                    W=E_W,
+                    alpha=E_alpha,
+                    reverse=reverse,
+                    canonical_data=canonical_data,
+                    adj=self._adj,
+                    cnot_cost=cnot_cost,
+                    path_tiebreak_weight=self.config.get(
+                        "path_tiebreak_weight", 0.2
+                    ),
+                    decay=decay,
+                    cached_neighbor_info=cached_neighbor_info,
+                    candidate_cache=candidate_cache,
+                    layout_partitions=optimized_partitions,
+                    return_transforms=True,
+                    three_qubit_exit_weight=self.config.get(
+                        "three_qubit_exit_weight", 1.0
+                    ),
+                )
+                scores[ci] = score
+                cached_swaps[ci] = swaps
+                cached_pi[ci] = output_perm
+
+            best_idx = self._boundary_beam_select_index(
+                partition_candidates,
+                scores,
+                cached_swaps,
+                cached_pi,
+                F_snapshot,
+                resolved_partitions,
+                DAG,
+                IDAG,
+                optimized_partitions,
+                scoring_partitions,
+                D,
+                candidate_cache,
+                canonical_data,
+                reverse=reverse,
+                W=E_W,
+                alpha=E_alpha,
+                cnot_cost=cnot_cost,
+                adj=self._adj,
+            )
+            # Commit the selected candidate: drop it from the front layer and
+            # adopt the layout that was computed for it during scoring.
+            best = partition_candidates[best_idx]
+            F.remove(best.partition_idx)
+            resolved_partitions[best.partition_idx] = True
+
+            swaps, pi = cached_swaps[best_idx], cached_pi[best_idx]
+            decay_factor = 1.0
+            if swaps:
+                decay_factor = self._decay_factor_for_swaps(swaps, decay)
+            total_cost += self._routing_objective(
+                len(swaps),
+                best.cnot_count,
+                cnot_cost,
+                decay_factor=decay_factor,
+            )
+            # Decay is applied after the cost was charged; a swap-free
+            # selection resets both the decay values and the streak counter.
+            if swaps:
+                self._apply_decay_for_swaps(swaps, decay)
+                swap_heavy_partitions += 1
+            else:
+                swap_heavy_partitions = 0
+                self._reset_decay(decay)
+
+            # Unlock children of the committed partition.  Single-qubit
+            # children are resolved immediately and their descendants are
+            # walked with an explicit stack; other children join F.
+            for child in DAG[best.partition_idx]:
+                if not resolved_partitions[child] and child not in F:
+                    if all(resolved_partitions[p] for p in IDAG[child]):
+                        if self._partition_is_single(optimized_partitions[child]):
+                            resolved_partitions[child] = True
+                            stack = deque(DAG[child])
+                            while stack:
+                                gc = stack.pop()
+                                if not resolved_partitions[gc] and gc not in F:
+                                    if all(
+                                        resolved_partitions[p]
+                                        for p in IDAG[gc]
+                                    ):
+                                        if self._partition_is_single(
+                                            optimized_partitions[gc]
+                                        ):
+                                            resolved_partitions[gc] = True
+                                            stack.extend(DAG[gc])
+                                        else:
+                                            F.append(gc)
+                        else:
+                            F.append(child)
+
+        return pi, total_cost
+    # ------------------------------------------------------------------------
+    # Circuit Construction
+    # ------------------------------------------------------------------------
+
+    def Construct_circuit_from_HS(self, partition_order, optimized_partitions, N):
+        """Assemble the final circuit and parameter vector from a routed order.
+
+        Args:
+            partition_order: Sequence produced by the heuristic search.  Each
+                item is either a bare ``Circuit`` (added as-is, contributing
+                no parameters), a ``SingleQubitPartitionResult`` (its
+                ``circuit`` and ``parameters`` are used directly), or any
+                other object exposing ``get_final_circuit``.
+            optimized_partitions: Forwarded to ``get_final_circuit``.
+            N: Qubit count of the assembled circuit; also forwarded to
+                ``get_final_circuit``.
+
+        Returns:
+            ``(final_circuit, final_parameters)`` where ``final_parameters``
+            is a flat 1-D array (empty when nothing contributed parameters).
+
+        A topology mismatch of the assembled circuit is only logged as an
+        error; the circuit is still returned.
+        """
+        final_circuit = Circuit(N)
+        parameter_chunks = []
+
+        for part in partition_order:
+            if isinstance(part, Circuit):
+                # Bare circuits contribute gates only; no parameters are
+                # collected for them.
+                final_circuit.add_Circuit(part)
+                continue
+            if isinstance(part, SingleQubitPartitionResult):
+                part_circ, part_parameters = part.circuit, part.parameters
+            else:
+                part_circ, part_parameters = part.get_final_circuit(
+                    optimized_partitions, N
+                )
+            final_circuit.add_Circuit(part_circ)
+            # Flatten immediately so scalars and nested arrays concatenate.
+            parameter_chunks.append(np.atleast_1d(part_parameters).ravel())
+
+        if parameter_chunks:
+            final_parameters = np.concatenate(parameter_chunks, axis=0)
+        else:
+            final_parameters = np.array([])
+        if not check_circuit_compatibility(final_circuit, self.topology):
+            logging.error("Final circuit is not compatible with device topology")
+        return final_circuit, final_parameters
+
+    # ------------------------------------------------------------------------
+    # Scoring
+    # ------------------------------------------------------------------------
+
+    def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
+        """Summarise every partition by its cheapest routing edge pattern.
+
+        Each (mini topology, permutation pair) variant of a partition is
+        translated into a sorted tuple of qubit pairs via the inverse of
+        ``partition.qubit_map``, using ``P_o`` when ``reverse`` is set and
+        ``P_i`` otherwise.  Per pattern the lowest CNOT count is kept, and the
+        overall winner is the pattern with the smallest
+        ``(cnot, number of edges, edges)`` key.
+
+        Returns:
+            dict mapping partition index to
+            ``{"edges_u": ..., "edges_v": ..., "cnot": ...}``; the edge
+            entries are ``np.intp`` arrays, or ``None`` for an empty pattern.
+            ``None`` partitions and partitions without variants are omitted.
+        """
+        canonical = {}
+        for part_idx, part in enumerate(scoring_partitions):
+            if part is None:
+                continue
+            to_qubit = {local: qubit for qubit, local in part.qubit_map.items()}
+            cheapest = {}
+            for topo_idx, topo in enumerate(part.mini_topologies):
+                pairs = part.permutations_pairs[topo_idx]
+                for pair_idx, (perm_in, perm_out) in enumerate(pairs):
+                    cost = part.cnot_counts[topo_idx][pair_idx]
+                    route = perm_out if reverse else perm_in
+                    pattern = tuple()
+                    if topo:
+                        pattern = tuple(sorted(
+                            tuple(sorted((to_qubit[route[a]], to_qubit[route[b]])))
+                            for a, b in topo
+                        ))
+                    known = cheapest.get(pattern)
+                    if known is None or cost < known:
+                        cheapest[pattern] = cost
+            if not cheapest:
+                continue
+
+            def _rank(entry):
+                # Cheapest first, then fewest edges, then lexicographic.
+                pattern, cost = entry
+                return (cost, len(pattern), pattern)
+
+            best_pattern, best_cost = min(cheapest.items(), key=_rank)
+            edges_u = edges_v = None
+            if best_pattern:
+                edges_u = np.array([a for a, _ in best_pattern], dtype=np.intp)
+                edges_v = np.array([b for _, b in best_pattern], dtype=np.intp)
+            canonical[part_idx] = {
+                "edges_u": edges_u,
+                "edges_v": edges_v,
+                "cnot": best_cost,
+            }
+        return canonical
+
+    @staticmethod
+    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
+                                  E=None, W=0.5, alpha=0.9, reverse=False,
+                                  canonical_data=None, adj=None,
+                                  cnot_cost=1.0 / 3.0,
+                                  path_tiebreak_weight=0.2, decay=None,
+                                  cached_neighbor_info=None,
+                                  candidate_cache=None,
+                                  layout_partitions=None,
+                                  return_transforms=False,
+                                  three_qubit_exit_weight=1.0):
+        """Score one candidate, LightSABRE-style (arXiv:2409.08368, eq. 1).
+
+        The score is the routing objective of the candidate itself (number
+        of swaps, its CNOT count weighted by ``cnot_cost``, and a decay
+        factor when ``decay`` is given and swaps are needed).  When a
+        ``candidate_cache`` is supplied, the future-context cost over the
+        front layer ``F`` and extended set ``E`` is added on top; for
+        candidates touching three or more qubits that future term is first
+        multiplied by ``three_qubit_exit_weight``.
+
+        ``scoring_partitions`` is accepted for signature compatibility but is
+        not read here.  ``cached_neighbor_info`` short-circuits the neighbor
+        info construction when the caller already has it.
+
+        Returns:
+            ``score`` alone, or ``(score, swaps, output_perm)`` when
+            ``return_transforms`` is true.
+        """
+        owner = qgd_Partition_Aware_Mapping
+
+        neighbor_info = cached_neighbor_info
+        if neighbor_info is None:
+            neighbor_info = owner._build_neighbor_info(
+                partition_candidate.partition_idx,
+                F,
+                E,
+                pi,
+                canonical_data,
+                weight=path_tiebreak_weight,
+                W=W,
+                alpha=alpha,
+                layout_partitions=layout_partitions,
+            )
+
+        swaps, output_perm = partition_candidate.transform_pi(
+            pi,
+            D,
+            swap_cache,
+            reverse=reverse,
+            adj=adj,
+            neighbor_info=neighbor_info,
+        )
+
+        if decay is not None and swaps:
+            decay_factor = owner._decay_factor_for_swaps(swaps, decay)
+        else:
+            decay_factor = 1.0
+        score = owner._routing_objective(
+            len(swaps),
+            partition_candidate.cnot_count,
+            cnot_cost,
+            decay_factor=decay_factor,
+        )
+
+        # The look-ahead term is only available with a candidate cache.
+        if candidate_cache is not None:
+            future_score = owner._future_context_cost(
+                partition_candidate.partition_idx,
+                output_perm,
+                F,
+                E,
+                D,
+                candidate_cache,
+                reverse=reverse,
+                cnot_cost=cnot_cost,
+                W=W,
+                alpha=alpha,
+                layout_partitions=layout_partitions,
+                canonical_data=canonical_data,
+            )
+            if len(partition_candidate.involved_qbits) >= 3:
+                future_score *= three_qubit_exit_weight
+            score += future_score
+
+        if return_transforms:
+            return score, swaps, output_perm
+        return score
+
+    # ------------------------------------------------------------------------
+    # Extended Set
+    # ------------------------------------------------------------------------
+
+    @staticmethod
+    def generate_extended_set(
+        F,
+        DAG,
+        IDAG,
+        resolved_partitions,
+        optimized_partitions,
+        max_E_size=20,
+        max_lookahead=4,
+    ):
+        """Collect the SABRE-style extended set behind the front layer.
+
+        Starting from the children of every front-layer partition, a
+        breadth-first walk gathers unresolved multi-qubit partitions whose
+        parents are all either resolved or in ``F``.  Single-qubit
+        partitions are skipped transparently: their children are explored
+        at the same depth.  The walk stops once ``max_E_size`` entries were
+        collected and ignores nodes deeper than ``max_lookahead``.
+
+        Returns:
+            list of ``(partition_idx, depth)`` tuples in discovery order.
+        """
+        # NOTE(review): a child of an entry already placed in the extended
+        # set has that entry as an unresolved, non-front parent, so the
+        # parent check below appears to reject every depth > 1 node --
+        # confirm whether `max_lookahead` is meant to have an effect.
+        front = set(F)
+        extended = []
+        collected = set()
+
+        for root in F:
+            if len(extended) >= max_E_size:
+                break
+
+            pending = deque((succ, 1) for succ in DAG[root])
+            while pending and len(extended) < max_E_size:
+                node, level = pending.popleft()
+                if level > max_lookahead or node in collected or node in front:
+                    continue
+                if resolved_partitions[node]:
+                    continue
+                blocked = any(
+                    not resolved_partitions[parent] and parent not in front
+                    for parent in IDAG[node]
+                )
+                if blocked:
+                    continue
+
+                if qgd_Partition_Aware_Mapping._partition_is_single(
+                    optimized_partitions[node]
+                ):
+                    # Look through single-qubit partitions without
+                    # spending a lookahead level on them.
+                    pending.extend((succ, level) for succ in DAG[node])
+                    continue
+
+                extended.append((node, level))
+                collected.add(node)
+                if level < max_lookahead:
+                    pending.extend((succ, level + 1) for succ in DAG[node])
+
+        return extended
+
+    # ------------------------------------------------------------------------
+    # Candidate Generation
+    # ------------------------------------------------------------------------
+
+    def obtain_partition_candidates(
+        self,
+        F,
+        optimized_partitions=None,
+        candidate_cache=None,
+    ):
+        """Return every routing candidate for the partitions in ``F``.
+
+        With a ``candidate_cache`` the cached (non-empty) candidate lists of
+        the front-layer partitions are simply concatenated in ``F`` order.
+        Without one, a ``PartitionCandidate`` is built for each combination
+        of mini topology, topology placement and permutation pair of each
+        partition in ``optimized_partitions``.
+        """
+        if candidate_cache is not None:
+            return [
+                candidate
+                for front_idx in F
+                for candidate in (candidate_cache[front_idx] or ())
+            ]
+
+        built = []
+        for front_idx in F:
+            part = optimized_partitions[front_idx]
+            for topo_idx, mini_topology in enumerate(part.mini_topologies):
+                # Partitions may provide their own placements; otherwise fall
+                # back to the mapper's cached sub-topology lookup.
+                if hasattr(part, 'get_topology_candidates'):
+                    placements = part.get_topology_candidates(topo_idx)
+                else:
+                    placements = self._get_subtopologies_of_type_cached(
+                        mini_topology
+                    )
+                for placement in placements:
+                    pairs = part.permutations_pairs[topo_idx]
+                    for pair_idx, pair in enumerate(pairs):
+                        candidate = PartitionCandidate(
+                            front_idx,
+                            topo_idx,
+                            pair_idx,
+                            part.circuit_structures[topo_idx][pair_idx],
+                            pair[0],
+                            pair[1],
+                            placement,
+                            mini_topology,
+                            part.qubit_map,
+                            part.involved_qbits,
+                            cnot_count=part.cnot_counts[topo_idx][pair_idx],
+                        )
+                        built.append(candidate)
+        return built
+
+    # ------------------------------------------------------------------------
+    # Graph Construction
+    # ------------------------------------------------------------------------
+
+    def get_initial_layer(self, IDAG, N, optimized_partitions):
+        """Return, in ascending order, the partitions that have no parents.
+
+        ``N`` and ``optimized_partitions`` are accepted but not used.
+        """
+        del N, optimized_partitions
+        return [idx for idx, parents in enumerate(IDAG) if not parents]
+
+
+    def get_final_layer(self, DAG, N, optimized_partitions):
+        """Return, in descending order, the partitions that have no children.
+
+        ``N`` and ``optimized_partitions`` are accepted but not used.
+        """
+        del N, optimized_partitions
+        return [idx for idx in reversed(range(len(DAG))) if not DAG[idx]]
+
+    def construct_DAG_and_IDAG(self, optimized_partitions):
+        """Build the forward (children) and inverse (parents) dependency lists.
+
+        Partition ``j`` is linked to partition ``i`` when, scanning away from
+        ``i``, ``j`` is the first partition that touches one of ``i``'s
+        still-unclaimed qubits.  Scanning forward yields children, scanning
+        backward yields parents (nearest first).
+
+        Returns:
+            ``(DAG, IDAG)``: two lists of index lists, one entry per partition.
+        """
+        count = len(optimized_partitions)
+
+        def _nearest_sharing(idx, scan_order):
+            # An empty scan (first/last partition) has nothing to link.
+            if not scan_order:
+                return []
+            unclaimed = optimized_partitions[idx].involved_qbits.copy()
+            linked = []
+            for other in scan_order:
+                other_qbits = optimized_partitions[other].involved_qbits
+                shared = [q for q in unclaimed if q in other_qbits]
+                if shared:
+                    linked.append(other)
+                    # A qubit is claimed by the closest partition using it.
+                    for q in shared:
+                        unclaimed.remove(q)
+                if not unclaimed:
+                    break
+            return linked
+
+        DAG = [_nearest_sharing(idx, range(idx + 1, count)) for idx in range(count)]
+        IDAG = [_nearest_sharing(idx, range(idx - 1, -1, -1)) for idx in range(count)]
+        return DAG, IDAG
+
+    # ------------------------------------------------------------------------
+    # Distance & Layout
+    # ------------------------------------------------------------------------
+
+    def compute_distances_bfs(self, N):
+        """Compute all-pairs hop distances with one BFS per start vertex.
+
+        The coupling edges are read from ``self.config['topology']`` and
+        treated as undirected.  Unreachable pairs keep the distance ``inf``.
+
+        Side effect:
+            stores the adjacency lists as ``self._adj`` (one list per vertex
+            ``0..N-1``) so routing code can reuse them.
+
+        Returns:
+            ``(N, N)`` float array of shortest-path lengths.
+        """
+        neighbours = defaultdict(list)
+        for a, b in self.config['topology']:
+            neighbours[a].append(b)
+            neighbours[b].append(a)
+
+        D = np.full((N, N), np.inf)
+        for source in range(N):
+            row = D[source]  # view: writes land directly in D
+            row[source] = 0
+            reached = {source}
+            frontier = deque([source])
+            while frontier:
+                current = frontier.popleft()
+                for nxt in neighbours[current]:
+                    if nxt in reached:
+                        continue
+                    reached.add(nxt)
+                    row[nxt] = row[current] + 1
+                    frontier.append(nxt)
+
+        # Keep the adjacency lists around for the A* router.
+        self._adj = [list(neighbours[vertex]) for vertex in range(N)]
+
+        return D
+
+    def _compute_seeded_layout(self, optimized_partitions, D, N, circ):
+        """Seed the initial layout: VF2Layout, SabrePreLayout, then greedy.
+
+        The interaction graph is gate-level: one undirected edge per qubit
+        pair touched by a gate whose type name contains ``CNOT`` or ``CX``.
+        Partition data is only consulted by the greedy fallback.
+
+        Order of attempts:
+        1. Embed the interaction graph into the hardware topology (VF2).
+        2. Retry against the topology augmented with an edge between every
+           pair of physical qubits at distance <= 2 in ``D``.
+        3. Fall back to ``_greedy_seeded_layout`` (also used when
+           ``rustworkx`` cannot be imported).
+
+        The identity layout ``np.arange(N)`` is returned when there is no
+        topology or the circuit has no matching two-qubit gates.
+        """
+        if not self.topology:
+            return np.arange(N)
+
+        # Unordered qubit pairs touched by CNOT/CX gates.
+        interacting_pairs = set()
+        for gate in circ.get_Gates():
+            type_name = str(type(gate).__name__)
+            if not ('CNOT' in type_name or 'CX' in type_name):
+                continue
+            control = gate.get_Control_Qbit()
+            target = gate.get_Target_Qbit()
+            interacting_pairs.add((min(control, target), max(control, target)))
+
+        if not interacting_pairs:
+            return np.arange(N)
+
+        try:
+            import rustworkx as rx
+        except ImportError:
+            return self._greedy_seeded_layout(optimized_partitions, D, N)
+
+        def _graph_on_n_nodes(edge_list):
+            # N nodes labelled 0..N-1 plus the given undirected edges.
+            graph = rx.PyGraph()
+            graph.add_nodes_from(range(N))
+            for a, b in edge_list:
+                graph.add_edge(a, b, None)
+            return graph
+
+        interaction_graph = _graph_on_n_nodes(interacting_pairs)
+        hardware_graph = _graph_on_n_nodes(self.topology)
+
+        # Attempt 1: exact embedding into the real coupling graph.
+        layout = self._try_vf2_layout(interaction_graph, hardware_graph, N)
+        if layout is not None:
+            return layout
+
+        # Attempt 2: relax the hardware graph with distance-2 shortcuts,
+        # appended after the genuine topology edges.
+        present = {(min(a, b), max(a, b)) for a, b in self.topology}
+        relaxed_edges = list(self.topology)
+        for i in range(N):
+            for j in range(i + 1, N):
+                if (i, j) not in present and D[i][j] <= 2:
+                    relaxed_edges.append((i, j))
+
+        layout = self._try_vf2_layout(
+            interaction_graph, _graph_on_n_nodes(relaxed_edges), N
+        )
+        if layout is not None:
+            return layout
+
+        # Attempt 3: greedy placement driven by partition-level weights.
+        return self._greedy_seeded_layout(optimized_partitions, D, N)
+
+    def _try_vf2_layout(self, G_int, G_hw, N):
+        """Embed ``G_int`` into ``G_hw`` with VF2 and turn it into a layout.
+
+        The search is a non-induced subgraph match (``induced=False``), so
+        hardware edges without a counterpart in the interaction graph are
+        allowed.  Only the first mapping produced by the iterator is used.
+
+        Returns:
+            integer array ``pi`` with ``pi[logical] = physical``, or ``None``
+            when the iterator yields no mapping.  Logical qubits missing
+            from the mapping are assigned the unused physical qubits in
+            ascending order.
+        """
+        import rustworkx as rx
+
+        try:
+            candidates = rx.vf2_mapping(G_hw, G_int, subgraph=True, induced=False)
+            hw_to_logical = next(candidates)  # keys: hardware, values: logical
+        except StopIteration:
+            return None
+
+        logical_to_hw = {logical: hw for hw, logical in hw_to_logical.items()}
+        occupied = set(logical_to_hw.values())
+        spare = [hw for hw in range(N) if hw not in occupied]
+
+        layout = np.zeros(N, dtype=int)
+        next_spare = 0
+        for logical in range(N):
+            hw = logical_to_hw.get(logical)
+            if hw is None:
+                # Unmapped logical qubit: take the next unused physical one.
+                hw = spare[next_spare]
+                next_spare += 1
+            layout[logical] = hw
+        return layout
+
+    def _greedy_seeded_layout(self, optimized_partitions, D, N):
+        """Greedy weighted-distance placement (fallback when VF2 fails).
+
+        Every pair of qubits sharing a ``PartitionSynthesisResult`` receives
+        an interaction weight equal to that partition's smallest CNOT count
+        (summed over partitions).  The heaviest pair is pinned onto the
+        first topology edge; the remaining logical qubits are then placed
+        one at a time on the free physical qubit with the lowest
+        weight-averaged distance to their already placed partners.
+
+        Args:
+            optimized_partitions: Partition results; single-qubit results
+                and anything that is not a ``PartitionSynthesisResult`` are
+                ignored.
+            D: Distance matrix indexed as ``D[physical][physical]``.
+            N: Number of qubits; the layout has ``N`` entries.
+
+        Returns:
+            Array ``pi`` with ``pi[logical] = physical``; the identity
+            layout when no interaction weight could be collected.
+        """
+        # Build interaction weights from partitions
+        interaction_weight = defaultdict(float)
+        for partition in optimized_partitions:
+            if isinstance(partition, SingleQubitPartitionResult):
+                continue
+            if not isinstance(partition, PartitionSynthesisResult):
+                continue
+            involved = list(partition.involved_qbits)
+            if len(involved) < 2:
+                continue
+            # Smallest CNOT count over all non-empty topology variants.
+            best_cnot = float('inf')
+            for tdx in range(len(partition.cnot_counts)):
+                if not partition.cnot_counts[tdx]:
+                    continue
+                cnot_min = min(partition.cnot_counts[tdx])
+                if cnot_min < best_cnot:
+                    best_cnot = cnot_min
+            # No variant carried a CNOT count: nothing to contribute.
+            if best_cnot == float('inf'):
+                continue
+            # Credit every qubit pair of the partition; keys are (low, high).
+            for i in range(len(involved)):
+                for j in range(i + 1, len(involved)):
+                    key = (min(involved[i], involved[j]),
+                           max(involved[i], involved[j]))
+                    interaction_weight[key] += best_cnot
+
+        if not interaction_weight:
+            return np.arange(N)
+
+        # Start from the identity and refine it through pairwise exchanges,
+        # which keeps `pi` a permutation at every step.
+        pi = np.arange(N)
+        placed_logical = set()
+        placed_physical = set()
+
+        # Seed: the heaviest pair goes onto the first edge of the topology.
+        (q1, q2), _ = max(interaction_weight.items(), key=lambda x: x[1])
+        p1, p2 = self.topology[0]
+
+        # `holder` is the logical qubit currently sitting on the wanted
+        # physical qubit; it receives the mover's previous position.
+        holder1 = np.where(pi == p1)[0][0]
+        pi[q1], pi[holder1] = p1, pi[q1]
+        holder2 = np.where(pi == p2)[0][0]
+        pi[q2], pi[holder2] = p2, pi[q2]
+        placed_logical.update([q1, q2])
+        placed_physical.update([p1, p2])
+
+        remaining = [q for q in range(N) if q not in placed_logical]
+
+        def _score(q):
+            # Total interaction weight between q and the placed qubits.
+            return sum(
+                interaction_weight.get((min(q, pq), max(q, pq)), 0.0)
+                for pq in placed_logical
+            )
+
+        # The order is fixed here, while only the seed pair is placed, so
+        # it reflects each qubit's coupling to q1/q2 only.
+        remaining.sort(key=_score, reverse=True)
+
+        for logical_q in remaining:
+            best_physical = None
+            best_dist = float('inf')
+
+            for physical_q in range(N):
+                if physical_q in placed_physical:
+                    continue
+
+                total_dist = 0.0
+                total_w = 0.0
+                for other_q in placed_logical:
+                    key = (min(logical_q, other_q), max(logical_q, other_q))
+                    w = interaction_weight.get(key, 0.0)
+                    if w > 0:
+                        total_dist += D[physical_q][pi[other_q]] * w
+                        total_w += w
+
+                # Without any placed partner the average is 0.0, so the
+                # first free physical qubit wins through the strict `<`.
+                avg = total_dist / total_w if total_w > 0 else 0.0
+                if avg < best_dist:
+                    best_dist = avg
+                    best_physical = physical_q
+
+            if best_physical is not None:
+                # Exchange positions with whoever holds the chosen spot.
+                holder = np.where(pi == best_physical)[0][0]
+                pi[logical_q], pi[holder] = best_physical, pi[logical_q]
+                placed_logical.add(logical_q)
+                placed_physical.add(best_physical)
+
+        return pi
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
new file mode 100644
index 000000000..69c2c9732
--- /dev/null
+++ b/squander/synthesis/PartAM_utils.py
@@ -0,0 +1,688 @@
+import heapq
+import logging
+from collections import defaultdict
+from dataclasses import dataclass
+from itertools import combinations, permutations
+from typing import Dict, FrozenSet, List, Set, Tuple
+
+import numpy as np
+
+from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+
+
+# ============================================================================
+# SWAP Routing Algorithms
+# ============================================================================
+def _neighbor_signature(neighbor_info):
+    """Stable hash-friendly signature of an active neighbor_info.
+
+    Returns None when the neighbor heuristic is inactive (no info, zero
+    weight, or empty edge list) — callers treat all such calls as cache-
+    compatible.  Otherwise returns a tuple of (sorted edges as
+    (min(u,v), max(u,v), weight), initial_pos tuple, rounded weight).
+    """
+    if neighbor_info is None:
+        return None
+    weight = neighbor_info.get('weight', 0.0)  # a missing 'weight' key counts as 0.0 (inactive)
+    edges = neighbor_info.get('edges') or ()  # a missing or None 'edges' entry becomes an empty tuple
+    if weight == 0.0 or not edges:
+        return None
+    canonical_edges = tuple(sorted(
+        (min(int(u), int(v)), max(int(u), int(v)), float(w))  # endpoint order normalised per edge
+        for u, v, w in edges
+    ))
+    initial_pos = tuple(int(p) for p in neighbor_info.get('initial_pos', ()))  # plain ints in a hashable tuple
+    return (canonical_edges, initial_pos, round(float(weight), 6))  # 'neighbor_vqs' is not part of the key
+
+
+def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix, adj=None, neighbor_info=None):
+    """
+    Route partition qubits to their target physical positions using A* over
+    the k-dimensional state space of partition qubit positions only.
+
+    For k partition qubits on an n-node topology there are at most n^k states
+    (n*(n-1)*...*(n-k+1) distinct ones); for the typical k=2 or k=3 and n≤20 that is
+    ≤6840 states, so the search stays cheap while still minimising the SWAP count.
+
+    The original full-state A* had O(n!) state space which was exponentially slow.
+    The naive greedy replacement oscillated when two adjacent partition qubits needed
+    to move in the same direction.  This implementation avoids both problems.
+
+    Args:
+        pi_A        : List[int], pi_A[q] = current physical position of virtual qubit q.
+        pi_B_dict   : Dict {q: target_physical} for the qubits that need routing.
+        dist_matrix : n×n distance/cost matrix; dist[i][j]==1 means i and j are adjacent.
+        adj         : Optional adjacency list (adj[p] = neighbours of p); derived from dist_matrix when None.
+        neighbor_info: Optional dict with 'neighbor_vqs', 'edges' [(idx_u, idx_v, w)], 'weight', 'initial_pos'.
+
+    Returns:
+        swaps            : List of (P1, P2) adjacent-qubit SWAP operations (optimal).
+        final_permutation: Updated virtual→physical mapping after all SWAPs ([] and a copy of pi_A on failure).
+    """
+    n = len(pi_A)
+
+    # Build adjacency list from dist_matrix if not provided
+    if adj is None:
+        adj = [[] for _ in range(n)]
+        for i in range(n):
+            for j in range(i + 1, n):
+                if dist_matrix[i][j] == 1:
+                    adj[i].append(j)
+                    adj[j].append(i)
+
+    partition_qubits = sorted(pi_B_dict.keys())
+    k = len(partition_qubits)
+
+    initial_positions = tuple(int(pi_A[q]) for q in partition_qubits)
+    target_positions  = tuple(int(pi_B_dict[q]) for q in partition_qubits)
+
+    if initial_positions == target_positions:
+        return [], list(pi_A)
+
+    def heuristic(positions):
+        # Admissible lower bound: sum of individual distances / 2
+        return sum(dist_matrix[positions[i]][target_positions[i]] for i in range(k)) / 2
+
+    # SABRE-aware tiebreaker: prefer SWAP paths that keep future-partition qubits closer
+    # together.  The weight is small enough to only break ties among equal-length paths.
+    # Active only for a non-empty edge list and non-zero weight (same rule as _neighbor_signature).
+    if neighbor_info is not None and neighbor_info.get('edges') and neighbor_info.get('weight', 0.0):
+        n_vqs = neighbor_info['neighbor_vqs']
+        n_edges = neighbor_info['edges']       # list of (idx_u, idx_v, edge_weight)
+        n_weight = neighbor_info['weight']
+        initial_n_pos = tuple(int(p) for p in neighbor_info['initial_pos'])  # hashable: used inside dict keys
+        # Number of tracked neighbor qubits (the physical→index reverse map is rebuilt per expanded state)
+        _n_len = len(n_vqs)
+        use_neighbor = True
+
+        # Normalize so neighbor_heuristic returns values in [0, 1].
+        # This guarantees n_weight * neighbor_heuristic < 1 (for n_weight < 1),
+        # so the tiebreaker never overrides SWAP-count optimality.
+        _total_edge_weight = sum(w for _, _, w in n_edges)
+        _diameter = int(np.max(dist_matrix[dist_matrix < np.inf])) if n > 1 else 1
+        _norm = max(1.0, _total_edge_weight * _diameter)
+
+        def neighbor_heuristic(n_pos):
+            return sum(w * dist_matrix[n_pos[i]][n_pos[j]] for i, j, w in n_edges) / _norm
+    else:
+        initial_n_pos = ()
+        n_weight = 0.0
+        _n_len = 0
+        use_neighbor = False
+
+        def neighbor_heuristic(n_pos):
+            return 0.0
+
+    # A* over k-dimensional state space.
+    # Each state is a tuple of physical positions, one per partition qubit.
+    # Paths are reconstructed via a parent-pointer dict to avoid copying lists
+    # on every heap push (which would be O(depth²) total).
+    counter = 0  # tiebreak counter so tuples never compare paths
+    # When the neighbor tie-breaker is active, the full search state must
+    # include the tracked future-qubit positions.  Otherwise two equal-length
+    # paths to the same partition positions but different bystander layouts
+    # collapse into one visited entry, defeating the downstream-layout signal.
+    initial_state = (
+        (initial_positions, initial_n_pos) if use_neighbor else initial_positions
+    )
+    parent = {}  # state_key → (parent_state_key, swap) for path reconstruction
+    parent[initial_state] = None
+
+    h0 = heuristic(initial_positions)
+    nh0 = n_weight * neighbor_heuristic(initial_n_pos) if use_neighbor else 0.0
+    heap = []
+    heapq.heappush(heap, (h0 + nh0, 0, counter, initial_positions, initial_n_pos))
+    visited = {initial_state: 0}
+
+    while heap:
+        f, g, _, positions, n_pos = heapq.heappop(heap)
+
+        state_key = (positions, n_pos) if use_neighbor else positions
+
+        if positions == target_positions:
+            # Reconstruct swap path via parent pointers
+            path = []
+            state = state_key
+            while parent[state] is not None:
+                prev_state, swap = parent[state]
+                path.append(swap)
+                state = prev_state
+            path.reverse()
+
+            # Replay swaps on the full mapping to get final virt→phys
+            final_v2p = list(pi_A)
+            final_p2v = [0] * n
+            for q_idx in range(n):
+                final_p2v[int(final_v2p[q_idx])] = q_idx
+            for P1, P2 in path:
+                q1, q2 = final_p2v[P1], final_p2v[P2]
+                final_p2v[P1], final_p2v[P2] = q2, q1
+                final_v2p[q1], final_v2p[q2] = P2, P1
+            return path, final_v2p
+
+        if visited.get(state_key, float('inf')) < g:
+            continue
+
+        # Quick lookup: physical position → index within partition_qubits list
+        pos_to_k_idx = {p: i for i, p in enumerate(positions)}
+
+        # Build reverse map for neighbor displacement tracking
+        if use_neighbor:
+            n_phys_to_idx = {n_pos[idx]: idx for idx in range(_n_len)}
+
+        # Expand: try every SWAP that moves at least one partition qubit
+        for i, p in enumerate(positions):
+            for nb in adj[p]:
+                new_positions = list(positions)
+                new_positions[i] = nb
+                # If the neighbor also holds a partition qubit, swap it too
+                if nb in pos_to_k_idx:
+                    j = pos_to_k_idx[nb]
+                    new_positions[j] = p
+                new_positions = tuple(new_positions)
+
+                new_g = g + 1
+                # When a partition qubit swaps into nb, a tracked neighbor at nb
+                # is displaced to p AND a tracked neighbor at p (if it overlaps
+                # with a partition qubit) moves to nb. Update both sides.
+                if use_neighbor:
+                    new_n_pos = list(n_pos)
+                    if nb in n_phys_to_idx:
+                        new_n_pos[n_phys_to_idx[nb]] = p
+                    if p in n_phys_to_idx:
+                        new_n_pos[n_phys_to_idx[p]] = nb
+                    new_n_pos = tuple(new_n_pos)
+                    new_nh = n_weight * neighbor_heuristic(new_n_pos)
+                else:
+                    new_n_pos = n_pos
+                    new_nh = 0.0
+
+                new_state_key = (
+                    (new_positions, new_n_pos) if use_neighbor else new_positions
+                )
+                if visited.get(new_state_key, float('inf')) <= new_g:
+                    continue
+
+                visited[new_state_key] = new_g
+                swap_key = (min(p, nb), max(p, nb))
+                parent[new_state_key] = (state_key, swap_key)
+                counter += 1
+                heapq.heappush(heap, (new_g + heuristic(new_positions) + new_nh,
+                                      new_g, counter, new_positions, new_n_pos))
+
+    logging.warning(
+        "find_constrained_swaps_partial: failed to route %s → %s",
+        initial_positions, target_positions,
+    )
+    return [], list(pi_A)
+
+
+# ============================================================================
+# Topology Utilities
+# ============================================================================
+
+def _get_induced_edges(edges: List[Tuple[int, int]], qubit_subset: Set[int]) -> List[Tuple[int, int]]:
+    return [edge for edge in edges if edge[0] in qubit_subset and edge[1] in qubit_subset]  # keep only edges with both endpoints in qubit_subset (original edge objects and order preserved)
+
+def _is_connected(nodes: Set[int], edges: List[Tuple[int, int]]) -> bool:  # True when the graph restricted to *nodes* is connected
+    if len(nodes) <= 1:  # empty and single-node sets count as connected
+        return True
+    adj = defaultdict(set)
+    for u, v in edges:
+        if u in nodes and v in nodes:  # edges leaving the node set are ignored
+            adj[u].add(v)
+            adj[v].add(u)
+    start = next(iter(nodes))  # any node works as the traversal root
+    visited = {start}
+    stack = [start]
+    while stack:  # iterative traversal using an explicit stack
+        node = stack.pop()
+        for neighbor in adj[node]:
+            if neighbor not in visited:
+                visited.add(neighbor)
+                stack.append(neighbor)
+    return visited == nodes  # connected iff every node was reached (nodes must be a set for this comparison)
+
+def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:  # labeling-independent edge set used as an isomorphism key
+    qubits = sorted(qubit_subset)
+    n = len(qubits)
+    best_edges = None
+    for perm in permutations(range(n)):  # brute force over all n! relabelings onto 0..n-1
+        mapping = {qubits[i]: perm[i] for i in range(n)}  # every edge endpoint must be in qubit_subset, else KeyError below
+        relabeled = tuple(sorted([tuple(sorted([mapping[u], mapping[v]])) for u, v in induced_edges]))
+        if best_edges is None or relabeled < best_edges:  # keep the lexicographically smallest sorted edge tuple
+            best_edges = relabeled
+    return frozenset(best_edges)  # an edgeless input yields an empty frozenset
+
+def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
+    """Return one representative locally-labeled (0..k-1) edge list per unique k-node
+    connected subgraph isomorphism class found in the graph defined by *edges*."""
+    if k <= 0:
+        return []
+    if k == 1:
+        return [[]]  # a single node has exactly one topology: no edges
+    nodes = set()
+    for u, v in edges:
+        nodes.add(u)
+        nodes.add(v)
+    nodes = sorted(nodes)  # only nodes that appear in at least one edge are considered
+    if len(nodes) < k:
+        return []
+    canonical_forms = {}  # canonical edge set -> representative edge list
+    for subset in combinations(nodes, k):  # every k-node subset is tested
+        subset_set = set(subset)
+        induced = _get_induced_edges(edges, subset_set)
+        if not _is_connected(subset_set, induced):
+            continue
+        canonical = get_canonical_form(subset_set, induced)
+        if canonical not in canonical_forms:
+            # Store locally-labeled edges (0..k-1) so the decomposer always
+            # receives a valid k-qubit topology regardless of global qubit indices.
+            canonical_forms[canonical] = sorted(canonical)
+    return list(canonical_forms.values())  # one entry per isomorphism class, in first-seen order
+
+def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]:
+    """Return all connected k-node subgraphs of *edges* that are isomorphic to
+    *target_topology*, each expressed with the original global qubit labels
+    (needed for physical routing decisions)."""
+    target_qubits = set()
+    for u, v in target_topology:
+        target_qubits.add(u)
+        target_qubits.add(v)
+    k = len(target_qubits) if target_qubits else 1  # an edgeless target is treated as a one-qubit topology
+    if k <= 0:  # cannot trigger: k is at least 1 after the previous line
+        return []
+    nodes = set()
+    for u, v in edges:
+        nodes.add(u)
+        nodes.add(v)
+    if k == 1:
+        return [[] for _ in nodes]  # one empty edge list per node that appears in *edges*
+    nodes = sorted(nodes)
+    if len(nodes) < k:
+        return []
+    target_canonical = get_canonical_form(target_qubits, target_topology)  # isomorphism key of the requested shape
+    matches = []
+    for subset in combinations(nodes, k):  # every k-node subset is tested
+        subset_set = set(subset)
+        induced = _get_induced_edges(edges, subset_set)
+        if not _is_connected(subset_set, induced):
+            continue
+        canonical = get_canonical_form(subset_set, induced)
+        if canonical == target_canonical:
+            matches.append(induced)  # global labels retained for routing
+    return matches
+
+_node_mapping_cache = {}  # (edges of topology1, edges of topology2) -> mapping shared by all callers
+
+def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int, int]]) -> dict:
+    """Find a node relabeling that turns *topology1* into *topology2*.
+
+    Brute-forces every bijection between the two node sets and returns the first
+    one (as a dict {node_of_topology1: node_of_topology2}) whose image of
+    topology1's undirected edge set equals topology2's.  Returns {} when the
+    node counts differ or no such bijection exists.  Results are memoised on the
+    exact edge sequences; the returned dict is the cached object, so callers
+    must treat it as read-only.
+    """
+    # Key on the edge sequences exactly as given (edge order and direction matter for the key only).
+    cache_key = (tuple(tuple(e) for e in topology1), tuple(tuple(e) for e in topology2))
+    cached = _node_mapping_cache.get(cache_key)
+    if cached is not None:
+        return cached
+
+    sorted_qubits1 = sorted({q for u, v in topology1 for q in (u, v)})
+    sorted_qubits2 = sorted({q for u, v in topology2 for q in (u, v)})
+    n = len(sorted_qubits1)
+    mapping = {}
+    if n == len(sorted_qubits2):
+        # Loop-invariant: topology2's undirected edge set is built once, not once per permutation.
+        target_edges = {tuple(sorted((u, v))) for u, v in topology2}
+        for perm in permutations(range(n)):
+            candidate = {sorted_qubits1[i]: sorted_qubits2[perm[i]] for i in range(n)}
+            if {tuple(sorted((candidate[u], candidate[v]))) for u, v in topology1} == target_edges:
+                mapping = candidate
+                break
+
+    _node_mapping_cache[cache_key] = mapping
+    return mapping
+
+
+def compute_automorphisms(mini_topology: List[Tuple[int, int]]) -> List[Tuple[int, ...]]:
+    """Compute all automorphisms of a locally-labeled mini_topology (nodes 0..N-1).
+
+    An automorphism is a permutation sigma of {0,...,N-1} that preserves the
+    undirected edge set.  For N<=4 (typical partition size) brute-forcing all
+    N! permutations is at most 24 checks.
+
+    Returns:
+        List of permutation tuples. Always includes the identity as the first
+        element.
+    """
+    nodes = set()
+    for u, v in mini_topology:
+        nodes.add(u)
+        nodes.add(v)
+    if not nodes:
+        return [()]  # edgeless topology: only the empty permutation
+    N = max(nodes) + 1  # labels below the maximum that appear in no edge are still permuted
+    edge_set = set()
+    for u, v in mini_topology:
+        edge_set.add((min(u, v), max(u, v)))  # undirected: store each edge as (low, high)
+
+    automorphisms = []
+    for perm in permutations(range(N)):  # the identity is generated first
+        mapped = set()
+        for u, v in mini_topology:
+            mapped.add((min(perm[u], perm[v]), max(perm[u], perm[v])))
+        if mapped == edge_set:  # perm maps the edge set onto itself
+            automorphisms.append(perm)
+    return automorphisms
+
+
+def derive_result_from_automorphism(sigma, P_i, P_o, circuit, parameters, N):
+    """Derive an equivalent decomposition result from a topology automorphism.
+
+    Given that C(theta) approximates P_o . U . P_i on topology T, the circuit
+    sigma(C)(theta) approximates (sigma . P_o) . U . (P_i . sigma^-1) on T
+    (since sigma preserves T).
+
+    Returns:
+        (new_P_i, new_P_o, new_circuit, parameters)
+        Parameters are returned as-is (identical values, different qubit labels).
+    """
+    sigma_inv = [0] * N
+    for i in range(N):
+        sigma_inv[sigma[i]] = i  # invert the permutation: sigma_inv[sigma[i]] == i
+
+    new_P_i = tuple(P_i[sigma_inv[j]] for j in range(N))  # P_i composed with sigma^-1
+    new_P_o = tuple(sigma[P_o[j]] for j in range(N))  # sigma applied after P_o
+
+    remap = {i: sigma[i] for i in range(N)}  # qubit i of the circuit is relabeled to sigma[i]
+    new_circuit = circuit.Remap_Qbits(remap, N)  # presumably returns a relabeled copy — confirm against qgd_Circuit
+
+    return new_P_i, new_P_o, new_circuit, parameters  # parameters is the same object that was passed in
+
+
+# ============================================================================
+# Data Classes
+# ============================================================================
+
+class SingleQubitPartitionResult:  # plain container for a partition's circuit, its parameters and the qubits it touches
+
+    def __init__(self, circuit_in, parameters_in, original_qubits=None):
+        self.circuit = circuit_in
+        self.parameters = parameters_in
+        self.involved_qbits = original_qubits if original_qubits is not None else circuit_in.get_Qbits()  # falls back to the circuit's own qubit list when none is given
+
+# Virtual qubits q, reduced virtual qubits (the remapped circuit only up to partition_size) q*
+# Physical qubits Q, reduced physical qubits Q*
+class PartitionSynthesisResult:
+    """Synthesis results of one partition, stored per candidate mini topology (indexed by topology_idx)."""
+    def __init__(self, N, mini_topologies, involved_qbits, qubit_map, topology=None, topology_cache=None):
+        # Physical mini_topology of the partition q*
+        self.mini_topologies = mini_topologies
+        # Qubit num of the partition
+        self.N = N
+        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc
+        # P_o in Q*->q* permutation pattern [Q*1 Q*0 Q*2] This means that the current output of Q*1 is equal to q*0
+        self.permutations_pairs = [[] for _ in range(len(mini_topologies))]
+        # Synthesis results
+        self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
+        self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
+        self.cnot_counts = [[] for _ in range(len(mini_topologies))]
+        self.circuit_structures = [[] for _ in range(len(mini_topologies))]
+        # Involved q qubits on the circuit
+        self.involved_qbits = involved_qbits
+        # {q:q*}
+        self.qubit_map = qubit_map
+        # Lazy per-topology candidate cache
+        self._topology_candidates = [None] * len(mini_topologies)
+        self._topology = topology
+        self._topology_cache = topology_cache
+
+    def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
+        from squander.utils import circuit_to_CNOT_basis
+
+        flat_circuit = synthesised_circuit.get_Flat_Circuit()
+        flat_circuit, synthesised_parameters = circuit_to_CNOT_basis(
+            flat_circuit,
+            np.asarray(synthesised_parameters),
+        )
+        unsupported_multi = [
+            gate.get_Name()
+            for gate in flat_circuit.get_Gates()
+            if len(gate.get_Involved_Qbits()) > 1
+            and gate.get_Name() != "CNOT"
+        ]
+        if unsupported_multi:
+            raise ValueError(
+                "Partition synthesis produced non-CNOT multi-qubit gates "
+                f"after CNOT-basis conversion: {unsupported_multi}"
+            )
+        self.permutations_pairs[topology_idx].append(permutations_pair)
+        self.synthesised_circuits[topology_idx].append(flat_circuit)
+        self.synthesised_parameters[topology_idx].append(synthesised_parameters)
+        self.cnot_counts[topology_idx].append(flat_circuit.get_Gate_Nums().get('CNOT', 0))
+        self.circuit_structures[topology_idx].append(self.extract_circuit_structure(flat_circuit))
+
+    def extract_circuit_structure(self, circuit):
+        circuit_structure = []
+        for gate in circuit.get_Gates():
+            if gate.get_Name() == "Permutation":
+                continue
+            involved_qbits = gate.get_Involved_Qbits()
+            if len(involved_qbits) != 1:  # single-qubit gates do not contribute to the structure
+                circuit_structure.append(involved_qbits)
+        return circuit_structure
+
+    def get_best_result(self, topology_idx):
+        best_index = np.argmin(self.cnot_counts[topology_idx])  # first minimum wins; ValueError if no result was added
+        return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
+
+    def get_top_k_results(self, topology_idx, k):  # up to k distinct P_i permutations, lowest CNOT count first
+        counts = self.cnot_counts[topology_idx]
+        pairs = self.permutations_pairs[topology_idx]
+        if not counts or k <= 0:  # k <= 0 used to leak one element through the post-append length check
+            return []
+        # Stable sort: equal CNOT counts keep insertion order, the same tie rule as get_best_result.
+        seen_pi = set()
+        result = []
+        for i in np.argsort(counts, kind="stable"):
+            pi_key = tuple(pairs[i][0])
+            if pi_key not in seen_pi:
+                seen_pi.add(pi_key)
+                result.append(pairs[i][0])
+                if len(result) >= k:
+                    break
+        return result
+
+    def get_topology_candidates(self, topology_idx):
+        """
+        Get topology candidates for a given topology index, using cache if available.
+        """
+        if self._topology_candidates[topology_idx] is None:
+            mini_topology = self.mini_topologies[topology_idx]
+            if self._topology_cache is not None:
+                # Use cached version if available
+                target_qubits = set()
+                for u, v in mini_topology:
+                    target_qubits.add(u)
+                    target_qubits.add(v)
+                if target_qubits:
+                    canonical_key = get_canonical_form(target_qubits, mini_topology)
+                    if canonical_key in self._topology_cache:
+                        self._topology_candidates[topology_idx] = self._topology_cache[canonical_key]
+                    else:
+                        # Compute and cache
+                        if self._topology is not None:
+                            candidates = get_subtopologies_of_type(self._topology, mini_topology)
+                            self._topology_cache[canonical_key] = candidates
+                            self._topology_candidates[topology_idx] = candidates
+                        else:
+                            self._topology_candidates[topology_idx] = []
+                else:
+                    self._topology_candidates[topology_idx] = []  # NOTE(review): edgeless mini topology yields [] here, while the no-cache branch delegates to get_subtopologies_of_type — confirm which is intended
+            else:
+                # No cache, compute directly
+                if self._topology is not None:
+                    self._topology_candidates[topology_idx] = get_subtopologies_of_type(self._topology, mini_topology)
+                else:
+                    self._topology_candidates[topology_idx] = []
+        return self._topology_candidates[topology_idx]
+
+
+
+class PartitionCandidate:  # one (partition, mini topology, permutation pair) choice placed on a concrete physical sub-topology
+
+    def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits, cnot_count=0):
+        #Which partition does this belong to
+        self.partition_idx = partition_idx
+        #the index of the Q* topology
+        self.topology_idx = topology_idx
+        #the index of the P_i and P_o pair
+        self.permutation_idx = permutation_idx
+        # the structure of the circuit in Q*
+        self.circuit_structure = circuit_structure
+        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc
+        self.P_i = P_i
+        # P_o in Q*->q* permutation pattern [Q*1 Q*0 Q*2] This means that the current output of Q*1 is equal to q*0
+        self.P_o = P_o
+        #The mini_topology in Q
+        self.topology = topology
+        #The mini topology in Q*
+        self.mini_topology = mini_topology
+        # {q:q*}
+        self.qbit_map = qbit_map
+        # q belonging to the original circuit
+        self.involved_qbits = involved_qbits
+        self.cnot_count = cnot_count
+        # {Q*:Q}
+        self.node_mapping = get_node_mapping(mini_topology, topology)
+
+    def transform_pi(self, pi, D, swap_cache=None, reverse=False, adj=None, neighbor_info=None):
+        # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
+        #
+        # Forward (reverse=False):
+        #   Route qubits to input positions derived from P_i_inv, then
+        #   update pi to output positions derived from P_o.
+        #
+        # Reverse (reverse=True):
+        #   We traverse the partition backwards, so the "entry" is the output
+        #   side and the "exit" is the input side.  Swap P_i <-> P_o roles.
+        if not reverse:
+            P_route_inv = [self.P_i.index(i) for i in range(len(self.P_i))]
+            P_exit = self.P_o
+        else:
+            P_route_inv = [self.P_o.index(i) for i in range(len(self.P_o))]
+            P_exit = self.P_i
+
+        qbit_map_input = {k : self.node_mapping[P_route_inv[v]] for k,v in self.qbit_map.items()}  # {q: physical target Q} for routing
+        # Convert pi to plain Python list of ints (may contain np.int64)
+        pi_list = [int(x) for x in pi]
+        n = len(pi_list)
+
+        # Cache is keyed on (pi, qbit_map, neighbor_signature). The signature
+        # captures the neighbor-heuristic context so hits across calls with
+        # the same active neighbor_info are safe.
+        if swap_cache is not None:
+            pi_tuple = tuple(pi_list)
+            qbit_map_frozen = frozenset(qbit_map_input.items())
+            neighbor_sig = _neighbor_signature(neighbor_info)
+            cache_key = (pi_tuple, qbit_map_frozen, neighbor_sig)
+            if cache_key in swap_cache:
+                swaps, pi_init = swap_cache[cache_key]
+            else:
+                swaps, pi_init = find_constrained_swaps_partial(
+                    pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
+                swap_cache[cache_key] = (swaps, pi_init)
+        else:
+            swaps, pi_init = find_constrained_swaps_partial(
+                pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
+
+        pi_output = pi_init.copy()  # copy so the list stored in swap_cache is never modified
+        qbit_map_inverse = {v: k for k, v in self.qbit_map.items()}  # {q*: q}
+        for q_star in range(len(P_exit)):
+            if q_star in qbit_map_inverse:
+                k = qbit_map_inverse[q_star]
+                pi_output[k] = self.node_mapping[P_exit[q_star]]
+        return swaps, pi_output  # NOTE(review): on a cache hit swaps is the list object held in swap_cache — callers should not mutate it
+
+    def estimate_swap_count(self, pi, D, reverse=False) -> int:
+        """O(n) lower-bound on the number of SWAPs needed to route this
+        partition's virtual qubits to their target physical positions.
+        Uses the same admissible heuristic as the A* search internally:
+            floor(sum_of_distances / 2)
+        """
+        P_route = self.P_o if reverse else self.P_i
+        P_i_inv = [P_route.index(i) for i in range(len(P_route))]  # inverse of P_route (of P_o when reverse=True, despite the name)
+        total = 0.0
+        for k, v in self.qbit_map.items():
+            target_P = self.node_mapping[P_i_inv[v]]
+            current_P = int(pi[k])
+            d = D[current_P][target_P]
+            if not np.isinf(d):  # infinite distances are left out of the sum
+                total += d
+        return int(total / 2)
+
+    def get_final_circuit(self,optimized_partitions,N):
+        partition = optimized_partitions[self.partition_idx]
+        part_parameters = partition.synthesised_parameters[self.topology_idx][self.permutation_idx]
+        part_circuit = partition.synthesised_circuits[self.topology_idx][self.permutation_idx].get_Flat_Circuit()
+        part_circuit = part_circuit.Remap_Qbits(self.node_mapping, N)  # relabel Q* -> Q using node_mapping
+        return part_circuit, part_parameters
+
+
+@dataclass(frozen=True)
+class PartitionScoreData:  # read-only record; field names mirror PartitionSynthesisResult attributes (presumably a snapshot of one — confirm at the producer)
+    mini_topologies: Tuple[Tuple[Tuple[int, int], ...], ...]
+    topology_candidates: Tuple[Tuple[Tuple[int, int], ...], ...]
+    permutations_pairs: Tuple[
+        Tuple[Tuple[Tuple[int, ...], Tuple[int, ...]], ...], ...
+    ]
+    circuit_structures: Tuple[Tuple[Tuple[int, ...], ...], ...]
+    cnot_counts: Tuple[Tuple[int, ...], ...]
+    qubit_map: Dict[int, int]  # NOTE(review): a dict field stays mutable despite frozen=True and makes hash(instance) raise TypeError
+    involved_qbits: Tuple[int, ...]
+
+
+# ============================================================================
+# Circuit Utilities
+# ============================================================================
+
+def check_circuit_compatibility(circuit: Circuit, topology):
+    """Return True when every two-qubit interaction in *circuit* is an edge of *topology*.
+
+    Edges are compared as unordered pairs, so gate direction and the container type of
+    each topology entry (tuple, list, ...) do not matter.  Nested circuits, and any other
+    gate that does not act on exactly one or two qubits, are expanded via get_Gates().
+    """
+    allowed = {frozenset(int(q) for q in edge) for edge in topology}
+    used = set()  # unordered qubit pairs touched by two-qubit gates
+
+    def collect_two_qubit_edges(gate):
+        if isinstance(gate, Circuit):
+            for subgate in gate.get_Gates():
+                collect_two_qubit_edges(subgate)
+            return
+        qubits = gate.get_Involved_Qbits()
+        if len(qubits) == 1:
+            return
+        if len(qubits) == 2:
+            used.add(frozenset(int(q) for q in qubits))
+            return
+        # Composite gate that is not a Circuit instance: descend into its parts.
+        for subgate in gate.get_Gates():
+            collect_two_qubit_edges(subgate)
+
+    for gate in circuit.get_Gates():
+        collect_two_qubit_edges(gate)
+    return used <= allowed
+
+def construct_swap_circuit(swap_order, N):  # build an N-qubit circuit that realises each SWAP in swap_order, in order
+    swap_circ = Circuit(N)
+    for swap in swap_order:
+        swap_circ.add_CNOT(swap[0],swap[1])  # a SWAP is emitted as three CNOTs with alternating qubit roles
+        swap_circ.add_CNOT(swap[1],swap[0])
+        swap_circ.add_CNOT(swap[0],swap[1])
+    return swap_circ
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
new file mode 100644
index 000000000..6a930d095
--- /dev/null
+++ b/squander/synthesis/bindings.cpp
@@ -0,0 +1,243 @@
+/*
+Copyright 2025 SQUANDER Contributors
+
+pybind11 bindings for the SABRE routing engine.
+*/
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+
+#include <algorithm>
+#include <stdexcept>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "sabre_router.hpp"
+
+namespace py = pybind11;
+using namespace squander::routing;
+
+// ---------------------------------------------------------------------------
+// Helper: extract fields from a Python PartitionCandidate object into CandidateData
+// ---------------------------------------------------------------------------
+
+static CandidateData extract_candidate(py::handle pc) {
+    // Copy the attributes of a Python PartitionCandidate into a plain C++ struct.
+    // A negative node_mapping key raises std::invalid_argument (Python ValueError).
+    CandidateData cd;
+    cd.partition_idx = pc.attr("partition_idx").cast<int>();
+    cd.topology_idx = pc.attr("topology_idx").cast<int>();
+    cd.permutation_idx = pc.attr("permutation_idx").cast<int>();
+    cd.cnot_count = pc.attr("cnot_count").cast<int>();
+    cd.has_multi_qubit_body = py::len(pc.attr("circuit_structure")) > 0;
+
+    // P_i, P_o: tuples of ints
+    cd.P_i = pc.attr("P_i").cast<std::vector<int>>();
+    cd.P_o = pc.attr("P_o").cast<std::vector<int>>();
+
+    // node_mapping: dict {Q* -> Q} -> dense array indexed by Q*; gaps hold -1
+    py::dict nm = pc.attr("node_mapping");
+    auto& flat = cd.node_mapping_flat;
+    for (auto [key, val] : nm) {
+        const int qs = key.cast<int>();
+        if (qs < 0) throw std::invalid_argument("node_mapping keys must be non-negative");
+        if (qs >= static_cast<int>(flat.size())) flat.resize(qs + 1, -1);
+        flat[qs] = val.cast<int>();
+    }
+
+    // qbit_map: dict {q -> q*}
+    py::dict qm = pc.attr("qbit_map");
+    cd.qbit_map_keys.reserve(py::len(qm));
+    cd.qbit_map_vals.reserve(py::len(qm));
+    for (auto [key, val] : qm) {
+        cd.qbit_map_keys.push_back(key.cast<int>());
+        cd.qbit_map_vals.push_back(val.cast<int>());
+    }
+
+    // involved_qbits: tuple of ints
+    cd.involved_qbits = pc.attr("involved_qbits").cast<std::vector<int>>();
+
+    return cd;
+}
+
+// ---------------------------------------------------------------------------
+// Helpers: array-like -> vector<int>, and canonical_data dict -> unordered_map
+// ---------------------------------------------------------------------------
+
+static std::vector<int> extract_int_array(py::handle obj) {
+    // Best effort: an object that cannot be viewed as a contiguous int array
+    // yields an empty vector instead of an exception.
+    using IntArray = py::array_t<int, py::array::c_style | py::array::forcecast>;
+    const auto ints = IntArray::ensure(obj);
+    if (!ints) return {};
+    const auto view = ints.unchecked<1>();  // throws unless the array is 1-D
+    std::vector<int> values(view.shape(0));
+    for (py::ssize_t i = 0; i < view.shape(0); ++i) {
+        values[i] = view(i);
+    }
+    return values;
+}
+
+static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict cd) {
+    // Optional edge list: a missing key or a None value leaves `out` untouched.
+    const auto read_edges = [](const py::dict& d, const char* name, auto& out) {
+        if (d.contains(name) && !d[name].is_none()) out = extract_int_array(d[name]);
+    };
+    std::unordered_map<int, CanonicalEntry> by_partition;
+    for (auto [key, val] : cd) {
+        const int pidx = key.cast<int>();
+        const py::dict fields = py::reinterpret_borrow<py::dict>(val);
+        CanonicalEntry entry;
+        read_edges(fields, "edges_u", entry.edges_u);
+        read_edges(fields, "edges_v", entry.edges_v);
+        entry.cnot = fields["cnot"].cast<int>();  // required: a missing key raises
+        by_partition[pidx] = std::move(entry);
+    }
+    return by_partition;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: extract layout_partitions list -> vector<LayoutPartInfo>
+// ---------------------------------------------------------------------------
+
+static std::vector<LayoutPartInfo> extract_layout_partitions(py::list lp) {
+    // Build one LayoutPartInfo per dict in lp, preserving list order.
+    std::vector<LayoutPartInfo> parts;
+    parts.reserve(lp.size());
+    for (const py::handle item : lp) {
+        const py::dict fields = py::reinterpret_borrow<py::dict>(item);
+        LayoutPartInfo& info = parts.emplace_back();  // filled in place
+        info.is_single = fields["is_single"].cast<bool>();
+        info.involved_qbits = fields["involved_qbits"].cast<std::vector<int>>();
+    }
+    return parts;
+}
+
+// ---------------------------------------------------------------------------
+// Module definition
+// ---------------------------------------------------------------------------
+
+PYBIND11_MODULE(_sabre_router, m) {
+    m.doc() = "SQUANDER SABRE Routing Engine - C++ Backend";
+
+    // Bind SabreConfig
+    py::class_<SabreConfig>(m, "SabreConfig")  // default-constructible; fields below are read/write from Python
+        .def(py::init<>())
+        .def_readwrite("prefilter_top_k", &SabreConfig::prefilter_top_k)
+        .def_readwrite("prefilter_min_per_partition", &SabreConfig::prefilter_min_per_partition)
+        .def_readwrite("prefilter_min_3q", &SabreConfig::prefilter_min_3q)
+        .def_readwrite("max_E_size", &SabreConfig::max_E_size)
+        .def_readwrite("max_lookahead", &SabreConfig::max_lookahead)
+        .def_readwrite("E_weight", &SabreConfig::E_weight)
+        .def_readwrite("E_alpha", &SabreConfig::E_alpha)
+        .def_readwrite("cnot_cost", &SabreConfig::cnot_cost)
+        .def_readwrite("sabre_iterations", &SabreConfig::sabre_iterations)
+        .def_readwrite("n_layout_trials", &SabreConfig::n_layout_trials)
+        .def_readwrite("random_seed", &SabreConfig::random_seed)
+        .def_readwrite("decay_delta", &SabreConfig::decay_delta)
+        .def_readwrite("swap_burst_budget", &SabreConfig::swap_burst_budget)
+        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
+        .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight)
+        .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width)
+        .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth);
+
+    // Bind SabreRouter with data-converting constructor
+    py::class_<SabreRouter>(m, "SabreRouter")
+        .def(py::init(
+            [](const SabreConfig& config,
+               py::array_t<double, py::array::c_style> D_arr,  // must be square 2-D (validated below)
+               std::vector<std::vector<int>> adj,
+               std::vector<std::vector<int>> DAG,
+               std::vector<std::vector<int>> IDAG,
+               py::list candidate_cache_py,
+               py::list layout_partitions_py,
+               py::dict canonical_data_fwd_py,
+               py::dict canonical_data_rev_py
+            ) {
+                // Extract D matrix
+                auto buf = D_arr.request();
+                if (buf.ndim != 2 || buf.shape[0] != buf.shape[1]) {
+                    throw std::invalid_argument("D must be a square 2D array");
+                }
+                int N = static_cast<int>(buf.shape[0]);
+                std::vector<double> D_flat(N * N);  // flat row-major copy (array is requested c_style)
+                auto* ptr = static_cast<const double*>(buf.ptr);
+                std::copy(ptr, ptr + N * N, D_flat.begin());
+
+                // Convert candidate_cache: list of lists of PartitionCandidate
+                std::vector<std::vector<CandidateData>> cc;
+                cc.reserve(py::len(candidate_cache_py));
+                for (auto part_cands : candidate_cache_py) {
+                    std::vector<CandidateData> cands;
+                    py::list cl = py::reinterpret_borrow<py::list>(part_cands);
+                    cands.reserve(py::len(cl));
+                    for (auto c : cl) {
+                        auto cd = extract_candidate(c);
+                        cd.candidate_idx = static_cast<int>(cands.size());  // position inside this inner list
+                        cands.push_back(std::move(cd));
+                    }
+                    cc.push_back(std::move(cands));
+                }
+
+                auto lp = extract_layout_partitions(layout_partitions_py);
+                auto cd_fwd = extract_canonical_data(canonical_data_fwd_py);
+                auto cd_rev = extract_canonical_data(canonical_data_rev_py);
+
+                return new SabreRouter(  // raw pointer: pybind11 takes ownership of the new instance
+                    config, N, std::move(D_flat), std::move(adj), std::move(DAG), std::move(IDAG),
+                    std::move(cc), std::move(lp), std::move(cd_fwd), std::move(cd_rev)
+                );
+            }),
+            py::arg("config"),
+            py::arg("D"),
+            py::arg("adj"),
+            py::arg("DAG"),
+            py::arg("IDAG"),
+            py::arg("candidate_cache"),
+            py::arg("layout_partitions"),
+            py::arg("canonical_data_fwd"),
+            py::arg("canonical_data_rev")
+        )
+        .def("route_forward",
+            [](const SabreRouter& self,
+               const std::vector<int>& pi
+            ) -> py::tuple {
+                py::gil_scoped_release release;  // drop the GIL while the C++ router runs
+                auto result = self.route_forward(pi);
+                py::gil_scoped_acquire acquire;  // re-take the GIL before creating Python objects
+                py::list steps;
+                for (const auto& step : result.steps) {
+                    if (step.type == 0) {  // type 0 -> ("swap", swaps)
+                        steps.append(py::make_tuple("swap", step.swaps));
+                    } else if (step.type == 1) {  // type 1 -> ("partition", partition_idx, candidate_idx)
+                        steps.append(py::make_tuple("partition", step.partition_idx, step.candidate_idx));
+                    } else {  // any other type -> ("single", partition_idx, physical_qubit)
+                        steps.append(py::make_tuple("single", step.partition_idx, step.physical_qubit));
+                    }
+                }
+                return py::make_tuple(result.cnot_count, result.pi, result.pi_initial, steps);
+            },
+            py::arg("pi"),
+            "Run actual forward routing and return CNOT count, final pi, initial pi, and route steps"
+        )
+        .def("run_trial",
+            [](const SabreRouter& self,
+               int trial_idx,
+               const std::vector<int>& seeded_pi,
+               int n_iterations,
+               int n_trials
+            ) -> py::tuple {
+                py::gil_scoped_release release;  // drop the GIL while the trial runs in C++
+                auto result = self.run_trial(trial_idx, seeded_pi, n_iterations, n_trials);
+                py::gil_scoped_acquire acquire;  // re-take the GIL before building the result tuple
+                return py::make_tuple(result.total_cost, result.pi);
+            },
+            py::arg("trial_idx"),
+            py::arg("seeded_pi"),
+            py::arg("n_iterations"),
+            py::arg("n_trials"),
+            "Run a single layout trial (GIL-free, thread-safe)"
+        );
+}
diff --git a/squander/synthesis/qgd_SABRE.py b/squander/synthesis/qgd_SABRE.py
index 924bf361d..aecad803a 100644
--- a/squander/synthesis/qgd_SABRE.py
+++ b/squander/synthesis/qgd_SABRE.py
@@ -68,11 +68,11 @@ def _compute_smart_initial_layout(self, circuit):
         gates = circuit.get_Gates()
 
         for gate in gates:
-            if gate.get_Control_Qbit() != -1:
-                q1 = gate.get_Target_Qbit()
-                q2 = gate.get_Control_Qbit()
-                if q1 < self.circuit_qbit_num and q2 < self.circuit_qbit_num:
-                    key = (min(q1, q2), max(q1, q2))
+            q_control = gate.get_Control_Qbit()
+            if q_control != -1:
+                q_target = gate.get_Target_Qbit()
+                if q_target < self.circuit_qbit_num and q_control < self.circuit_qbit_num:
+                    key = (min(q_target, q_control), max(q_target, q_control))
                     interaction_count[key] += 1
 
         if not interaction_count:
diff --git a/squander/utils.py b/squander/utils.py
index d33eec17b..1ea558a51 100644
--- a/squander/utils.py
+++ b/squander/utils.py
@@ -130,7 +130,7 @@ def qasm_to_squander_circuit(filename: str, return_transpiled=False):
         for n in dir(gate)
         if not n.startswith("_")
         and issubclass(getattr(gate, n), gate.Gate)
-        and n not in ("Gate", "CROT", "CR", "SYC")
+        and n not in ("Gate", "CROT", "CR", "SYC","Permutation")
     }
     if any(gate.operation.name not in SUPPORTED_GATES_NAMES for gate in qc.data):
         qc_transpiled = qiskit.transpile(
@@ -460,6 +460,7 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray):
         RXX,
         RYY,
         RZZ,
+        Permutation,
     )
 
     gates = circ.get_Gates()
@@ -652,6 +653,24 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray):
             circuit.add_CNOT(t2, t1)
             circuit.add_CNOT(t1, t2)
             params.append([])
+        elif isinstance(gate, Permutation):
+            pattern = list(gate.get_Pattern())
+            inverse_pattern = [0] * len(pattern)
+            for idx, mapped_idx in enumerate(pattern):
+                inverse_pattern[mapped_idx] = idx
+            current = list(range(len(pattern)))
+            for idx, target in enumerate(inverse_pattern):
+                swap_idx = current.index(target)
+                if swap_idx == idx:
+                    continue
+                circuit.add_CNOT(idx, swap_idx)
+                circuit.add_CNOT(swap_idx, idx)
+                circuit.add_CNOT(idx, swap_idx)
+                current[idx], current[swap_idx] = (
+                    current[swap_idx],
+                    current[idx],
+                )
+            params.append([])
         elif isinstance(gate, RXX):
             t1, t2 = gate.get_Target_Qbits()
             circuit.add_CNOT(t1, t2)
@@ -695,6 +714,8 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray):
                 ]
             )
 
+    if not params:
+        return circuit, np.array([])
     return circuit, np.concatenate(params)
 
 
diff --git a/tests/decomposition/test_IBM.py b/tests/decomposition/test_IBM.py
index 2e2479307..eafd42fab 100644
--- a/tests/decomposition/test_IBM.py
+++ b/tests/decomposition/test_IBM.py
@@ -251,10 +251,12 @@ def test_IBM_Chellenge_tree_search(self):
         data = loadmat('data/Umtx.mat')  
         # The unitary to be decomposed  
         Umtx = data['Umtx']
-        
+        #turn off OSR
+        config = {"use_osr":0}
+
 
         # creating a class to decompose the unitary
-        cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T )
+        cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config = config )
 
 
         # setting the verbosity of the decomposition
@@ -305,10 +307,11 @@ def test_IBM_Chellenge_tabu_search(self):
         data = loadmat('data/Umtx.mat')  
         # The unitary to be decomposed  
         Umtx = data['Umtx']
-        
+        #turn off OSR
+        config = {"use_osr":0}
 
         # creating a class to decompose the unitary
-        cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T )
+        cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T,config=config )
 
 
         # setting the verbosity of the decomposition
diff --git a/tests/gates/test_Permutation.py b/tests/gates/test_Permutation.py
new file mode 100644
index 000000000..4a4ecbb06
--- /dev/null
+++ b/tests/gates/test_Permutation.py
@@ -0,0 +1,490 @@
+'''
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see http://www.gnu.org/licenses/.
+'''
+
+import numpy as np
+import pytest
+from itertools import permutations
+
+from squander.gates.gates_Wrapper import Permutation
+from squander.gates.qgd_Circuit import qgd_Circuit
+
+
+class Test_Permutation:
+    """Test class for Permutation gate"""
+
+    def test_permutation_creation_identity(self):
+        """
+        Test creating identity permutation gates
+        """
+        for qbit_num in range(1, 6):
+            # Identity permutation: [0, 1, 2, ..., n-1]
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            assert perm_gate.get_Parameter_Num() == 0
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_swap(self):
+        """
+        Test creating swap permutation gates
+        """
+        for qbit_num in range(2, 6):
+            # Swap first and last qubits: [n-1, 1, 2, ..., n-2, 0]
+            pattern = list(range(qbit_num))
+            pattern[0], pattern[-1] = pattern[-1], pattern[0]
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_reverse(self):
+        """
+        Test creating reverse permutation gates
+        """
+        for qbit_num in range(1, 6):
+            # Reverse permutation: [n-1, n-2, ..., 1, 0]
+            pattern = list(range(qbit_num))[::-1]
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_random(self):
+        """
+        Test creating random permutation gates
+        """
+        np.random.seed(42)  # fixed seed so the shuffled patterns are reproducible
+        for qbit_num in range(2, 6):
+            # Random permutation
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_invalid_size(self):
+        """
+        Test that creating permutation with wrong pattern size raises error
+        """
+        qbit_num = 3
+        # Pattern too small
+        with pytest.raises(ValueError, match="Pattern size.*does not match"):
+            Permutation(qbit_num, [0, 1])
+        
+        # Pattern too large
+        with pytest.raises(ValueError, match="Pattern size.*does not match"):
+            Permutation(qbit_num, [0, 1, 2, 3])
+
+    def test_permutation_creation_invalid_range(self):
+        """
+        Test that creating permutation with out-of-range indices raises error
+        """
+        qbit_num = 3
+        # Negative index
+        with pytest.raises(ValueError, match="out of range"):
+            Permutation(qbit_num, [-1, 1, 2])
+        
+        # Index too large
+        with pytest.raises(ValueError, match="out of range"):
+            Permutation(qbit_num, [0, 1, 3])
+
+    def test_permutation_creation_duplicates(self):
+        """
+        Test that creating permutation with duplicate values raises error
+        """
+        qbit_num = 3
+        # Duplicate values
+        with pytest.raises(ValueError, match="duplicate"):
+            Permutation(qbit_num, [0, 1, 1])
+        
+        with pytest.raises(ValueError, match="duplicate"):
+            Permutation(qbit_num, [0, 0, 2])
+
+    def test_permutation_creation_invalid_type(self):
+        """
+        Test that creating permutation with invalid type raises error
+        """
+        qbit_num = 3
+        # Tuple should work (converted to list)
+        perm_gate = Permutation(qbit_num, (0, 1, 2))
+        assert perm_gate.get_Pattern() == [0, 1, 2]
+        
+        # Non-integer values
+        with pytest.raises(TypeError, match="pattern must contain integers"):
+            Permutation(qbit_num, [0.0, 1.0, 2.0])
+        
+        with pytest.raises(TypeError, match="pattern must contain integers"):
+            Permutation(qbit_num, ["0", "1", "2"])
+        
+        # Invalid type (not list or tuple)
+        with pytest.raises(TypeError, match="pattern must be a list or tuple"):  # NOTE(review): lower-case "pattern" here, "Pattern" in the set_Pattern test - confirm both match the raised messages
+            Permutation(qbit_num, "012")
+
+    def test_permutation_get_pattern(self):
+        """
+        Test getting pattern from permutation gate
+        """
+        for qbit_num in range(1, 5):
+            for pattern_tuple in permutations(range(qbit_num)):  # exhaustive: every permutation of 1..4 qubits
+                pattern = list(pattern_tuple)
+                perm_gate = Permutation(qbit_num, pattern)
+                retrieved_pattern = perm_gate.get_Pattern()
+                assert retrieved_pattern == pattern
+
+    def test_permutation_tuple_conversion(self):
+        """
+        Test that tuples are properly converted to lists
+        """
+        for qbit_num in range(1, 5):
+            for pattern_tuple in permutations(range(qbit_num)):
+                # Create with tuple
+                perm_gate = Permutation(qbit_num, pattern_tuple)
+                retrieved_pattern = perm_gate.get_Pattern()
+                # Should return as list
+                assert retrieved_pattern == list(pattern_tuple)
+                assert isinstance(retrieved_pattern, list)
+                
+                # Set with tuple
+                perm_gate.set_Pattern(pattern_tuple)
+                retrieved_pattern = perm_gate.get_Pattern()
+                assert retrieved_pattern == list(pattern_tuple)
+                assert isinstance(retrieved_pattern, list)
+
+    def test_permutation_set_pattern(self):
+        """
+        Test setting pattern on permutation gate
+        """
+        qbit_num = 4
+        initial_pattern = [0, 1, 2, 3]
+        perm_gate = Permutation(qbit_num, initial_pattern)
+        
+        # Set new pattern
+        new_pattern = [3, 2, 1, 0]
+        perm_gate.set_Pattern(new_pattern)
+        assert perm_gate.get_Pattern() == new_pattern
+        
+        # Set another pattern
+        another_pattern = [1, 0, 3, 2]
+        perm_gate.set_Pattern(another_pattern)
+        assert perm_gate.get_Pattern() == another_pattern
+
+    def test_permutation_set_pattern_invalid(self):
+        """
+        Test that setting invalid pattern raises error
+        """
+        qbit_num = 3
+        perm_gate = Permutation(qbit_num, [0, 1, 2])
+        
+        # Wrong size
+        with pytest.raises(ValueError, match="Pattern size.*does not match"):
+            perm_gate.set_Pattern([0, 1])
+        
+        # Out of range
+        with pytest.raises(ValueError, match="out of range"):
+            perm_gate.set_Pattern([0, 1, 3])
+        
+        # Duplicates
+        with pytest.raises(ValueError, match="duplicate"):
+            perm_gate.set_Pattern([0, 1, 1])
+        
+        # Invalid type (not list or tuple)
+        with pytest.raises(TypeError, match="Pattern must be a list or tuple"):
+            perm_gate.set_Pattern("012")
+        
+        # Tuple should work (converted to list)
+        perm_gate.set_Pattern((0, 1, 2))
+        assert perm_gate.get_Pattern() == [0, 1, 2]
+        
+        # Tuple with different pattern
+        perm_gate.set_Pattern((2, 0, 1))
+        assert perm_gate.get_Pattern() == [2, 0, 1]
+
+    def test_permutation_get_matrix_identity(self):
+        """
+        Test that identity permutation gives identity matrix
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            matrix = perm_gate.get_Matrix()
+            
+            expected = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(matrix - expected)
+            assert error < 1e-10, f"Identity permutation failed for {qbit_num} qubits"
+
+    def test_permutation_get_matrix_swap(self):
+        """
+        Test permutation matrix for swap operation
+        """
+        qbit_num = 2
+        # Swap qubits: [1, 0]
+        pattern = [1, 0]
+        perm_gate = Permutation(qbit_num, pattern)
+        matrix = perm_gate.get_Matrix()
+        
+        # For 2 qubits, swap should exchange |01> and |10>
+        # Identity: |00> -> |00>, |01> -> |01>, |10> -> |10>, |11> -> |11>
+        # Swap:     |00> -> |00>, |01> -> |10>, |10> -> |01>, |11> -> |11>
+        expected = np.array([
+            [1, 0, 0, 0],
+            [0, 0, 1, 0],
+            [0, 1, 0, 0],
+            [0, 0, 0, 1]
+        ], dtype=np.complex128)
+        
+        error = np.linalg.norm(matrix - expected)
+        assert error < 1e-10, "Swap permutation matrix incorrect"
+
+    def test_permutation_get_matrix_unitary(self):
+        """
+        Test that permutation matrices are unitary
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)  # NOTE(review): no seed is set inside this test, so the pattern may vary between runs
+            perm_gate = Permutation(qbit_num, pattern)
+            matrix = perm_gate.get_Matrix()
+            
+            # Check unitarity: U @ U^dagger = I
+            unitary_check = matrix @ matrix.conj().T
+            identity = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(unitary_check - identity)
+            assert error < 1e-10, f"Matrix not unitary for pattern {pattern}"
+
+    def test_permutation_apply_to_identity(self):
+        """
+        Test applying identity permutation to a state
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            # Create random state
+            matrix_size = 2**qbit_num
+            state = np.random.rand(matrix_size) + 1j * np.random.rand(matrix_size)
+            state = state / np.linalg.norm(state)
+            
+            state_copy = state.copy()  # apply_to works in place, so keep the original for comparison
+            perm_gate.apply_to(state_copy)
+            
+            # Identity should not change the state
+            error = np.linalg.norm(state_copy - state)
+            assert error < 1e-10, "Identity permutation changed state"
+
+    def test_permutation_apply_to_swap(self):
+        """
+        Test applying swap permutation to a state
+        """
+        qbit_num = 2
+        pattern = [1, 0]  # Swap qubits
+        perm_gate = Permutation(qbit_num, pattern)
+        
+        # Create test state |01> = [0, 1, 0, 0]
+        state = np.array([0, 1, 0, 0], dtype=np.complex128)
+        perm_gate.apply_to(state)
+        
+        # After swap, should be |10> = [0, 0, 1, 0]
+        expected = np.array([0, 0, 1, 0], dtype=np.complex128)
+        error = np.linalg.norm(state - expected)
+        assert error < 1e-10, "Swap permutation incorrect"
+
+    def test_permutation_apply_to_matrix(self):
+        """
+        Test applying permutation to a matrix
+        """
+        qbit_num = 3
+        pattern = [2, 0, 1]  # Rotate: 0->2, 1->0, 2->1
+        perm_gate = Permutation(qbit_num, pattern)
+        
+        # Create test matrix
+        matrix_size = 2**qbit_num
+        test_matrix = np.random.rand(matrix_size, matrix_size) + 1j * np.random.rand(matrix_size, matrix_size)
+        test_matrix = test_matrix / np.linalg.norm(test_matrix)
+        
+        # Apply permutation
+        test_matrix_copy = test_matrix.copy()
+        perm_gate.apply_to(test_matrix_copy)
+        
+        # Check that it's different (unless it's identity)
+        if pattern != list(range(qbit_num)):  # always true for the fixed pattern above
+            assert not np.allclose(test_matrix_copy, test_matrix), "Permutation should change matrix"
+
+    def test_permutation_composition(self):
+        """
+        Test that applying two permutations is equivalent to their composition
+        """
+        qbit_num = 3
+        pattern1 = [1, 2, 0]  # Rotate left
+        pattern2 = [2, 0, 1]  # Rotate right
+        
+        perm1 = Permutation(qbit_num, pattern1)
+        perm2 = Permutation(qbit_num, pattern2)
+        
+        # Compose patterns: pattern2(pattern1(x))
+        composed_pattern = [pattern2[pattern1[i]] for i in range(qbit_num)]  # NOTE(review): these two patterns are mutual inverses, so this is the identity
+        perm_composed = Permutation(qbit_num, composed_pattern)
+        
+        # Create test state
+        matrix_size = 2**qbit_num
+        state = np.random.rand(matrix_size) + 1j * np.random.rand(matrix_size)
+        state = state / np.linalg.norm(state)
+        
+        # Apply sequentially
+        state_seq = state.copy()
+        perm1.apply_to(state_seq)
+        perm2.apply_to(state_seq)
+        
+        # Apply composed
+        state_comp = state.copy()
+        perm_composed.apply_to(state_comp)
+        
+        error = np.linalg.norm(state_seq - state_comp)
+        assert error < 1e-10, "Composition of permutations incorrect"
+
+    def test_permutation_inverse(self):
+        """
+        Test that applying permutation and its inverse gives identity
+        """
+        for qbit_num in range(2, 5):
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)  # NOTE(review): no seed is set inside this test
+            
+            # Compute inverse permutation
+            inverse_pattern = [0] * qbit_num
+            for i in range(qbit_num):
+                inverse_pattern[pattern[i]] = i
+            
+            perm = Permutation(qbit_num, pattern)
+            perm_inv = Permutation(qbit_num, inverse_pattern)
+            
+            # Create test state
+            matrix_size = 2**qbit_num
+            state = np.random.rand(matrix_size) + 1j * np.random.rand(matrix_size)
+            state = state / np.linalg.norm(state)
+            
+            # Apply permutation then inverse
+            state_transformed = state.copy()
+            perm.apply_to(state_transformed)
+            perm_inv.apply_to(state_transformed)
+            
+            error = np.linalg.norm(state_transformed - state)
+            assert error < 1e-10, f"Inverse permutation failed for pattern {pattern}"
+
+    def test_permutation_circuit_integration(self):
+        """
+        Test adding permutation gate to circuit
+        """
+        qbit_num = 3
+        pattern = [2, 0, 1]
+        
+        circuit = qgd_Circuit(qbit_num)
+        circuit.add_Permutation(pattern)
+        
+        gates = circuit.get_Gates()
+        assert len(gates) == 1
+        
+        gate = gates[0]
+        assert gate.get_Name() == "Permutation"
+        retrieved_pattern = gate.get_Pattern()
+        assert retrieved_pattern == pattern
+
+    def test_permutation_circuit_multiple(self):
+        """
+        Test adding multiple permutation gates to circuit
+        """
+        qbit_num = 3
+        
+        circuit = qgd_Circuit(qbit_num)
+        pattern1 = [1, 2, 0]
+        pattern2 = [2, 0, 1]
+        
+        circuit.add_Permutation(pattern1)
+        circuit.add_Permutation(pattern2)
+        
+        gates = circuit.get_Gates()
+        assert len(gates) == 2
+        
+        assert gates[0].get_Pattern() == pattern1  # gates come back in insertion order
+        assert gates[1].get_Pattern() == pattern2
+
+    def test_permutation_get_involved_qubits(self):
+        """
+        Test getting involved qubits from permutation gate
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            involved_qbits = perm_gate.get_Involved_Qbits()
+            # Permutation gate involves all qubits
+            assert involved_qbits == list(range(qbit_num))
+
+    def test_permutation_get_target_qubits(self):
+        """
+        Test getting target qubits from permutation gate
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            target_qbits = perm_gate.get_Target_Qbits()
+            # Permutation gate targets all qubits
+            assert target_qbits == list(range(qbit_num))
+
+    def test_permutation_get_control_qubits(self):
+        """
+        Test getting control qubits from permutation gate (should be empty)
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            control_qbits = perm_gate.get_Control_Qbits()
+            # Permutation gate has no control qubits
+            assert control_qbits == []
+
+    def test_permutation_large_patterns(self):
+        """
+        Test permutation gates with larger numbers of qubits
+        """
+        for qbit_num in [5, 6, 7]:
+            # Test identity
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            matrix = perm_gate.get_Matrix()
+            
+            expected = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(matrix - expected)
+            assert error < 1e-10, f"Large identity permutation failed for {qbit_num} qubits"
+            
+            # Test random permutation
+            np.random.seed(42)  # reseeded on every loop iteration
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            # Check unitarity
+            matrix = perm_gate.get_Matrix()
+            unitary_check = matrix @ matrix.conj().T
+            identity = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(unitary_check - identity)
+            assert error < 1e-10, f"Large permutation not unitary for {qbit_num} qubits"
+
diff --git a/tests/gates/test_gates.py b/tests/gates/test_gates.py
index e75e7136d..59b5a0161 100644
--- a/tests/gates/test_gates.py
+++ b/tests/gates/test_gates.py
@@ -49,7 +49,8 @@ def _discover_gate_names():
 
 
 ALL_GATE_NAMES = _discover_gate_names()
-QISKIT_EXCLUDED_GATES = {"SYC", "CR", "CROT"}
+QISKIT_EXCLUDED_GATES = {"SYC", "CR", "CROT", "Permutation"}
+CIRCUIT_UNSUPPORTED_GATES = {"Gate", "Permutation"}
 QISKIT_MATRIX_UNSUPPORTED = {"Gate"} | QISKIT_EXCLUDED_GATES
 NATIVE_UNSAFE_MATRIX_GATES = {"Gate"}
 NATIVE_UNSAFE_APPLY_GATES = {"Gate"}
@@ -72,7 +73,7 @@ def _discover_parameterized_gate_names():
 def _discover_multi_qubit_gate_names():
     names = []
     for gate_name in ALL_GATE_NAMES:
-        if gate_name == "Gate":
+        if gate_name in CIRCUIT_UNSUPPORTED_GATES:
             continue
         gate_obj = _instantiate_gate(gate_name)
         if len(gate_obj.get_Involved_Qbits()) >= 2:
@@ -95,6 +96,8 @@ def _instantiate_gate(gate_name, qbit_num=4):
         return gate_cls(qbit_num, 0, qbit_num - 1)
     if gate_name.startswith("C"):
         return gate_cls(qbit_num, 0, qbit_num - 1)
+    if gate_name == "Permutation":
+        return gate_cls(qbit_num, list(range(qbit_num)))
     return gate_cls(qbit_num, 0)
 
 
@@ -847,7 +850,7 @@ def test_qiskit_io_roundtrip_per_gate(self, gate_name):
 
     @pytest.mark.parametrize(
         "gate_name",
-        [name for name in ALL_GATE_NAMES if name != "Gate"],
+        [name for name in ALL_GATE_NAMES if name not in CIRCUIT_UNSUPPORTED_GATES],
     )
     def test_squander_invert_circuit(self, gate_name):
         script = f"""