From 1cb8abe16d45ae549a3de934830b7c574d89868c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 27 Oct 2025 02:33:28 +0100
Subject: [PATCH 001/232] Add permutation gate

---
 CMakeLists.txt                                |   1 +
 squander/IO_interfaces/Qiskit_IO.py           |  24 ++-
 squander/__init__.py                          |   3 +-
 squander/gates/gates_Wrapper.cpp              | 142 +++++++++++++++++-
 squander/gates/qgd_Circuit_Wrapper.cpp        |  37 ++++-
 squander/src-cpp/gates/Gates_block.cpp        |  36 ++++-
 squander/src-cpp/gates/Permutation.cpp        | 135 +++++++++++++++++
 squander/src-cpp/gates/include/Gate.h         |   3 +-
 squander/src-cpp/gates/include/Gates_block.h  |  11 +-
 squander/src-cpp/gates/include/Permutation.h  |  55 +++++++
 .../apply_dedicated_gate_kernel_to_input.cpp  |  47 +++++-
 .../apply_dedicated_gate_kernel_to_input.h    |   9 ++
 tests/gates/test_gates.py                     |   2 +-
 13 files changed, 491 insertions(+), 14 deletions(-)
 create mode 100644 squander/src-cpp/gates/Permutation.cpp
 create mode 100644 squander/src-cpp/gates/include/Permutation.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfedb6b03..668518ace 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -367,6 +367,7 @@ list(APPEND qgd_files
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/R.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/RZ_P.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/CZ_NU.cpp
+    ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/Permutation.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/custom_kernel_1qubit_gate.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/Composite.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/kernels/apply_kernel_to_input.cpp
diff --git a/squander/IO_interfaces/Qiskit_IO.py b/squander/IO_interfaces/Qiskit_IO.py
index 4d0c01659..7d44aa3c9 100644
--- a/squander/IO_interfaces/Qiskit_IO.py
+++ b/squander/IO_interfaces/Qiskit_IO.py
@@ -58,7 +58,8 @@
     CU,
     SWAP,
     CSWAP,
-    CCX )
+    CCX,
+    Permutation )
 
 
 
@@ -75,7 +76,6 @@ def scalar(param):
 def get_Qiskit_Circuit( Squander_circuit, parameters ):
 
     from qiskit import QuantumCircuit
-
     # creating Qiskit quantum circuit  
     circuit = QuantumCircuit(Squander_circuit.get_Qbit_Num() )
     
@@ -210,7 +210,13 @@ def get_Qiskit_Circuit( Squander_circuit, parameters ):
             #CCX gate
             target_qbits = gate.get_Target_Qbits()
             circuit.swap(target_qbits[0], target_qbits[1])
-            
+        elif isinstance(gate, Permutation):
+            #Permutation gate
+            from qiskit.circuit.library import PermutationGate
+            pattern = gate.get_Pattern()
+            qubits = list(range(gate.get_Qbit_Num()))
+            circuit.append( PermutationGate(pattern),qubits)
+        
         elif isinstance( gate, Circuit ):
             # Sub-circuit gate
             raise ValueError("Qiskit export of circuits with subcircuit is not supported. Use Circuit::get_Flat_Circuit prior of exporting circuit.")  
@@ -366,6 +372,13 @@ def get_Qiskit_Circuit_inverse( Squander_circuit, parameters ):
             #CCX gate
             target_qbits = gate.get_Target_Qbits()
             circuit.swap(target_qbits[0], target_qbits[1])
+            
+        elif isinstance(gate, Permutation):
+            #Permutation gate
+            from qiskit.circuit.library import PermutationGate
+            pattern = gate.get_Pattern()
+            qubits = list(range(gate.get_Qbit_Num()))
+            circuit.append( PermutationGate(pattern),qubits)
 
         elif isinstance( gate, Circuit ):
             # Sub-circuit gate
@@ -656,6 +669,11 @@ def convert_Qiskit_to_Squander( qc_in ):
             qubit1 = q_register.index( qubits[1] )
             Circuit_Squander.add_SWAP( [qubit1, qubit0] )
 
+        elif name[:11] == 'permutation':
+            #Permutation gate
+            pattern = gate.operation.pattern
+            Circuit_Squander.add_Permutation( pattern )
+
         else:
             print(f"convert_Qiskit_to_Squander: Unimplemented gate: {name}")
 
diff --git a/squander/__init__.py b/squander/__init__.py
index ac809c662..541798ee7 100644
--- a/squander/__init__.py
+++ b/squander/__init__.py
@@ -48,7 +48,8 @@
     CCX,
     CP,
     SWAP,
-    CSWAP
+    CSWAP,
+    Permutation
 )
 
 
diff --git a/squander/gates/gates_Wrapper.cpp b/squander/gates/gates_Wrapper.cpp
index 649ecd977..4c89f59cd 100644
--- a/squander/gates/gates_Wrapper.cpp
+++ b/squander/gates/gates_Wrapper.cpp
@@ -61,6 +61,7 @@ along with this program.  If not, see http://www.gnu.org/licenses/.
 #include "SWAP.h"
 #include "CSWAP.h"
 #include "numpy_interface.h"
+#include "Permutation.h"
 
 
 //////////////////////////////////////
@@ -76,7 +77,6 @@ typedef struct {
 
 
 
-
 template<typename GateT>
 Gate* create_gate( int qbit_num, int target_qbit ) {
     GateT* gate = new GateT( qbit_num, target_qbit );
@@ -117,6 +117,11 @@ Gate* create_multi_target_controlled_gate( int qbit_num, const std::vector<int>&
 }
 
 
+Gate* create_permutation_gate( int qbit_num, const std::vector<int>& pattern ) {
+    Permutation* gate = new Permutation( qbit_num, pattern );
+    return static_cast<Gate*>( gate );
+}
+
 
 /**
 @brief Method called when a python instance of the class  Gate_Wrapper is destroyed
@@ -134,6 +139,7 @@ static void
 }
 
 
+
 /**
 @brief Method called when a python instance of the class  qgd_CH_Wrapper is allocated
 @param type A pointer pointing to a structure describing the type of the class  qgd_CH_Wrapper.
@@ -445,6 +451,55 @@ static PyObject *
 }
 
 
+template<typename GateT>
+static PyObject *
+ permutation_gate_Wrapper_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {(char*)"qbit_num", (char*)"pattern", NULL};
+    int qbit_num = -1;
+    PyObject* pattern_py = NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO", kwlist, &qbit_num, &pattern_py)) {
+        std::string err("Unable to parse arguments");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+
+    if (qbit_num == -1 || pattern_py == NULL) {
+        PyErr_SetString(PyExc_ValueError, "qbit_num and pattern must be provided!");
+        return NULL;
+    }
+
+    if (!PyList_Check(pattern_py)) {
+        PyErr_SetString(PyExc_TypeError, "pattern must be a list!");
+        return NULL;
+    }
+
+    std::vector<int> pattern;
+    Py_ssize_t pattern_size = PyList_Size(pattern_py);
+
+    for (Py_ssize_t i = 0; i < pattern_size; i++) {
+        PyObject* item = PyList_GetItem(pattern_py, i);
+        if (!PyLong_Check(item)) {
+            PyErr_SetString(PyExc_TypeError, "pattern must contain integers!");
+            return NULL;
+        }
+        int qbit = PyLong_AsLong(item);
+        if (qbit >= qbit_num) {
+            PyErr_SetString(PyExc_ValueError, "Pattern qubit index out of range!");
+            return NULL;
+        }
+        pattern.push_back(qbit);
+    }
+
+    Gate_Wrapper *self;
+    self = (Gate_Wrapper *) type->tp_alloc(type, 0);
+    if (self != NULL) {
+        self->gate = create_permutation_gate(qbit_num, pattern);
+    }
+
+    return (PyObject *) self;
+}
 /**
 @brief Method called when a python instance of a non-controlled gate class is initialized
 @param self A pointer pointing to an instance of the class  Gate_Wrapper.
@@ -1230,8 +1285,77 @@ Gate_Wrapper_getstate( Gate_Wrapper *self ) {
 }
 
 
+static PyObject * Gate_Wrapper_get_Pattern( Gate_Wrapper *self ) {
+    std::vector<int> pattern;
+    try {
+        // Cast to Permutation* to access pattern methods
+        Permutation* perm_gate = dynamic_cast<Permutation*>(self->gate);
+        if (perm_gate == nullptr) {
+            PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate");
+            return NULL;
+        }
+        pattern = perm_gate->get_pattern();
+    }
+    catch (std::string err) {
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+    catch(...) {
+        std::string err( "Invalid pointer to gate class");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
 
+    PyObject* pattern_py = PyList_New(pattern.size());
+    for (size_t i = 0; i < pattern.size(); i++) {
+        PyList_SetItem(pattern_py, i, Py_BuildValue("i", pattern[i]));
+    }
+    return pattern_py;
+}
 
+static PyObject * Gate_Wrapper_set_Pattern( Gate_Wrapper *self, PyObject *args ) {
+    PyObject* pattern_py = NULL;
+    if (!PyArg_ParseTuple(args, "O", &pattern_py)) {
+        std::string err("Unable to parse arguments");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+    if (!PyList_Check(pattern_py)) {
+        std::string err("Pattern must be a list!");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+    std::vector<int> pattern;
+    Py_ssize_t pattern_size = PyList_Size(pattern_py);
+    for (Py_ssize_t i = 0; i < pattern_size; i++) {
+        PyObject* item = PyList_GetItem(pattern_py, i);
+        if (!PyLong_Check(item)) {
+            std::string err("Pattern must contain integers!");
+            PyErr_SetString(PyExc_Exception, err.c_str());
+            return NULL;
+        }
+        pattern.push_back(PyLong_AsLong(item));
+    }
+    try {
+        // Cast to Permutation* to access pattern methods
+        Permutation* perm_gate = dynamic_cast<Permutation*>(self->gate);
+        if (perm_gate == nullptr) {
+            PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate");
+            return NULL;
+        }
+        perm_gate->set_pattern(pattern);
+    }
+    catch (std::string err) {
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+    catch(...) {
+        std::string err( "Invalid pointer to gate class");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+    return Py_BuildValue("i", 0);
+}
 
 /**
 @brief Call to set the state of quantum gate from a human-readable data serialized and pickle-able format
@@ -1574,6 +1698,12 @@ extern "C"
     }, \
     {"get_Name", (PyCFunction) Gate_Wrapper_get_Name, METH_NOARGS, \
      "Method to get the name label of the gate" \
+    }, \
+    {"get_Pattern", (PyCFunction) Gate_Wrapper_get_Pattern, METH_NOARGS, \
+     "Method to get the pattern of the permutation gate." \
+    }, \
+    {"set_Pattern", (PyCFunction) Gate_Wrapper_set_Pattern, METH_VARARGS, \
+     "Method to set the pattern of the permutation gate." \
     }
 
 static PyMethodDef Gate_Wrapper_methods[] = {
@@ -1589,6 +1719,7 @@ static PyMethodDef Gate_Wrapper_methods[] = {
 };
 
 
+
 /**
 @brief Structure containing metadata about the members of class  qgd_CH_Wrapper.
 */
@@ -1597,6 +1728,7 @@ static PyMemberDef  Gate_Wrapper_members[] = {
 };
 
 
+
 struct Gate_Wrapper_Type_tmp : PyTypeObject {
 
 
@@ -1622,6 +1754,7 @@ struct Gate_Wrapper_Type_tmp : PyTypeObject {
 static Gate_Wrapper_Type_tmp Gate_Wrapper_Type;
 
 
+
 #define gate_wrapper_type_template(gate_name, wrapper_new) \ 
 struct gate_name##_Wrapper_Type : Gate_Wrapper_Type_tmp { \
 \
@@ -1723,6 +1856,8 @@ gate_wrapper_type_template(Tdg, Gate_Wrapper_new);
 
 gate_wrapper_type_template(R, Gate_Wrapper_new);
 
+gate_wrapper_type_template(Permutation, permutation_gate_Wrapper_new);
+
 
 
 
@@ -1795,7 +1930,8 @@ PyInit_gates_Wrapper(void)
         PyType_Ready(&CCX_Wrapper_Type_ins) < 0 ||
         PyType_Ready(&SWAP_Wrapper_Type_ins) < 0 ||
         PyType_Ready(&CSWAP_Wrapper_Type_ins) < 0 ||
-        PyType_Ready(&R_Wrapper_Type_ins) < 0 ) {
+        PyType_Ready(&R_Wrapper_Type_ins) < 0 ||
+        PyType_Ready(&Permutation_Wrapper_Type_ins) < 0 ) {
 
         Py_DECREF(m);
         return NULL;
@@ -1880,6 +2016,8 @@ PyInit_gates_Wrapper(void)
 
     Py_INCREF_template(CSWAP);
 
+    Py_INCREF_template(Permutation);
+
     return m;
 }
 
diff --git a/squander/gates/qgd_Circuit_Wrapper.cpp b/squander/gates/qgd_Circuit_Wrapper.cpp
index 5a0342f0d..9806bf750 100644
--- a/squander/gates/qgd_Circuit_Wrapper.cpp
+++ b/squander/gates/qgd_Circuit_Wrapper.cpp
@@ -58,6 +58,7 @@ along with this program.  If not, see http://www.gnu.org/licenses/.
 #include "ON.h"
 #include "Adaptive.h"
 #include "Composite.h"
+#include "Permutation.h"
 
 #include "numpy_interface.h"
 
@@ -390,6 +391,33 @@ qgd_Circuit_Wrapper_add_CSWAP(qgd_Circuit_Wrapper *self, PyObject *args, PyObjec
 
 }
 
+/**
+@brief Wrapper function to add a Permutation gate to the front of the gate structure.
+@param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper.
+@param args A tuple of the input arguments: pattern (list of ints)
+@param kwds A tuple of keywords
+*/
+static PyObject *
+qgd_Circuit_Wrapper_add_Permutation(qgd_Circuit_Wrapper *self, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {(char*)"pattern", NULL};
+    PyObject* pattern_py = NULL;
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &pattern_py))
+        return Py_BuildValue("i", -1);
+
+    if (pattern_py != NULL && PyList_Check(pattern_py)) {
+        std::vector<int> pattern;
+        Py_ssize_t list_size = PyList_Size(pattern_py);
+        for (Py_ssize_t i = 0; i < list_size; i++) {
+            PyObject* item = PyList_GetItem(pattern_py, i);
+            pattern.push_back(PyLong_AsLong(item));
+        }
+        if (pattern.size() == self->circuit->get_qbit_num()) {
+            self->circuit->add_permutation(pattern);
+        }
+    }   
+    return Py_BuildValue("i", 0);
+}
 /**
 @brief Wrapper function to add a block of operations to the front of the gate structure.
 @param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper.
@@ -1948,14 +1976,17 @@ static PyMethodDef qgd_Circuit_Wrapper_Methods[] = {
     {"add_CRY", (PyCFunction) qgd_Circuit_Wrapper_add_CRY, METH_VARARGS | METH_KEYWORDS,
      "Call to add a CRY gate to the front of the gate structure"
     },
+    {"add_Permutation", (PyCFunction) qgd_Circuit_Wrapper_add_Permutation, METH_VARARGS | METH_KEYWORDS,
+     "Call to add a Permutation gate to the front of the gate structure"
+    },
     {"add_CRX", (PyCFunction) qgd_Circuit_Wrapper_add_CRX, METH_VARARGS | METH_KEYWORDS,
-     "Call to add a CRY gate to the front of the gate structure"
+     "Call to add a CRX gate to the front of the gate structure"
     },
     {"add_CRZ", (PyCFunction) qgd_Circuit_Wrapper_add_CRZ, METH_VARARGS | METH_KEYWORDS,
-     "Call to add a CRY gate to the front of the gate structure"
+     "Call to add a CRZ gate to the front of the gate structure"
     },
     {"add_CP", (PyCFunction) qgd_Circuit_Wrapper_add_CP, METH_VARARGS | METH_KEYWORDS,
-     "Call to add a CRY gate to the front of the gate structure"
+     "Call to add a CP gate to the front of the gate structure"
     },
     {"add_CCX", (PyCFunction) qgd_Circuit_Wrapper_add_CCX, METH_VARARGS | METH_KEYWORDS,
      "Call to add a CCX gate to the front of the gate structure"
diff --git a/squander/src-cpp/gates/Gates_block.cpp b/squander/src-cpp/gates/Gates_block.cpp
index ff001ae14..7313aeb9b 100644
--- a/squander/src-cpp/gates/Gates_block.cpp
+++ b/squander/src-cpp/gates/Gates_block.cpp
@@ -57,6 +57,7 @@ limitations under the License.
 #include "CZ_NU.h"
 #include "Composite.h"
 #include "Gates_block.h"
+#include "Permutation.h"
 
 #include "custom_kernel_1qubit_gate.h"
 
@@ -416,6 +417,11 @@ Gates_block::apply_from_right( Matrix_real& parameters_mtx, Matrix& input ) {
             com_operation->apply_from_right( parameters_mtx, input );
             break; 
         }
+        case PERMUTATION_OPERATION: {
+            Permutation* perm_operation = static_cast<Permutation*>(operation);
+            perm_operation->apply_to( input );
+            break;
+        }
         default:
             std::string err("Gates_block::apply_from_right: unimplemented gate"); 
             throw err;
@@ -621,6 +627,27 @@ void Gates_block::add_u3_to_front(int target_qbit) {
 
 }
 
+/**
+@brief Append a Permutation gate to the list of gates
+@param pattern The pattern of the permutation
+*/
+void Gates_block::add_permutation(const std::vector<int>& pattern) {
+    // create the operation
+    Gate* operation = static_cast<Gate*>(new Permutation( qbit_num, pattern ));
+    add_gate( operation );
+}
+
+
+/**
+@brief Add a Permutation gate to the front of the list of gates
+@param pattern The pattern of the permutation
+*/
+void Gates_block::add_permutation_to_front(const std::vector<int>& pattern) {
+    // create the operation
+    Gate* operation = static_cast<Gate*>(new Permutation( qbit_num, pattern ));
+    add_gate_to_front( operation );
+}
+
 /**
 @brief Append a RX gate to the list of gates
 @param target_qbit The identification number of the targt qubit. (0 <= target_qbit <= qbit_num-1)
@@ -2212,6 +2239,13 @@ Gates_block::create_remapped_circuit( const std::map<int, int>& qbit_map, const
 
             break;
         }
+        case PERMUTATION_OPERATION:
+        {
+            Gate* cloned_op = op->clone();
+            cloned_op->set_qbit_num( qbit_num_ );
+            ret->add_gate( cloned_op );
+            break;
+        }
         default:
             std::string err("Gates_block::create_remapped_circuit: unimplemented gate"); 
             throw err;
@@ -2406,7 +2440,7 @@ int Gates_block::extract_gates( Gates_block* op_block ) {
         case CH_OPERATION: case SYC_OPERATION:
         case U1_OPERATION: case U2_OPERATION: 
         case U3_OPERATION: case CP_OPERATION:
-        case RY_OPERATION: case CRY_OPERATION: 
+        case RY_OPERATION: case CRY_OPERATION: case PERMUTATION_OPERATION:
         case CRX_OPERATION: case CRZ_OPERATION:
         case RX_OPERATION: case CR_OPERATION:
         case RZ_OPERATION: case X_OPERATION:
diff --git a/squander/src-cpp/gates/Permutation.cpp b/squander/src-cpp/gates/Permutation.cpp
new file mode 100644
index 000000000..5d49f839b
--- /dev/null
+++ b/squander/src-cpp/gates/Permutation.cpp
@@ -0,0 +1,135 @@
+/*
+Created on Fri Jun 26 14:13:26 2020
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author: Peter Rakyta, Ph.D.
+*/
+/*! \file Permutation.cpp
+    \brief Class for the representation of Permutation gate.
+*/
+#include "Permutation.h"
+#include "apply_dedicated_gate_kernel_to_input.h"
+#include "common.h"
+
+Permutation::Permutation(){
+    name = "Permutation";
+    type = PERMUTATION_OPERATION;
+    target_qbits.clear();
+    control_qbits.clear();
+    parameter_num = 0;
+}
+
+Permutation::Permutation(int qbit_num_in, const std::vector<int>& pattern_in) : Gate(qbit_num_in) {
+    if (pattern_in.size() != qbit_num_in) {
+        std::stringstream sstream;
+        sstream << "Permutation: Pattern size " << pattern_in.size() << " is not equal to the number of qubits " << qbit_num_in << std::endl;
+        print(sstream, 0);
+        throw sstream.str();
+    }
+    name = "Permutation";
+    type = PERMUTATION_OPERATION;
+    pattern = pattern_in;
+    control_qbits.clear();
+    parameter_num = 0;
+    target_qbits.resize(qbit_num_in);
+    for (int idx=0; idx<qbit_num_in; idx++){
+        target_qbits[idx] = idx;
+    }
+}
+Permutation::~Permutation(){
+    target_qbits.clear();
+    control_qbits.clear();
+}
+
+Matrix Permutation::get_matrix(){
+    return get_matrix(false);
+}
+
+Matrix Permutation::get_matrix(int parallel){
+    Matrix permutation_matrix = create_identity(matrix_size);
+    apply_to(permutation_matrix, parallel);
+    return permutation_matrix;
+}
+
+void Permutation::apply_to(Matrix& input, int parallel){
+    apply_Permutation_kernel_to_input(input, pattern, matrix_size);
+}
+void Permutation::apply_to(Matrix& input){
+    if (input.rows != matrix_size) {
+        std::string err("Permutation::apply_to: Wrong input size in Permutation gate apply");
+        throw err;
+    }
+    apply_Permutation_kernel_to_input(input, pattern, matrix_size);
+}
+
+void Permutation::apply_to_list(std::vector<Matrix>& inputs, int parallel){
+    int work_batch = 1;
+    if ( parallel == 0 ) {
+        work_batch = inputs.size();
+    }
+    else {
+        work_batch = 1;
+    }
+
+
+    tbb::parallel_for( tbb::blocked_range<int>(0,inputs.size(),work_batch), [&](tbb::blocked_range<int> r) {
+        for (int idx=r.begin(); idx<r.end(); ++idx) { 
+
+            Matrix* input = &inputs[idx];
+
+            apply_to( *input, parallel );
+
+        }
+
+    });
+}
+
+
+std::vector<int> Permutation::get_target_qbits(){
+    return target_qbits;
+}
+
+std::vector<int> Permutation::get_control_qbits(){
+    return control_qbits;
+}
+
+std::vector<int> Permutation::get_pattern(){
+    return pattern;
+}
+
+void Permutation::set_pattern(const std::vector<int>& pattern_in){
+    pattern = pattern_in;
+}
+
+std::vector<int> Permutation::get_involved_qubits(bool only_target){
+    return get_involved_qubits(only_target);
+}
+
+Permutation* Permutation::clone(){
+    Permutation* ret = new Permutation(qbit_num, pattern);
+    ret->set_parameter_start_idx(get_parameter_start_idx());
+    ret->set_parents(parents);
+    ret->set_children(children);
+    return ret;
+}
+
+void Permutation::reorder_qubits(std::vector<int> qbit_list){
+    Gate::reorder_qubits(qbit_list);
+    std::vector<int> new_pattern(qbit_num);
+    for (int idx=0; idx<qbit_num; idx++){
+        new_pattern[idx] = std::find(qbit_list.begin(), qbit_list.end(), pattern[idx]) - qbit_list.begin();
+    }
+    pattern = new_pattern;
+}
\ No newline at end of file
diff --git a/squander/src-cpp/gates/include/Gate.h b/squander/src-cpp/gates/include/Gate.h
index b5fa94ad2..39f285941 100644
--- a/squander/src-cpp/gates/include/Gate.h
+++ b/squander/src-cpp/gates/include/Gate.h
@@ -73,7 +73,8 @@ typedef enum gate_type {GENERAL_OPERATION=1,
                         CRZ_OPERATION=40,
                         CCX_OPERATION=41,
                         SWAP_OPERATION=42,
-                        CSWAP_OPERATION=43} gate_type;
+                        CSWAP_OPERATION=43,
+                        PERMUTATION_OPERATION=44} gate_type;
 
 
 #ifdef _WIN32
diff --git a/squander/src-cpp/gates/include/Gates_block.h b/squander/src-cpp/gates/include/Gates_block.h
index 8a8e5ece3..71e480879 100644
--- a/squander/src-cpp/gates/include/Gates_block.h
+++ b/squander/src-cpp/gates/include/Gates_block.h
@@ -209,8 +209,17 @@ void add_ry(int target_qbit);
 */
 void add_ry_to_front(int target_qbit);
 
+/**
+@brief Append a Permutation gate to the list of gates
+@param pattern The pattern of the permutation
+*/
+void add_permutation(const std::vector<int>& pattern);
 
-
+/**
+@brief Add a Permutation gate to the front of the list of gates
+@param pattern The pattern of the permutation
+*/
+void add_permutation_to_front(const std::vector<int>& pattern);
 
 /**
 @brief Append a CRY gate to the list of gates
diff --git a/squander/src-cpp/gates/include/Permutation.h b/squander/src-cpp/gates/include/Permutation.h
new file mode 100644
index 000000000..9d95aeea1
--- /dev/null
+++ b/squander/src-cpp/gates/include/Permutation.h
@@ -0,0 +1,55 @@
+/*
+Created on Fri Jun 26 14:13:26 2020
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author: Peter Rakyta, Ph.D.
+*/
+/*! \file Permutation.h
+    \brief Class for the representation of Permutation gate.
+*/
+
+#ifndef PERMUTATION_H
+#define PERMUTATION_H
+
+#include "Gate.h"
+#include "common.h"
+#include "matrix.h"
+#include "logging.h"
+#include "tbb/tbb.h"
+
+class Permutation : public Gate {
+
+protected:
+    std::vector<int> pattern;
+
+public:
+    Permutation();
+    Permutation(int qbit_num_in, const std::vector<int>& pattern_in);
+    ~Permutation();
+    Matrix get_matrix();
+    Matrix get_matrix(int parallel);
+    void apply_to(Matrix& input, int parallel);
+    void apply_to(Matrix& input);
+    void apply_to_list(std::vector<Matrix>& inputs, int parallel);
+    std::vector<int> get_pattern();
+    void set_pattern(const std::vector<int>& pattern_in);
+    std::vector<int> get_target_qbits();
+    std::vector<int> get_control_qbits();
+    std::vector<int> get_involved_qubits(bool only_target = false);
+    Permutation* clone();
+    void reorder_qubits(std::vector<int> qbit_list);
+};
+
+#endif //PERMUTATION_H
\ No newline at end of file
diff --git a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
index a8058b8bc..ed39eede8 100644
--- a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
+++ b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
@@ -25,7 +25,8 @@ limitations under the License.
 //#include <immintrin.h>
 #include "tbb/tbb.h"
 #include <omp.h>
-
+#include <unordered_map>
+#include <unordered_set>
 
 
 
@@ -307,6 +308,50 @@ void apply_SWAP_kernel_to_input(Matrix& input, const std::vector<int>& target_qb
     }
 }
 
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size){
+
+    int qbit_num = pattern.size();
+    
+    std::unordered_map<int,int> pattern_map;
+    for (int row_idx=0; row_idx<matrix_size; row_idx++){
+       std::vector<int> old_bits(qbit_num);
+       for (int idx=0; idx<qbit_num; idx++){
+        old_bits[idx] = (row_idx >> pattern[idx]) & 1;
+       }
+       int new_row_idx = 0;
+       for (int idx=0; idx<qbit_num; idx++){
+        new_row_idx |= old_bits[idx] << idx;
+       }
+       pattern_map[row_idx] = new_row_idx;
+    }
+
+    std::unordered_set<int> visited_rows;
+    std::vector<std::vector<int>> row_cycles;
+    for (const auto& [start, _] : pattern_map){
+        if (visited_rows.count(start) ) continue;
+
+        std::vector<int> cycle;
+        int current = start;
+        while (!visited_rows.count(current)){
+            cycle.push_back(current);
+            visited_rows.insert(current);
+            current = pattern_map[current];
+        }
+        if (cycle.size() > 1){
+            row_cycles.push_back(cycle);
+        }
+    }
+    for (const auto& cycle : row_cycles){
+        for (size_t idx=0; idx<cycle.size()-1; idx++){
+            std::swap_ranges(
+                input.get_data() + cycle[idx]*input.stride,
+                input.get_data() + cycle[idx]*input.stride + input.cols,
+                input.get_data() + cycle[idx+1]*input.stride
+            );
+        }
+    }
+}
+
 // TBB Parallelized versions
 
 void apply_X_kernel_to_input_tbb(Matrix& input, const std::vector<int>& target_qbits,
diff --git a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
index a4007b62d..46cdb1b03 100644
--- a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
+++ b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
@@ -74,6 +74,15 @@ void apply_T_kernel_to_input(Matrix& input, const int& target_qbit, const int& c
  */
 void apply_SWAP_kernel_to_input(Matrix& input, const std::vector<int>& target_qbits, const std::vector<int>& control_qbits, const int& matrix_size);
 
+/**
+ * @brief Applies the Permutation gate kernel to the input matrix.
+ *
+ * @param input The input matrix on which the transformation is applied.
+ * @param pattern The pattern of the permutation.
+ * @param matrix_size The size of the input.
+ */
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size);
+
 // TBB Parallelized versions
 void apply_X_kernel_to_input_tbb(Matrix& input, const std::vector<int>& target_qbits, const std::vector<int>& control_qbits, const int& matrix_size);
 void apply_Y_kernel_to_input_tbb(Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size);
diff --git a/tests/gates/test_gates.py b/tests/gates/test_gates.py
index b024df0d2..308f24884 100644
--- a/tests/gates/test_gates.py
+++ b/tests/gates/test_gates.py
@@ -472,7 +472,7 @@ def test_gates(self):
 
             if inspect.isclass(obj):
                 
-                if name == "SYC" or name == "Gate" or name=="CR" or name=="CROT":
+                if name == "SYC" or name == "Gate" or name=="CR" or name=="CROT" or name=="Permutation":
                     continue
 
                 print(f"testing gate: {name}")

From 4c34e924a81721aa0c05a73d66436eb465970f65 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 27 Oct 2025 02:58:41 +0100
Subject: [PATCH 002/232] Fix permutation bug

---
 examples/decomposition/example_SABRE.py | 12 ++++---
 squander/IO_interfaces/Qiskit_IO.py     |  4 +--
 squander/gates/qgd_Circuit.py           |  9 +++++
 squander/gates/qgd_Circuit_Wrapper.cpp  | 45 ++++++++++++++++++++++++-
 squander/src-cpp/gates/Gates_block.cpp  | 24 ++++++++++---
 squander/src-cpp/gates/Permutation.cpp  |  6 +++-
 6 files changed, 87 insertions(+), 13 deletions(-)

diff --git a/examples/decomposition/example_SABRE.py b/examples/decomposition/example_SABRE.py
index 9b990e17e..aa17184c4 100644
--- a/examples/decomposition/example_SABRE.py
+++ b/examples/decomposition/example_SABRE.py
@@ -1,6 +1,7 @@
 from squander import SABRE
 from squander import Qiskit_IO
 from squander import utils
+from squander import Circuit
 
 from qiskit import transpile
 from qiskit import QuantumCircuit
@@ -41,13 +42,14 @@
 print("mapping (q -> Q):", pi)
 print("Final mapping:", final_pi)
 qubits = list(range(N))
-Qiskit_circuit = QuantumCircuit(N)
 pi_map = list(np.array(sabre.get_inverse_pi(pi)))
-Qiskit_circuit.append(CircuitInstruction( PermutationGate(pi_map),qubits))
-Qiskit_circuit &= Qiskit_IO.get_Qiskit_Circuit( Squander_remapped_circuit, parameters_remapped_circuit )
-Qiskit_circuit.append(CircuitInstruction( PermutationGate(list(final_pi)),qubits))
+final_circuit = Circuit(N)
+final_circuit.add_Permutation(list(pi_map))
+final_circuit.add_Circuit(Squander_remapped_circuit)
+final_circuit.add_Permutation(list(final_pi))
+Qiskit_circuit = Qiskit_IO.get_Qiskit_Circuit( final_circuit.get_Flat_Circuit(), parameters_remapped_circuit )
 print("CIRCUIT MAPPED WITH SABRE:")
-#print( Qiskit_circuit )
+print( Qiskit_circuit )
 print("SABRE SWAP COUNT:", swap_count)
 # defining the qubit topology/connectivity for Squander
 coupling_map = [
diff --git a/squander/IO_interfaces/Qiskit_IO.py b/squander/IO_interfaces/Qiskit_IO.py
index 7d44aa3c9..46922231a 100644
--- a/squander/IO_interfaces/Qiskit_IO.py
+++ b/squander/IO_interfaces/Qiskit_IO.py
@@ -214,7 +214,7 @@ def get_Qiskit_Circuit( Squander_circuit, parameters ):
             #Permutation gate
             from qiskit.circuit.library import PermutationGate
             pattern = gate.get_Pattern()
-            qubits = list(range(gate.get_Qbit_Num()))
+            qubits = list(range(len(pattern)))
             circuit.append( PermutationGate(pattern),qubits)
         
         elif isinstance( gate, Circuit ):
@@ -377,7 +377,7 @@ def get_Qiskit_Circuit_inverse( Squander_circuit, parameters ):
             #Permutation gate
             from qiskit.circuit.library import PermutationGate
             pattern = gate.get_Pattern()
-            qubits = list(range(gate.get_Qbit_Num()))
+            qubits = list(range(len(pattern)))
             circuit.append( PermutationGate(pattern),qubits)
 
         elif isinstance( gate, Circuit ):
diff --git a/squander/gates/qgd_Circuit.py b/squander/gates/qgd_Circuit.py
index c10b05ad6..bd0824cfc 100644
--- a/squander/gates/qgd_Circuit.py
+++ b/squander/gates/qgd_Circuit.py
@@ -348,6 +348,15 @@ def add_CP( self, target_qbit, control_qbit):
 	# call the C wrapper function
         super(qgd_Circuit, self).add_CP(target_qbit, control_qbit)
 
+#@brief Call to add a Permutation gate to the front of the gate structure.
+#@param self A pointer pointing to an instance of the class qgd_Circuit.
+#@param Input arguments: pattern (list of int) - permutation pattern.
+
+    def add_Permutation( self, pattern):
+
+	# call the C wrapper function
+        super(qgd_Circuit, self).add_Permutation(pattern)
+
 #@brief Call to add a SWAP gate to the front of the gate structure.
 #@param self A pointer pointing to an instance of the class qgd_Circuit.
 #@param Input arguments: target_qbits (list of int) - list of target qubits (at least 2).
diff --git a/squander/gates/qgd_Circuit_Wrapper.cpp b/squander/gates/qgd_Circuit_Wrapper.cpp
index 9806bf750..9c3acbfef 100644
--- a/squander/gates/qgd_Circuit_Wrapper.cpp
+++ b/squander/gates/qgd_Circuit_Wrapper.cpp
@@ -413,7 +413,23 @@ qgd_Circuit_Wrapper_add_Permutation(qgd_Circuit_Wrapper *self, PyObject *args, P
             pattern.push_back(PyLong_AsLong(item));
         }
         if (pattern.size() == self->circuit->get_qbit_num()) {
-            self->circuit->add_permutation(pattern);
+            try {
+                self->circuit->add_permutation(pattern);
+            } catch (const std::string& e) {
+                PyErr_SetString(PyExc_ValueError, e.c_str());
+                return Py_BuildValue("i", -1);
+            } catch (const std::exception& e) {
+                PyErr_SetString(PyExc_ValueError, e.what());
+                return Py_BuildValue("i", -1);
+            } catch (...) {
+                PyErr_SetString(PyExc_ValueError, "Unknown error occurred in add_permutation");
+                return Py_BuildValue("i", -1);
+            }
+        } else {
+            std::string err = "Pattern size " + std::to_string(pattern.size()) + 
+                             " does not match circuit qubit number " + std::to_string(self->circuit->get_qbit_num());
+            PyErr_SetString(PyExc_ValueError, err.c_str());
+            return Py_BuildValue("i", -1);
         }
     }   
     return Py_BuildValue("i", 0);
@@ -1329,6 +1345,33 @@ get_gate( Gates_block* circuit, int &idx ) {
         Py_DECREF( circuit_input );
 
     }
+    else if (gate->get_type() == PERMUTATION_OPERATION) {
+        // Handle Permutation gate
+        PyObject* qgd_gate_Dict  = PyModule_GetDict( qgd_gate );
+        PyObject* py_gate_class = PyDict_GetItemString( qgd_gate_Dict, "Permutation");
+        
+        // Get the pattern from the Permutation gate
+        Permutation* perm_gate = static_cast<Permutation*>(gate);
+        std::vector<int> pattern = perm_gate->get_pattern();
+        
+        // Convert pattern to Python list
+        PyObject* pattern_list = PyList_New(pattern.size());
+        for (size_t i = 0; i < pattern.size(); i++) {
+            PyList_SetItem(pattern_list, i, Py_BuildValue("i", pattern[i]));
+        }
+        
+        PyObject* gate_input = Py_BuildValue("(OO)", qbit_num, pattern_list);
+        py_gate = PyObject_CallObject(py_gate_class, gate_input);
+        
+        // replace dummy data with real gate data
+        qgd_Gate* py_gate_C = reinterpret_cast<qgd_Gate*>( py_gate );
+        delete( py_gate_C->gate );
+        py_gate_C->gate = static_cast<Gate*>( gate->clone() );
+        
+        Py_DECREF( qgd_gate );
+        Py_DECREF( gate_input );
+        Py_DECREF( pattern_list );
+    }
     else {
 
             Py_DECREF( qgd_gate );    
diff --git a/squander/src-cpp/gates/Gates_block.cpp b/squander/src-cpp/gates/Gates_block.cpp
index 7313aeb9b..c407becaa 100644
--- a/squander/src-cpp/gates/Gates_block.cpp
+++ b/squander/src-cpp/gates/Gates_block.cpp
@@ -633,8 +633,16 @@ void Gates_block::add_u3_to_front(int target_qbit) {
 */
 void Gates_block::add_permutation(const std::vector<int>& pattern) {
     // create the operation
-    Gate* operation = static_cast<Gate*>(new Permutation( qbit_num, pattern ));
-    add_gate( operation );
+    try {
+        Gate* operation = static_cast<Gate*>(new Permutation( qbit_num, pattern ));
+        add_gate( operation );
+    } catch (const std::string& e) {
+        // Re-throw as proper exception
+        throw std::runtime_error(e);
+    } catch (const std::exception& e) {
+        // Re-throw as-is
+        throw;
+    }
 }
 
 
@@ -644,8 +652,16 @@ void Gates_block::add_permutation(const std::vector<int>& pattern) {
 */
 void Gates_block::add_permutation_to_front(const std::vector<int>& pattern) {
     // create the operation
-    Gate* operation = static_cast<Gate*>(new Permutation( qbit_num, pattern ));
-    add_gate_to_front( operation );
+    try {
+        Gate* operation = static_cast<Gate*>(new Permutation( qbit_num, pattern ));
+        add_gate_to_front( operation );
+    } catch (const std::string& e) {
+        // Re-throw as proper exception
+        throw std::runtime_error(e);
+    } catch (const std::exception& e) {
+        // Re-throw as-is
+        throw;
+    }
 }
 
 /**
diff --git a/squander/src-cpp/gates/Permutation.cpp b/squander/src-cpp/gates/Permutation.cpp
index 5d49f839b..7216525db 100644
--- a/squander/src-cpp/gates/Permutation.cpp
+++ b/squander/src-cpp/gates/Permutation.cpp
@@ -114,7 +114,11 @@ void Permutation::set_pattern(const std::vector<int>& pattern_in){
 }
 
 std::vector<int> Permutation::get_involved_qubits(bool only_target){
-    return get_involved_qubits(only_target);
+    std::vector<int> involved_qubits;
+    for (int i = 0; i < qbit_num; i++) {
+        involved_qubits.push_back(i);
+    }
+    return involved_qubits;
 }
 
 Permutation* Permutation::clone(){

From bd16cb40740903c697116da6d47785a0964ca3d6 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 29 Oct 2025 12:55:42 +0100
Subject: [PATCH 003/232] Add partition aware mapping synthesis part

---
 squander/synthesis/PartAM.py | 177 +++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 squander/synthesis/PartAM.py

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
new file mode 100644
index 000000000..053a0a1d2
--- /dev/null
+++ b/squander/synthesis/PartAM.py
@@ -0,0 +1,177 @@
+"""
+This is an implementation of Partition Aware Mapping.
+"""
+from squander.decomposition.qgd_N_Qubit_Decompositions_Wrapper import (
+    qgd_N_Qubit_Decomposition_adaptive as N_Qubit_Decomposition_adaptive,
+    qgd_N_Qubit_Decomposition_Tree_Search as N_Qubit_Decomposition_Tree_Search,
+    qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
+)
+from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+from itertools import permutations
+from squander.partitioning.ilp import get_all_partitions, _get_topo_order, topo_sort_partitions, ilp_global_optimal, recombine_single_qubit_chains
+
+import numpy as np
+from qiskit import QuantumCircuit
+
+from typing import List, Callable
+
+import multiprocessing as mp
+from multiprocessing import Process, Pool
+import os
+
+
+from squander.partitioning.partition import PartitionCircuit
+from squander.partitioning.tools import get_qubits
+from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
+from itertools import product
+
+def extract_subtopology(involved_qbits, qbit_map, config ):
+    mini_topology = []
+    for edge in config["topology"]:
+        if edge[0] in involved_qbits and edge[1] in involved_qbits:
+            mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
+    return mini_topology
+
+class PartitionSynthesisResult:
+    def __init__(self):
+        self.permutations_pairs = []
+        self.synthesised_circuits = []
+        self.synthesised_parameters = []
+        self.cnot_counts = []
+    
+    def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters):
+        self.permutations_pairs.append(permutations_pair)
+        self.synthesised_circuits.append(synthesised_circuit)
+        self.synthesised_parameters.append(synthesised_parameters)
+        self.cnot_counts.append(synthesised_circuit.get_Gate_Nums().get('CNOT', 0))
+    
+    def get_best_result(self):
+        best_index = np.argmin(self.cnot_counts)
+        return self.permutations_pairs[best_index], self.synthesised_circuits[best_index], self.synthesised_parameters[best_index]
+    
+    def get_partition_synthesis_score(self):
+        return np.mean(self.cnot_counts)*0.1 + np.min(self.cnot_counts)*0.9
+
+class qgd_Partition_Aware_Mapping:
+
+    def __init__(self, config):
+        self.config = config
+        self.config.setdefault('strategy', 'TreeSearch')
+        self.config.setdefault('parallel', 0 )
+        self.config.setdefault('verbosity', 0 )
+        self.config.setdefault('tolerance', 1e-8 )
+        self.config.setdefault('test_subcircuits', False )
+        self.config.setdefault('test_final_circuit', True )
+        self.config.setdefault('max_partition_size', 3 )
+        self.config.setdefault('topology', None)
+        self.config.setdefault('routed', False)
+        self.config.setdefault('partition_strategy','ilp')
+        self.config.setdefault('optimizer', 'BFGS')
+        strategy = self.config['strategy']
+        allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
+        if not strategy in allowed_strategies:
+            raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
+        parallel = self.config['parallel']
+        allowed_parallel = [0, 1, 2]
+        if not parallel in allowed_parallel:
+            raise Exception(f"The parallel configuration should be either of {allowed_parallel}, got {parallel}.")
+        verbosity = self.config['verbosity']
+        if not isinstance(verbosity, int):
+            raise Exception(f"The verbosity parameter should be an integer.")
+
+        self.max_partition_size = self.config['max_partition_size']
+        if not isinstance(self.max_partition_size, int):
+            raise Exception(f"The max_partition_size parameter should be an integer.")
+        self.topology = self.config['topology']
+        if not isinstance(self.topology, list):
+            raise Exception(f"The topology parameter should be a list.")
+        self.routed = self.config['routed']
+        if not isinstance(self.routed, bool):
+            raise Exception(f"The routed parameter should be a bool.")
+        self.partition_strategy = self.config['partition_strategy']
+        allowed_partition_strategies = ['ilp', 'tdag', 'kahn', 'qiskit', 'qiskit-fusion', 'bqskit-Quick', 'bqskit-Scan', 'bqskit-Greedy', 'bqskit-Cluster']
+    
+    @staticmethod
+    def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
+        """
+        Call to decompose a partition
+        """
+        strategy = config["strategy"]
+        if strategy == "TreeSearch":
+            cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+        elif strategy == "TabuSearch":
+            cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
+        elif strategy == "Adaptive":
+            cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
+        else:
+            raise Exception(f"Unsupported decomposition type: {strategy}")
+        cDecompose.set_Verbose( config["verbosity"] )
+        cDecompose.set_Cost_Function_Variant( 3 )	
+        cDecompose.set_Optimization_Tolerance( config["tolerance"] )
+        cDecompose.set_Optimizer( config["optimizer"] )
+        cDecompose.Start_Decomposition()
+        squander_circuit = cDecompose.get_Circuit()
+        parameters       = cDecompose.get_Optimized_Parameters()
+        return squander_circuit, parameters
+
+    @staticmethod
+    def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, mini_topology = None) -> PartitionSynthesisResult:
+        """
+        Call to decompose a partition sequentially
+        """
+        N = Partition_circuit.get_Qbit_Num()
+        perumations_all = permutations(range(N))
+        N_permutations = len(perumations_all)
+        result = PartitionSynthesisResult()
+        # Sequential permutation search
+        P_o_initial = np.random.choice(perumations_all)
+        for P_i in perumations_all:
+            Partition_circuit_tmp = Circuit(N)
+            Partition_circuit_tmp.add_Permutation(P_i)
+            Partition_circuit_tmp.add_Circuit(Partition_circuit)
+            Partition_circuit_tmp.add_Permutation(P_o_initial)
+            synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+            result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters)
+
+        P_i_best, _ = result.get_best_result()[0]
+        for P_o in perumations_all:
+            Partition_circuit_tmp = Circuit(N)
+            Partition_circuit_tmp.add_Permutation(P_i_best)
+            Partition_circuit_tmp.add_Circuit(Partition_circuit)
+            Partition_circuit_tmp.add_Permutation(P_o)
+            synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+            result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters)
+        return result
+
+    def SynthesizeWideCircuit(self, circ, orig_parameters):
+        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(circ, self.max_partition_size)
+        qbit_num_orig_circuit = circ.get_Qbit_Num()
+        gate_dict = {i: gate for i, gate in enumerate(circ.get_Gates())}
+        single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
+        single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
+        single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
+        partitined_circuit = Circuit( qbit_num_orig_circuit )
+        params = []
+        for part in allparts:
+            surrounded_chains = {t for s in part for t in go[s] if t in single_qubit_chains_prepost and go[single_qubit_chains_prepost[t][-1]] and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
+            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded_chains))
+            #topo sort part + surrounded chains
+            c = Circuit( qbit_num_orig_circuit )
+            for gate_idx in _get_topo_order({x: go[x] & gates for x in gates}, {x: rgo[x] & gates for x in gates}):
+                c.add_Gate( gate_dict[gate_idx] )
+                start = gate_dict[gate_idx].get_Parameter_Start_Index()
+                params.append(orig_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+            partitioned_circuit.add_Circuit(c)
+        for chain in single_qubit_chains:
+            c = Circuit( qbit_num_orig_circuit )
+            for gate_idx in chain:
+                c.add_Gate( gate_dict[gate_idx] )
+                start = gate_dict[gate_idx].get_Parameter_Start_Index()
+                params.append(orig_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+            partitioned_circuit.add_Circuit(c)
+        parameters = np.concatenate(params, axis=0)
+
+        qbit_num_orig_circuit = circ.get_Qbit_Num()
+
+
+        subcircuits = partitioned_circuit.get_Gates()

From 7a60433b50b4ffe59ba4b29d3fdfe3d66740572a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 29 Oct 2025 14:05:20 +0100
Subject: [PATCH 004/232] Add functions to find all and unique subtopologies in
 quantum circuits, enhancing partition synthesis capabilities. Update
 PartitionSynthesisResult to handle multiple topologies and improve
 decomposition process with parallelization.

---
 squander/synthesis/PartAM.py | 290 ++++++++++++++++++++++++++++++-----
 1 file changed, 255 insertions(+), 35 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 053a0a1d2..66d017c59 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -18,6 +18,7 @@
 import multiprocessing as mp
 from multiprocessing import Process, Pool
 import os
+from typing import List, Set, Tuple, FrozenSet
 
 
 from squander.partitioning.partition import PartitionCircuit
@@ -25,6 +26,191 @@
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from itertools import product
 
+def get_all_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
+    """
+    Find ALL connected subtopologies with exactly k qubits using DFS.
+    
+    Args:
+        edges: List of edges representing the quantum hardware topology
+        k: Number of qubits in the desired subtopologies
+    
+    Returns:
+        List of all subtopologies, where each subtopology is a list of edges
+    """
+    if k <= 0:
+        return []
+    
+    # Build adjacency list
+    adj_list = {}
+    for u, v in edges:
+        if u not in adj_list:
+            adj_list[u] = set()
+        if v not in adj_list:
+            adj_list[v] = set()
+        adj_list[u].add(v)
+        adj_list[v].add(u)
+    
+    all_qubits = sorted(adj_list.keys())
+    
+    if k == 1:
+        return [[] for _ in all_qubits]
+    
+    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
+        induced = []
+        for edge in edges:
+            if edge[0] in qubit_subset and edge[1] in qubit_subset:
+                induced.append(edge)
+        return induced
+    
+    subtopologies = []
+    seen = set()
+    
+    def dfs(current_qubits: Set[int], candidates: Set[int]):
+        """Enumerate connected subgraphs using DFS."""
+        if len(current_qubits) == k:
+            frozen = frozenset(current_qubits)
+            if frozen not in seen:
+                seen.add(frozen)
+                subtopologies.append(get_induced_edges(current_qubits))
+            return
+        
+        # Prune if we can't reach k qubits
+        if len(current_qubits) + len(candidates) < k:
+            return
+        
+        for node in sorted(candidates):
+            # Add node and explore
+            new_qubits = current_qubits | {node}
+            
+            # New candidates: neighbors of new_qubits not yet included
+            new_candidates = set()
+            for q in new_qubits:
+                for neighbor in adj_list[q]:
+                    if neighbor not in new_qubits and neighbor > node:
+                        new_candidates.add(neighbor)
+            
+            dfs(new_qubits, new_candidates)
+    
+    # Start DFS from each qubit
+    for start in all_qubits:
+        candidates = {n for n in adj_list[start] if n > start}
+        dfs({start}, candidates)
+    
+    return subtopologies
+
+
+def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
+    """
+    Convert a subgraph to canonical form for isomorphism checking.
+    Relabels nodes as 0,1,2,...,k-1 and returns the lexicographically smallest edge set.
+    """
+    qubits = sorted(qubit_subset)
+    n = len(qubits)
+    
+    # Try all permutations and find lexicographically smallest
+    best_edges = None
+    
+    for perm in permutations(range(n)):
+        # Create mapping: qubits[i] -> perm[i]
+        mapping = {qubits[i]: perm[i] for i in range(n)}
+        
+        # Relabel edges
+        relabeled = []
+        for u, v in induced_edges:
+            new_u, new_v = mapping[u], mapping[v]
+            # Normalize edge direction
+            relabeled.append(tuple(sorted([new_u, new_v])))
+        
+        relabeled = tuple(sorted(relabeled))
+        
+        if best_edges is None or relabeled < best_edges:
+            best_edges = relabeled
+    
+    return frozenset(best_edges)
+
+
+def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
+    """
+    Find all UNIQUE subtopology structures with k qubits using DFS.
+    Returns one example of each non-isomorphic connected subgraph.
+    
+    Args:
+        edges: List of edges representing the quantum hardware topology
+        k: Number of qubits in the desired subtopologies
+    
+    Returns:
+        List of unique subtopologies (one representative per isomorphism class)
+    """
+    if k <= 0:
+        return []
+    
+    # Build adjacency list
+    adj_list = {}
+    for u, v in edges:
+        if u not in adj_list:
+            adj_list[u] = set()
+        if v not in adj_list:
+            adj_list[v] = set()
+        adj_list[u].add(v)
+        adj_list[v].add(u)
+    
+    all_qubits = sorted(adj_list.keys())
+    
+    if k == 1:
+        return [[]]  # Single qubit has no edges
+    
+    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
+        induced = []
+        for edge in edges:
+            if edge[0] in qubit_subset and edge[1] in qubit_subset:
+                induced.append(edge)
+        return induced
+    
+    # Track unique canonical forms and their examples
+    canonical_forms = {}
+    seen = set()
+    
+    def dfs(current_qubits: Set[int], candidates: Set[int]):
+        """Enumerate connected subgraphs using DFS."""
+        if len(current_qubits) == k:
+            frozen = frozenset(current_qubits)
+            if frozen not in seen:
+                seen.add(frozen)
+                induced = get_induced_edges(current_qubits)
+                
+                # Get canonical form
+                canonical = get_canonical_form(current_qubits, induced)
+                
+                # Store first example of each canonical form
+                if canonical not in canonical_forms:
+                    canonical_forms[canonical] = induced
+            return
+        
+        # Prune if we can't reach k qubits
+        if len(current_qubits) + len(candidates) < k:
+            return
+        
+        for node in sorted(candidates):
+            # Add node and explore
+            new_qubits = current_qubits | {node}
+            
+            # New candidates: neighbors of new_qubits not yet included
+            new_candidates = set()
+            for q in new_qubits:
+                for neighbor in adj_list[q]:
+                    if neighbor not in new_qubits and neighbor > node:
+                        new_candidates.add(neighbor)
+            
+            dfs(new_qubits, new_candidates)
+    
+    # Start DFS from each qubit
+    for start in all_qubits:
+        candidates = {n for n in adj_list[start] if n > start}
+        dfs({start}, candidates)
+    
+    return list(canonical_forms.values())
+
+
 def extract_subtopology(involved_qbits, qbit_map, config ):
     mini_topology = []
     for edge in config["topology"]:
@@ -33,28 +219,39 @@ def extract_subtopology(involved_qbits, qbit_map, config ):
     return mini_topology
 
 class PartitionSynthesisResult:
-    def __init__(self):
-        self.permutations_pairs = []
-        self.synthesised_circuits = []
-        self.synthesised_parameters = []
-        self.cnot_counts = []
-    
-    def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters):
-        self.permutations_pairs.append(permutations_pair)
-        self.synthesised_circuits.append(synthesised_circuit)
-        self.synthesised_parameters.append(synthesised_parameters)
-        self.cnot_counts.append(synthesised_circuit.get_Gate_Nums().get('CNOT', 0))
-    
-    def get_best_result(self):
-        best_index = np.argmin(self.cnot_counts)
-        return self.permutations_pairs[best_index], self.synthesised_circuits[best_index], self.synthesised_parameters[best_index]
+    def __init__(self, N , mini_topologies):
+        self.mini_topologies = mini_topologies
+        self.topology_count = len(mini_topologies)
+        self.N = N
+        self.permutations_pairs = [[] for _ in range(len(mini_topologies))]
+        self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
+        self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
+        self.cnot_counts = [[] for _ in range(len(mini_topologies))]
+    
+    def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
+        self.permutations_pairs[topology_idx].append(permutations_pair)
+        self.synthesised_circuits[topology_idx].append(synthesised_circuit)
+        self.synthesised_parameters[topology_idx].append(synthesised_parameters)
+        self.cnot_counts[topology_idx].append(synthesised_circuit.get_Gate_Nums().get('CNOT', 0))
+    
+    def get_best_result(self, topology_idx):
+        best_index = np.argmin(self.cnot_counts[topology_idx])
+        return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
     
     def get_partition_synthesis_score(self):
-        return np.mean(self.cnot_counts)*0.1 + np.min(self.cnot_counts)*0.9
+        score = 0
+        for topology_idx in range(self.topology_count):
+            cnot_count_topology = np.mean(self.cnot_counts[topology_idx])*0.1 + np.min(self.cnot_counts[topology_idx])*0.9
+            if len(self.mini_topologies[topology_idx]) == self.N*(self.N-1)/2:
+                score += cnot_count_topology*0.3/self.topology_count
+            else:
+                score += cnot_count_topology*0.7/self.topology_count
+        return score 
 
 class qgd_Partition_Aware_Mapping:
 
     def __init__(self, config):
+        self.topology = config['topology']
         self.config = config
         self.config.setdefault('strategy', 'TreeSearch')
         self.config.setdefault('parallel', 0 )
@@ -115,32 +312,34 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
         return squander_circuit, parameters
 
     @staticmethod
-    def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, mini_topology = None) -> PartitionSynthesisResult:
+    def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies) -> PartitionSynthesisResult:
         """
         Call to decompose a partition sequentially
         """
         N = Partition_circuit.get_Qbit_Num()
         perumations_all = permutations(range(N))
         N_permutations = len(perumations_all)
-        result = PartitionSynthesisResult()
+        result = PartitionSynthesisResult(N, topologies)
         # Sequential permutation search
-        P_o_initial = np.random.choice(perumations_all)
-        for P_i in perumations_all:
-            Partition_circuit_tmp = Circuit(N)
-            Partition_circuit_tmp.add_Permutation(P_i)
-            Partition_circuit_tmp.add_Circuit(Partition_circuit)
-            Partition_circuit_tmp.add_Permutation(P_o_initial)
-            synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-            result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters)
-
-        P_i_best, _ = result.get_best_result()[0]
-        for P_o in perumations_all:
-            Partition_circuit_tmp = Circuit(N)
-            Partition_circuit_tmp.add_Permutation(P_i_best)
-            Partition_circuit_tmp.add_Circuit(Partition_circuit)
-            Partition_circuit_tmp.add_Permutation(P_o)
-            synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-            result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters)
+        for topology_idx in range(len(topologies)):
+            mini_topology = topologies[topology_idx]
+            P_o_initial = np.random.choice(perumations_all)
+            for P_i in perumations_all:
+                Partition_circuit_tmp = Circuit(N)
+                Partition_circuit_tmp.add_Permutation(P_i)
+                Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                Partition_circuit_tmp.add_Permutation(P_o_initial)
+                synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
+
+            P_i_best, _ = result.get_best_result(topology_idx)[0]
+            for P_o in perumations_all:
+                Partition_circuit_tmp = Circuit(N)
+                Partition_circuit_tmp.add_Permutation(P_i_best)
+                Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                Partition_circuit_tmp.add_Permutation(P_o)
+                synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
         return result
 
     def SynthesizeWideCircuit(self, circ, orig_parameters):
@@ -175,3 +374,24 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
 
         subcircuits = partitioned_circuit.get_Gates()
+
+        optimized_results = [None] * len(subcircuits)
+
+        with Pool(processes=mp.cpu_count()) as pool:
+            for partition_idx, subcircuit in enumerate( subcircuits ):
+
+                start_idx = subcircuit.get_Parameter_Start_Index()
+                end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
+                subcircuit_parameters = parameters[ start_idx:end_idx ]
+                k = subcircuit.get_Qbit_Num()
+                mini_topologies = get_unique_subtopologies(self.topology, k)
+                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (subcircuit, subcircuit_parameters, self.config, mini_topologies) )
+
+        for partition_idx, subcircuit in enumerate( subcircuits ):
+            optimized_results[partition_idx].wait()
+            optimized_results[partition_idx] = optimized_results[partition_idx].get()
+
+        weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
+        parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L], fusion_info)
+        L = topo_sort_partitions(circ, self.max_partition_size, parts)
+        print(L)

From 881a0af17eeefd7dc8821787d58037a23d834537 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 29 Oct 2025 14:08:54 +0100
Subject: [PATCH 005/232] Import Partition Aware Mapping module to enhance
 circuit optimization capabilities.

---
 squander/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/__init__.py b/squander/__init__.py
index 541798ee7..07be6fa08 100644
--- a/squander/__init__.py
+++ b/squander/__init__.py
@@ -14,7 +14,7 @@
 
 # optimization of wide circuits (optimize wide circuits)
 from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization
-
+from squander.decomposition.qgd_Partition_Aware_Mapping import qgd_Partition_Aware_Mapping as Partition_Aware_Mapping
 # variational quantum solver
 from squander.variational_quantum_eigensolver.qgd_Variational_Quantum_Eigensolver_Base import qgd_Variational_Quantum_Eigensolver_Base as Variational_Quantum_Eigensolver
 from squander.variational_quantum_eigensolver.qgd_Generative_Quantum_Machine_Learning_Base import qgd_Generative_Quantum_Machine_Learning_Base as Generative_Quantum_Machine_Learning

From fb402d1b669499d0c304fbc5e4b6463cdeb35df5 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 29 Oct 2025 14:10:59 +0100
Subject: [PATCH 006/232] fix init

---
 squander/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/__init__.py b/squander/__init__.py
index 07be6fa08..661fc0286 100644
--- a/squander/__init__.py
+++ b/squander/__init__.py
@@ -14,7 +14,7 @@
 
 # optimization of wide circuits (optimize wide circuits)
 from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization
-from squander.decomposition.qgd_Partition_Aware_Mapping import qgd_Partition_Aware_Mapping as Partition_Aware_Mapping
+from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping as Partition_Aware_Mapping
 # variational quantum solver
 from squander.variational_quantum_eigensolver.qgd_Variational_Quantum_Eigensolver_Base import qgd_Variational_Quantum_Eigensolver_Base as Variational_Quantum_Eigensolver
 from squander.variational_quantum_eigensolver.qgd_Generative_Quantum_Machine_Learning_Base import qgd_Generative_Quantum_Machine_Learning_Base as Generative_Quantum_Machine_Learning

From 853b3192dd374eba8ed19be44bf04d9641367639 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 29 Oct 2025 14:26:34 +0100
Subject: [PATCH 007/232] fix type

---
 squander/synthesis/PartAM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 66d017c59..350f1c148 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -349,7 +349,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
-        partitined_circuit = Circuit( qbit_num_orig_circuit )
+        partitioned_circuit = Circuit( qbit_num_orig_circuit )
         params = []
         for part in allparts:
             surrounded_chains = {t for s in part for t in go[s] if t in single_qubit_chains_prepost and go[single_qubit_chains_prepost[t][-1]] and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}

From 8a5fa970a4ddbc81a68bce5395c077099f72a07f Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Wed, 29 Oct 2025 09:49:57 -0400
Subject: [PATCH 008/232] Implement decomposition strategy selection in
 DecomposePartition_and_Perm function and update supported gates in
 qasm_to_squander_circuit.

---
 squander/synthesis/PartAM.py | 50 +++++++++++++++++-------------------
 squander/utils.py            |  3 +--
 2 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 350f1c148..6b6a6fd9a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -26,6 +26,28 @@
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from itertools import product
 
+def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
+    """
+    Call to decompose a partition
+    """
+    strategy = config["strategy"]
+    if strategy == "TreeSearch":
+        cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+    elif strategy == "TabuSearch":
+        cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
+    elif strategy == "Adaptive":
+        cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
+    else:
+        raise Exception(f"Unsupported decomposition type: {strategy}")
+    cDecompose.set_Verbose( config["verbosity"] )
+    cDecompose.set_Cost_Function_Variant( 3 )	
+    cDecompose.set_Optimization_Tolerance( config["tolerance"] )
+    cDecompose.set_Optimizer( config["optimizer"] )
+    cDecompose.Start_Decomposition()
+    squander_circuit = cDecompose.get_Circuit()
+    parameters       = cDecompose.get_Optimized_Parameters()
+    return squander_circuit, parameters
+
 def get_all_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
     """
     Find ALL connected subtopologies with exactly k qubits using DFS.
@@ -287,29 +309,6 @@ def __init__(self, config):
             raise Exception(f"The routed parameter should be a bool.")
         self.partition_strategy = self.config['partition_strategy']
         allowed_partition_strategies = ['ilp', 'tdag', 'kahn', 'qiskit', 'qiskit-fusion', 'bqskit-Quick', 'bqskit-Scan', 'bqskit-Greedy', 'bqskit-Cluster']
-    
-    @staticmethod
-    def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
-        """
-        Call to decompose a partition
-        """
-        strategy = config["strategy"]
-        if strategy == "TreeSearch":
-            cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
-        elif strategy == "TabuSearch":
-            cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
-        elif strategy == "Adaptive":
-            cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
-        else:
-            raise Exception(f"Unsupported decomposition type: {strategy}")
-        cDecompose.set_Verbose( config["verbosity"] )
-        cDecompose.set_Cost_Function_Variant( 3 )	
-        cDecompose.set_Optimization_Tolerance( config["tolerance"] )
-        cDecompose.set_Optimizer( config["optimizer"] )
-        cDecompose.Start_Decomposition()
-        squander_circuit = cDecompose.get_Circuit()
-        parameters       = cDecompose.get_Optimized_Parameters()
-        return squander_circuit, parameters
 
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies) -> PartitionSynthesisResult:
@@ -317,13 +316,12 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
         Call to decompose a partition sequentially
         """
         N = Partition_circuit.get_Qbit_Num()
-        perumations_all = permutations(range(N))
-        N_permutations = len(perumations_all)
+        perumations_all = list(permutations(range(N)))
         result = PartitionSynthesisResult(N, topologies)
         # Sequential permutation search
         for topology_idx in range(len(topologies)):
             mini_topology = topologies[topology_idx]
-            P_o_initial = np.random.choice(perumations_all)
+            P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
             for P_i in perumations_all:
                 Partition_circuit_tmp = Circuit(N)
                 Partition_circuit_tmp.add_Permutation(P_i)
diff --git a/squander/utils.py b/squander/utils.py
index e5a459940..d0dc234a3 100644
--- a/squander/utils.py
+++ b/squander/utils.py
@@ -129,8 +129,7 @@ def qasm_to_squander_circuit( filename: str, return_transpiled=False):
     """
     
     qc = qiskit.QuantumCircuit.from_qasm_file(filename)
-    from squander.gates import gates_Wrapper as gate
-    SUPPORTED_GATES_NAMES = {n.lower().replace("cnot", "cx") for n in dir(gate) if not n.startswith("_") and issubclass(getattr(gate, n), gate.Gate) and n != "Gate"}
+    SUPPORTED_GATES_NAMES = {'u1','p','u2','u3','u','cu','cx','cry','crz','crx','cu1','cz','ch','rx','ry','rz','h','x','y','z','s','sdg','sx','t','tdg','r','ccx','cswap'}
     if any(gate.operation.name not in SUPPORTED_GATES_NAMES for gate in qc.data):
         qc_transpiled = qiskit.transpile(qc, basis_gates=SUPPORTED_GATES_NAMES, optimization_level=0)
     else:

From 579d7788221f75bc2491d9df5c27b13a5b9702ea Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 30 Oct 2025 05:51:28 -0400
Subject: [PATCH 009/232] Partial commit

---
 squander/synthesis/PartAM.py | 43 ++++++++++++++++++++++++++++++------
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6b6a6fd9a..ef2a947c1 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -19,6 +19,7 @@
 from multiprocessing import Process, Pool
 import os
 from typing import List, Set, Tuple, FrozenSet
+from tqdm import tqdm
 
 
 from squander.partitioning.partition import PartitionCircuit
@@ -376,7 +377,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         optimized_results = [None] * len(subcircuits)
 
         with Pool(processes=mp.cpu_count()) as pool:
-            for partition_idx, subcircuit in enumerate( subcircuits ):
+            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Synthesizing partitions") ):
 
                 start_idx = subcircuit.get_Parameter_Start_Index()
                 end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
@@ -385,11 +386,39 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 mini_topologies = get_unique_subtopologies(self.topology, k)
                 optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (subcircuit, subcircuit_parameters, self.config, mini_topologies) )
 
-        for partition_idx, subcircuit in enumerate( subcircuits ):
-            optimized_results[partition_idx].wait()
-            optimized_results[partition_idx] = optimized_results[partition_idx].get()
+            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Processing partitions") ):
+                optimized_results[partition_idx] = optimized_results[partition_idx].get()
 
         weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
-        parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L], fusion_info)
-        L = topo_sort_partitions(circ, self.max_partition_size, parts)
-        print(L)
+        L, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+        # Create a mapping from partition frozensets to their indices in allparts
+        partition_to_idx = {frozenset(part): i for i, part in enumerate(allparts)}
+
+        # Convert the returned partitions to indices
+        L_indices = [partition_to_idx[frozenset(part)] for part in L_parts]
+
+        # Now directly select the already-optimized subcircuits using the indices
+        selected_optimized_subcircuits = [optimized_subcircuits[i] for i in L_indices]
+
+  max_gates = max(len(c.get_Gates()) for c in optimized_subcircuits)
+  def to_cost(d): return d.get('CNOT', 0)*max_gates + sum(d[x] for x in d if x != 'CNOT')
+  weights = [to_cost(circ.get_Gate_Nums()) for circ in optimized_subcircuits[:len(allparts)]]
+
+  # ilp_global_optimal returns the selected partitions as frozensets
+  L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+
+  # Create a mapping from partition frozensets to their indices in allparts
+  partition_to_idx = {frozenset(part): i for i, part in enumerate(allparts)}
+
+  # Convert the returned partitions to indices
+  L_indices = [partition_to_idx[frozenset(part)] for part in L_parts]
+
+  # Now directly select the already-optimized subcircuits using the indices
+  selected_optimized_subcircuits = [optimized_subcircuits[i] for i in L_indices]
+  selected_parameters = [optimized_parameter_list[i] for i in L_indices]
+
+  # Construct the final circuit from the optimized subcircuits
+  wide_circuit, wide_parameters = self.ConstructCircuitFromPartitions(
+      selected_optimized_subcircuits,
+      selected_parameters
+  )

From f649965b81079523e9a74ee48e2743cc3027142d Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Oct 2025 18:55:25 +0100
Subject: [PATCH 010/232] reowrk permutation gate to be more efficient

---
 squander/src-cpp/gates/Permutation.cpp        |  70 +++++++++++-
 squander/src-cpp/gates/include/Permutation.h  |   7 ++
 .../apply_dedicated_gate_kernel_to_input.cpp  | 101 +++++++++++++-----
 .../apply_dedicated_gate_kernel_to_input.h    |  14 +++
 4 files changed, 163 insertions(+), 29 deletions(-)

diff --git a/squander/src-cpp/gates/Permutation.cpp b/squander/src-cpp/gates/Permutation.cpp
index 7216525db..e8eefb1cc 100644
--- a/squander/src-cpp/gates/Permutation.cpp
+++ b/squander/src-cpp/gates/Permutation.cpp
@@ -29,6 +29,8 @@ Permutation::Permutation(){
     target_qbits.clear();
     control_qbits.clear();
     parameter_num = 0;
+    cycles_cache_valid = false;
+    cycles_cache_matrix_size = 0;
 }
 
 Permutation::Permutation(int qbit_num_in, const std::vector<int>& pattern_in) : Gate(qbit_num_in) {
@@ -47,6 +49,8 @@ Permutation::Permutation(int qbit_num_in, const std::vector<int>& pattern_in) :
     for (int idx=0; idx<qbit_num_in; idx++){
         target_qbits[idx] = idx;
     }
+    cycles_cache_valid = false;
+    cycles_cache_matrix_size = 0;
 }
 Permutation::~Permutation(){
     target_qbits.clear();
@@ -64,14 +68,33 @@ Matrix Permutation::get_matrix(int parallel){
 }
 
 void Permutation::apply_to(Matrix& input, int parallel){
-    apply_Permutation_kernel_to_input(input, pattern, matrix_size);
+    if (input.rows != matrix_size) {
+        std::string err("Permutation::apply_to: Wrong input size in Permutation gate apply");
+        throw err;
+    }
+
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    if (parallel == 2) {
+        apply_Permutation_kernel_to_input_tbb(input, pattern, matrix_size, cycles_cache);
+    }
+    else if (parallel == 1) {
+        apply_Permutation_kernel_to_input_omp(input, pattern, matrix_size, cycles_cache);
+    }
+    else {
+        apply_Permutation_kernel_to_input(input, pattern, matrix_size, cycles_cache);
+    }
 }
 void Permutation::apply_to(Matrix& input){
     if (input.rows != matrix_size) {
         std::string err("Permutation::apply_to: Wrong input size in Permutation gate apply");
         throw err;
     }
-    apply_Permutation_kernel_to_input(input, pattern, matrix_size);
+    if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) {
+        build_cycles_cache();
+    }
+    apply_Permutation_kernel_to_input(input, pattern, matrix_size, cycles_cache);
 }
 
 void Permutation::apply_to_list(std::vector<Matrix>& inputs, int parallel){
@@ -111,6 +134,7 @@ std::vector<int> Permutation::get_pattern(){
 
 void Permutation::set_pattern(const std::vector<int>& pattern_in){
     pattern = pattern_in;
+    invalidate_cache();
 }
 
 std::vector<int> Permutation::get_involved_qubits(bool only_target){
@@ -136,4 +160,46 @@ void Permutation::reorder_qubits(std::vector<int> qbit_list){
         new_pattern[idx] = std::find(qbit_list.begin(), qbit_list.end(), pattern[idx]) - qbit_list.begin();
     }
     pattern = new_pattern;
+    invalidate_cache();
+}
+
+void Permutation::invalidate_cache(){
+    cycles_cache_valid = false;
+    cycles_cache.clear();
+    cycles_cache_matrix_size = 0;
+}
+
+void Permutation::build_cycles_cache(){
+    cycles_cache.clear();
+    cycles_cache_matrix_size = matrix_size;
+
+    int qbit_num = pattern.size();
+
+    // Precompute next index for all rows once to avoid repeated bit work in cycle walks
+    std::vector<int> next_index(matrix_size);
+    for (int row_idx = 0; row_idx < matrix_size; ++row_idx) {
+        int new_row_idx = 0;
+        for (int idx = 0; idx < qbit_num; idx++) {
+            int bit = (row_idx >> pattern[idx]) & 1;
+            new_row_idx |= (bit << idx);
+        }
+        next_index[row_idx] = new_row_idx;
+    }
+
+    std::vector<uint8_t> visited(matrix_size, 0);
+    for (int start = 0; start < matrix_size; ++start) {
+        if (visited[start]) continue;
+        std::vector<int> cycle;
+        int current = start;
+        while (!visited[current]) {
+            visited[current] = 1;
+            cycle.push_back(current);
+            current = next_index[current];
+        }
+        if (cycle.size() > 1) {
+            cycles_cache.push_back(std::move(cycle));
+        }
+    }
+
+    cycles_cache_valid = true;
 }
\ No newline at end of file
diff --git a/squander/src-cpp/gates/include/Permutation.h b/squander/src-cpp/gates/include/Permutation.h
index 9d95aeea1..e8999b82a 100644
--- a/squander/src-cpp/gates/include/Permutation.h
+++ b/squander/src-cpp/gates/include/Permutation.h
@@ -33,6 +33,13 @@ class Permutation : public Gate {
 
 protected:
     std::vector<int> pattern;
+    // Cached cycles for current pattern and matrix size
+    std::vector<std::vector<int>> cycles_cache;
+    int cycles_cache_matrix_size = 0;
+    bool cycles_cache_valid = false;
+
+    void invalidate_cache();
+    void build_cycles_cache();
 
 public:
     Permutation();
diff --git a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
index ed39eede8..8e2f7c160 100644
--- a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
+++ b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp
@@ -311,42 +311,89 @@ void apply_SWAP_kernel_to_input(Matrix& input, const std::vector<int>& target_qb
 void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size){
 
     int qbit_num = pattern.size();
-    
-    std::unordered_map<int,int> pattern_map;
-    for (int row_idx=0; row_idx<matrix_size; row_idx++){
-       std::vector<int> old_bits(qbit_num);
-       for (int idx=0; idx<qbit_num; idx++){
-        old_bits[idx] = (row_idx >> pattern[idx]) & 1;
-       }
-       int new_row_idx = 0;
-       for (int idx=0; idx<qbit_num; idx++){
-        new_row_idx |= old_bits[idx] << idx;
-       }
-       pattern_map[row_idx] = new_row_idx;
-    }
 
-    std::unordered_set<int> visited_rows;
-    std::vector<std::vector<int>> row_cycles;
-    for (const auto& [start, _] : pattern_map){
-        if (visited_rows.count(start) ) continue;
+    auto permuted_index = [&](int row_idx) -> int {
+        int new_row_idx = 0;
+        for (int idx = 0; idx < qbit_num; idx++) {
+            int bit = (row_idx >> pattern[idx]) & 1;
+            new_row_idx |= (bit << idx);
+        }
+        return new_row_idx;
+    };
+
+    std::vector<uint8_t> visited(matrix_size, 0);
+
+    for (int start = 0; start < matrix_size; ++start) {
+        if (visited[start]) continue;
 
         std::vector<int> cycle;
         int current = start;
-        while (!visited_rows.count(current)){
+        while (!visited[current]) {
+            visited[current] = 1;
             cycle.push_back(current);
-            visited_rows.insert(current);
-            current = pattern_map[current];
+            current = permuted_index(current);
         }
-        if (cycle.size() > 1){
-            row_cycles.push_back(cycle);
+
+        if (cycle.size() <= 1) continue;
+
+        for (size_t idx = 0; idx < cycle.size() - 1; idx++) {
+            std::swap_ranges(
+                input.get_data() + cycle[idx] * input.stride,
+                input.get_data() + cycle[idx] * input.stride + input.cols,
+                input.get_data() + cycle[idx + 1] * input.stride
+            );
+        }
+    }
+}
+
+// Overload that applies permutation using precomputed cycles
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern; // currently unused, kept for interface symmetry / potential validation
+    (void)matrix_size; // rows already validated by caller
+
+    for (const auto& cycle : cycles) {
+        for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) {
+            std::swap_ranges(
+                input.get_data() + cycle[idx] * input.stride,
+                input.get_data() + cycle[idx] * input.stride + input.cols,
+                input.get_data() + cycle[idx + 1] * input.stride
+            );
         }
     }
-    for (const auto& cycle : row_cycles){
-        for (size_t idx=0; idx<cycle.size()-1; idx++){
+}
+
+void apply_Permutation_kernel_to_input_tbb(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern;
+    (void)matrix_size;
+
+    tbb::parallel_for(tbb::blocked_range<size_t>(0, cycles.size(), 64),
+        [&](const tbb::blocked_range<size_t>& range) {
+            for (size_t cdx = range.begin(); cdx != range.end(); ++cdx) {
+                const auto& cycle = cycles[cdx];
+                for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) {
+                    std::swap_ranges(
+                        input.get_data() + cycle[idx] * input.stride,
+                        input.get_data() + cycle[idx] * input.stride + input.cols,
+                        input.get_data() + cycle[idx + 1] * input.stride
+                    );
+                }
+            }
+        }
+    );
+}
+
+void apply_Permutation_kernel_to_input_omp(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles){
+    (void)pattern;
+    (void)matrix_size;
+
+    #pragma omp parallel for schedule(static)
+    for (int cdx = 0; cdx < (int)cycles.size(); ++cdx) {
+        const auto& cycle = cycles[cdx];
+        for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) {
             std::swap_ranges(
-                input.get_data() + cycle[idx]*input.stride,
-                input.get_data() + cycle[idx]*input.stride + input.cols,
-                input.get_data() + cycle[idx+1]*input.stride
+                input.get_data() + cycle[idx] * input.stride,
+                input.get_data() + cycle[idx] * input.stride + input.cols,
+                input.get_data() + cycle[idx + 1] * input.stride
             );
         }
     }
diff --git a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
index 46cdb1b03..8e21aaa34 100644
--- a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
+++ b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h
@@ -83,6 +83,20 @@ void apply_SWAP_kernel_to_input(Matrix& input, const std::vector<int>& target_qb
  */
 void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size);
 
+/**
+ * @brief Applies the Permutation gate kernel using precomputed cycles.
+ *
+ * @param input The input matrix on which the transformation is applied.
+ * @param pattern The pattern of the permutation (used only for validation or future extensions).
+ * @param matrix_size The size of the input.
+ * @param cycles The disjoint cycles of row indices representing the permutation.
+ */
+void apply_Permutation_kernel_to_input(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+
+// Parallelized versions for permutation with precomputed cycles
+void apply_Permutation_kernel_to_input_tbb(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+void apply_Permutation_kernel_to_input_omp(Matrix& input, const std::vector<int>& pattern, const int& matrix_size, const std::vector<std::vector<int>>& cycles);
+
 // TBB Parallelized versions
 void apply_X_kernel_to_input_tbb(Matrix& input, const std::vector<int>& target_qbits, const std::vector<int>& control_qbits, const int& matrix_size);
 void apply_Y_kernel_to_input_tbb(Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size);

From 9001704fd055d5549c353d4e04df49da9899096f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Oct 2025 18:56:16 +0100
Subject: [PATCH 011/232] Reowrk PartAM

---
 examples/decomposition/PartAM_example.py |  56 +++++++++
 squander/synthesis/PartAM.py             | 146 +++++++++++------------
 2 files changed, 123 insertions(+), 79 deletions(-)
 create mode 100644 examples/decomposition/PartAM_example.py

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
new file mode 100644
index 000000000..ed44c59c8
--- /dev/null
+++ b/examples/decomposition/PartAM_example.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 26 14:42:56 2020
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+@author: Peter Rakyta, Ph.D.
+"""
+## \file wide_circuit_optimization.py
+## \brief Simple example python code demonstrating a wide circuit optimization
+
+import squander.decomposition.qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization
+from squander import Partition_Aware_Mapping
+from squander import utils
+from squander import Qiskit_IO
+import time
+from squander import Circuit
+import numpy as np
+if __name__ == '__main__':
+
+
+    config = {
+            'strategy': "TreeSearch",
+            'test_subcircuits': True,
+            'test_final_circuit': True,
+            'max_partition_size': 3,
+            'minimum_partition_size': 2,  # Minimum qubits per partition (default: 2)
+    }
+
+    filename = "benchmarks/qfast/5q/vqe.qasm"
+    start_time = time.time()
+
+    # load the circuit from a file
+    circ, parameters = utils.qasm_to_squander_circuit(filename)
+    config['topology'] = [
+    (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
+    (8, 9), (8, 10), (8, 11), (8, 12), (8, 13), (8, 14), (8, 15),
+    (0, 8),
+    ]
+    wide_circuit_optimizer = Partition_Aware_Mapping( config )
+    wide_circuit_optimizer.SynthesizeWideCircuit( circ, parameters )
+
+    print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
+
+
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ef2a947c1..247f1cf95 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -241,6 +241,13 @@ def extract_subtopology(involved_qbits, qbit_map, config ):
             mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
     return mini_topology
 
+class SingleQubitPartitionResult:
+    def __init__(self,circuit_in,parameters_in):
+        self.circuit = circuit_in
+        self.parameters = parameters_in
+    def get_partition_synthesis_score(self):
+        return 0
+
 class PartitionSynthesisResult:
     def __init__(self, N , mini_topologies):
         self.mini_topologies = mini_topologies
@@ -291,58 +298,62 @@ def __init__(self, config):
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
             raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
-        parallel = self.config['parallel']
-        allowed_parallel = [0, 1, 2]
-        if not parallel in allowed_parallel:
-            raise Exception(f"The parallel configuration should be either of {allowed_parallel}, got {parallel}.")
-        verbosity = self.config['verbosity']
-        if not isinstance(verbosity, int):
-            raise Exception(f"The verbosity parameter should be an integer.")
-
-        self.max_partition_size = self.config['max_partition_size']
-        if not isinstance(self.max_partition_size, int):
-            raise Exception(f"The max_partition_size parameter should be an integer.")
-        self.topology = self.config['topology']
-        if not isinstance(self.topology, list):
-            raise Exception(f"The topology parameter should be a list.")
-        self.routed = self.config['routed']
-        if not isinstance(self.routed, bool):
-            raise Exception(f"The routed parameter should be a bool.")
-        self.partition_strategy = self.config['partition_strategy']
-        allowed_partition_strategies = ['ilp', 'tdag', 'kahn', 'qiskit', 'qiskit-fusion', 'bqskit-Quick', 'bqskit-Scan', 'bqskit-Greedy', 'bqskit-Cluster']
-
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies) -> PartitionSynthesisResult:
         """
         Call to decompose a partition sequentially
         """
         N = Partition_circuit.get_Qbit_Num()
-        perumations_all = list(permutations(range(N)))
-        result = PartitionSynthesisResult(N, topologies)
-        # Sequential permutation search
-        for topology_idx in range(len(topologies)):
-            mini_topology = topologies[topology_idx]
-            P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
-            for P_i in perumations_all:
-                Partition_circuit_tmp = Circuit(N)
-                Partition_circuit_tmp.add_Permutation(P_i)
-                Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                Partition_circuit_tmp.add_Permutation(P_o_initial)
-                synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
-
-            P_i_best, _ = result.get_best_result(topology_idx)[0]
-            for P_o in perumations_all:
-                Partition_circuit_tmp = Circuit(N)
-                Partition_circuit_tmp.add_Permutation(P_i_best)
-                Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                Partition_circuit_tmp.add_Permutation(P_o)
-                synthesised_circuit, synthesised_parameters = DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
+        if N !=1:
+            perumations_all = list(permutations(range(N)))
+            result = PartitionSynthesisResult(N, topologies)
+            # Sequential permutation search
+            for topology_idx in range(len(topologies)):
+                mini_topology = topologies[topology_idx]
+                P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
+                for P_i in perumations_all:
+                    Partition_circuit_tmp = Circuit(N)
+                    Partition_circuit_tmp.add_Permutation(P_i)
+                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                    Partition_circuit_tmp.add_Permutation(P_o_initial)
+                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                    result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
+
+                P_i_best, _ = result.get_best_result(topology_idx)[0]
+                for P_o in perumations_all:
+                    Partition_circuit_tmp = Circuit(N)
+                    Partition_circuit_tmp.add_Permutation(P_i_best)
+                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                    Partition_circuit_tmp.add_Permutation(P_o)
+                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                    result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
+        else:
+            result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
         return result
-
+    @staticmethod
+    def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
+        """
+        Call to decompose a partition
+        """
+        strategy = config["strategy"]
+        if strategy == "TreeSearch":
+            cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+        elif strategy == "TabuSearch":
+            cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
+        elif strategy == "Adaptive":
+            cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
+        else:
+            raise Exception(f"Unsupported decomposition type: {strategy}")
+        cDecompose.set_Verbose( config["verbosity"] )
+        cDecompose.set_Cost_Function_Variant( 3 )	
+        cDecompose.set_Optimization_Tolerance( config["tolerance"] )
+        cDecompose.set_Optimizer( config["optimizer"] )
+        cDecompose.Start_Decomposition()
+        squander_circuit = cDecompose.get_Circuit()
+        parameters       = cDecompose.get_Optimized_Parameters()
+        return squander_circuit, parameters
     def SynthesizeWideCircuit(self, circ, orig_parameters):
-        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(circ, self.max_partition_size)
+        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_qubit = get_all_partitions(circ, self.config["max_partition_size"])
         qbit_num_orig_circuit = circ.get_Qbit_Num()
         gate_dict = {i: gate for i, gate in enumerate(circ.get_Gates())}
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
@@ -360,6 +371,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
                 params.append(orig_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
             partitioned_circuit.add_Circuit(c)
+        # Only add single-qubit chains as separate partitions if minimum_partition_size allows it
         for chain in single_qubit_chains:
             c = Circuit( qbit_num_orig_circuit )
             for gate_idx in chain:
@@ -383,42 +395,18 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
                 subcircuit_parameters = parameters[ start_idx:end_idx ]
                 k = subcircuit.get_Qbit_Num()
-                mini_topologies = get_unique_subtopologies(self.topology, k)
-                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (subcircuit, subcircuit_parameters, self.config, mini_topologies) )
+                qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
+                involved_qbits = subcircuit.get_Qbits()
+
+                qbit_num = len( involved_qbits )
+                mini_topologies = get_unique_subtopologies(self.topology, qbit_num)
+                qbit_map = {}
+                for idx in range( len(involved_qbits) ):
+                    qbit_map[ involved_qbits[idx] ] = idx
+                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
+                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Processing partitions") ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
 
-        weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
-        L, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-        # Create a mapping from partition frozensets to their indices in allparts
-        partition_to_idx = {frozenset(part): i for i, part in enumerate(allparts)}
-
-        # Convert the returned partitions to indices
-        L_indices = [partition_to_idx[frozenset(part)] for part in L_parts]
-
-        # Now directly select the already-optimized subcircuits using the indices
-        selected_optimized_subcircuits = [optimized_subcircuits[i] for i in L_indices]
-
-  max_gates = max(len(c.get_Gates()) for c in optimized_subcircuits)
-  def to_cost(d): return d.get('CNOT', 0)*max_gates + sum(d[x] for x in d if x != 'CNOT')
-  weights = [to_cost(circ.get_Gate_Nums()) for circ in optimized_subcircuits[:len(allparts)]]
-
-  # ilp_global_optimal returns the selected partitions as frozensets
-  L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-
-  # Create a mapping from partition frozensets to their indices in allparts
-  partition_to_idx = {frozenset(part): i for i, part in enumerate(allparts)}
-
-  # Convert the returned partitions to indices
-  L_indices = [partition_to_idx[frozenset(part)] for part in L_parts]
-
-  # Now directly select the already-optimized subcircuits using the indices
-  selected_optimized_subcircuits = [optimized_subcircuits[i] for i in L_indices]
-  selected_parameters = [optimized_parameter_list[i] for i in L_indices]
-
-  # Construct the final circuit from the optimized subcircuits
-  wide_circuit, wide_parameters = self.ConstructCircuitFromPartitions(
-      selected_optimized_subcircuits,
-      selected_parameters
-  )
+        weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
\ No newline at end of file

From 829535e4b76c635cfa5fc4c776bd7a19acc2413a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 31 Oct 2025 11:19:20 +0100
Subject: [PATCH 012/232] Change example + remove resynthesis

---
 .../wide_circuit_optimization.py              | 35 +++----------------
 .../qgd_Wide_Circuit_Optimization.py          | 20 ++++++++---
 2 files changed, 21 insertions(+), 34 deletions(-)

diff --git a/examples/decomposition/wide_circuit_optimization.py b/examples/decomposition/wide_circuit_optimization.py
index 7ab21525b..62a96a8e4 100644
--- a/examples/decomposition/wide_circuit_optimization.py
+++ b/examples/decomposition/wide_circuit_optimization.py
@@ -21,12 +21,12 @@
 ## \brief Simple example python code demonstrating a wide circuit optimization
 
 import squander.decomposition.qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization
+from squander import Partition_Aware_Mapping
 from squander import utils
 from squander import Qiskit_IO
-import time, requests, os, zipfile, tempfile
-from pathlib import Path
-    
-
+import time
+from squander import Circuit
+import numpy as np
 if __name__ == '__main__':
 
 
@@ -37,32 +37,7 @@
             'max_partition_size': 3,
     }
 
-    zip_url = "https://zenodo.org/records/17293975/files/benchmark_circuit_QMill_IBM.zip?download=1"
-    temp_dir = tempfile.mkdtemp(prefix="repos_")
-    zip_path = os.path.join(temp_dir, "benchmark_circuit_QMill_IBM.zip")
-    qasm_files = []
-    # Download zip
-    r = requests.get(zip_url, stream=True)
-    if r.status_code != 200:
-        raise RuntimeError(f"Failed to download {zip_url}: HTTP {r.status_code}")
-    with open(zip_path, "wb") as f:
-        for chunk in r.iter_content(chunk_size=8192):
-            f.write(chunk)
-
-    # Extract zip
-    extract_path = os.path.join(temp_dir, "benchmark_circuit_QMill_IBM")
-    with zipfile.ZipFile(zip_path, "r") as zf:
-        zf.extractall(extract_path)
-
-    # Find QASM files
-    for path in Path(extract_path).rglob("*.qasm"):
-        qasm_files.append(str(path.resolve()))
-
-    #filename = next(x for x in qasm_files if x.endswith("mod5_4_qmill_ibm.qasm"))
-    #filename = next(x for x in qasm_files if x.endswith("gf2^E8_mult_qmill_ibm.qasm"))
-    filename = next(x for x in qasm_files if x.endswith("csum_mux_9_qmill_ibm.qasm"))    
-
-    #filename = "examples/partitioning/qasm_samples/heisenberg-16-20.qasm"
+    filename = "examples/partitioning/qasm_samples/heisenberg-16-20.qasm"
     start_time = time.time()
 
     # load the circuit from a file
diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index a73b982a2..6bcac1d88 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -463,10 +463,22 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
             max_gates = max(len(c.get_Gates()) for c in optimized_subcircuits)
             def to_cost(d): return d.get('CNOT', 0)*max_gates + sum(d[x] for x in d if x != 'CNOT')
             weights = [to_cost(circ.get_Gate_Nums()) for circ in optimized_subcircuits[:len(allparts)]]
-            L, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-            parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L], fusion_info)
-            L = topo_sort_partitions(circ, self.max_partition_size, parts)
-            return self.OptimizeWideCircuit(circ, orig_parameters, global_min=False, prepartitioning=[parts[i] for i in L])
+            L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+            # Create a mapping from partition frozensets to their indices in allparts
+            partition_to_idx = {frozenset(part): i for i, part in enumerate(allparts)}
+
+            # Convert the returned partitions to indices
+            L_indices = [partition_to_idx[frozenset(part)] for part in L_parts]
+
+            # Now directly select the already-optimized subcircuits using the indices
+            selected_optimized_subcircuits = [optimized_subcircuits[i] for i in L_indices]
+            selected_parameters = [optimized_parameter_list[i] for i in L_indices]
+
+            # Construct the final circuit from the optimized subcircuits
+            return self.ConstructCircuitFromPartitions(
+                selected_optimized_subcircuits,
+                selected_parameters
+            )
             """
             Lgate = [set(allparts[i]) for i in L]
             for part in Lgate:

From f8341fd312a058fffd426d9e22b34d78df9fd41a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 31 Oct 2025 16:37:15 +0100
Subject: [PATCH 013/232] Refactor circuit optimization process by streamlining
 partition handling and enhancing final mapping retrieval. Update to use
 optimized subcircuits directly in the construction of the final circuit.

---
 .../qgd_Wide_Circuit_Optimization.py          | 22 +++++--------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 6bcac1d88..1bbc02ff5 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -463,22 +463,10 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
             max_gates = max(len(c.get_Gates()) for c in optimized_subcircuits)
             def to_cost(d): return d.get('CNOT', 0)*max_gates + sum(d[x] for x in d if x != 'CNOT')
             weights = [to_cost(circ.get_Gate_Nums()) for circ in optimized_subcircuits[:len(allparts)]]
-            L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-            # Create a mapping from partition frozensets to their indices in allparts
-            partition_to_idx = {frozenset(part): i for i, part in enumerate(allparts)}
-
-            # Convert the returned partitions to indices
-            L_indices = [partition_to_idx[frozenset(part)] for part in L_parts]
-
-            # Now directly select the already-optimized subcircuits using the indices
-            selected_optimized_subcircuits = [optimized_subcircuits[i] for i in L_indices]
-            selected_parameters = [optimized_parameter_list[i] for i in L_indices]
-
-            # Construct the final circuit from the optimized subcircuits
-            return self.ConstructCircuitFromPartitions(
-                selected_optimized_subcircuits,
-                selected_parameters
-            )
+            L, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+            parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L], fusion_info)
+            L = topo_sort_partitions(circ, self.max_partition_size, parts)
+            return self.OptimizeWideCircuit(circ, orig_parameters, global_min=False, prepartitioning=[parts[i] for i in L])
             """
             Lgate = [set(allparts[i]) for i in L]
             for part in Lgate:
@@ -518,6 +506,6 @@ def route_circuit(self, circ: Circuit, orig_parameters: np.ndarray):
         sabre = SABRE(circ, self.config["topology"])
         Squander_remapped_circuit, parameters_remapped_circuit, pi, final_pi, swap_count = sabre.map_circuit(orig_parameters)
         self.config.setdefault("initial_mapping",pi)
-        self.config.setdefault("final_mapping",final_pi)
+        self.config.setdefault("final_mapping",sabre.get_inverse_pi(final_pi))
         self.config["routed"] = True
         return Squander_remapped_circuit, parameters_remapped_circuit

From 7e5a94b584d77aa37e8126e2402dc14ef5f44975 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 1 Nov 2025 15:24:13 +0100
Subject: [PATCH 014/232] Partial work on PartAM

---
 .../wide_circuit_optimization.py              |   4 +-
 squander/synthesis/PartAM.py                  | 205 +++++++++++++++++-
 2 files changed, 197 insertions(+), 12 deletions(-)

diff --git a/examples/decomposition/wide_circuit_optimization.py b/examples/decomposition/wide_circuit_optimization.py
index 62a96a8e4..89ca6d6f6 100644
--- a/examples/decomposition/wide_circuit_optimization.py
+++ b/examples/decomposition/wide_circuit_optimization.py
@@ -47,7 +47,7 @@
     wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
 
     # run circuti optimization
-    circ_flat, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters )
+    circ_flat, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters, True )
 
     config['topology'] = [
     (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
@@ -58,7 +58,7 @@
     circo = Qiskit_IO.get_Qiskit_Circuit(circ_flat.get_Flat_Circuit(),parameters)
     # run circuti optimization
     circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circo)
-    wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters )
+    wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters, True )
 
     print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 247f1cf95..7251f68f0 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -20,7 +20,8 @@
 import os
 from typing import List, Set, Tuple, FrozenSet
 from tqdm import tqdm
-
+from collections import deque, defaultdict
+import numpy as np
 
 from squander.partitioning.partition import PartitionCircuit
 from squander.partitioning.tools import get_qubits
@@ -249,7 +250,7 @@ def get_partition_synthesis_score(self):
         return 0
 
 class PartitionSynthesisResult:
-    def __init__(self, N , mini_topologies):
+    def __init__(self, N , mini_topologies, involved_qbits, qubit_map):
         self.mini_topologies = mini_topologies
         self.topology_count = len(mini_topologies)
         self.N = N
@@ -257,7 +258,8 @@ def __init__(self, N , mini_topologies):
         self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
         self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
         self.cnot_counts = [[] for _ in range(len(mini_topologies))]
-    
+        self.involved_qbits = involved_qbits
+        self.qubit_map = qubit_map
     def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
         self.permutations_pairs[topology_idx].append(permutations_pair)
         self.synthesised_circuits[topology_idx].append(synthesised_circuit)
@@ -299,14 +301,14 @@ def __init__(self, config):
         if not strategy in allowed_strategies:
             raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
     @staticmethod
-    def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies) -> PartitionSynthesisResult:
+    def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
         """
         Call to decompose a partition sequentially
         """
         N = Partition_circuit.get_Qbit_Num()
         if N !=1:
             perumations_all = list(permutations(range(N)))
-            result = PartitionSynthesisResult(N, topologies)
+            result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map)
             # Sequential permutation search
             for topology_idx in range(len(topologies)):
                 mini_topology = topologies[topology_idx]
@@ -352,8 +354,9 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
         squander_circuit = cDecompose.get_Circuit()
         parameters       = cDecompose.get_Optimized_Parameters()
         return squander_circuit, parameters
+
     def SynthesizeWideCircuit(self, circ, orig_parameters):
-        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_qubit = get_all_partitions(circ, self.config["max_partition_size"])
+        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(circ, self.config["max_partition_size"])
         qbit_num_orig_circuit = circ.get_Qbit_Num()
         gate_dict = {i: gate for i, gate in enumerate(circ.get_Gates())}
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
@@ -389,7 +392,42 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         optimized_results = [None] * len(subcircuits)
 
         with Pool(processes=mp.cpu_count()) as pool:
-            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Synthesizing partitions") ):
+            for partition_idx, subcircuit in enumerate( subcircuits ):
+
+                start_idx = subcircuit.get_Parameter_Start_Index()
+                end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
+                subcircuit_parameters = parameters[ start_idx:end_idx ]
+                k = subcircuit.get_Qbit_Num()
+                qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
+                involved_qbits = subcircuit.get_Qbits()
+
+                qbit_num = len( involved_qbits )
+                mini_topologies = get_unique_subtopologies(self.topology, qbit_num)
+                qbit_map = {}
+                for idx in range( len(involved_qbits) ):
+                    qbit_map[ involved_qbits[idx] ] = idx
+                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
+                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+
+            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis") ):
+                optimized_results[partition_idx] = optimized_results[partition_idx].get()
+
+        weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
+        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+        parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
+        L = topo_sort_partitions(circ, self.max_partition_size, parts)
+        from squander.partitioning.kahn import kahn_partition_preparts
+        from squander.partitioning.tools import translate_param_order
+        partitioned_circuit, param_order, _ = kahn_partition_preparts(circ, self.max_partition_size, [parts[i] for i in L])
+        parameters = translate_param_order(orig_parameters, param_order)
+
+        subcircuits = partitioned_circuit.get_Gates()
+
+        # the list of optimized subcircuits
+        optimized_results = [None] * len(subcircuits)
+
+        with Pool(processes=mp.cpu_count()) as pool:
+            for partition_idx, subcircuit in enumerate( subcircuits ):
 
                 start_idx = subcircuit.get_Parameter_Start_Index()
                 end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
@@ -404,9 +442,156 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
-                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies) )
+                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
-            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Processing partitions") ):
+            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis") ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
+        return optimized_results
 
-        weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
\ No newline at end of file
+    def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
+        optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
+        DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
+        D = self.compute_distances_bfs(circ.get_Qbit_Num())
+        pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
+        
+            
+    def construct_DAG_and_IDAG(self, optimized_partitions):
+        DAG = []
+        IDAG = []
+        for idx in range(len(optimized_partitions)):
+            if idx != len(optimized_partitions)-1:
+                Involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
+                for next_idx in range(idx+1, len(optimized_partitions)):
+                    Involved_qbits_next = optimized_partitions[next_idx].involved_qbits
+                    intersection = [i for i in Involved_qbits_current if i in Involved_qbits_next]
+                    if len(intersection) > 0:
+                        DAG.append((idx, next_idx))
+                    for intersection_qbit in intersection:
+                        Involved_qbits_current.remove(intersection_qbit)
+                    if len(Involved_qbits_current) == 0:
+                        break
+            if idx != 0:
+                Involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
+                for prev_idx in range(idx-1, -1, -1):
+                    Involved_qbits_prev = optimized_partitions[prev_idx].involved_qbits
+                    intersection = [i for i in Involved_qbits_current if i in Involved_qbits_prev]
+                    if len(intersection) > 0:
+                        IDAG.append((prev_idx, idx))
+                    for intersection_qbit in intersection:
+                        Involved_qbits_current.remove(intersection_qbit)
+                    if len(Involved_qbits_current) == 0:
+                        break
+        return DAG, IDAG
+
+    def compute_distances_bfs(self, N):
+        """BFS distance computation - faster than Floyd-Warshall."""
+        D = np.ones((N, N)) * np.inf
+        
+        # Build adjacency list
+        adj = defaultdict(list)
+        for u, v in self.config['topology']:
+            adj[u].append(v)
+            adj[v].append(u)
+        
+        # BFS from each vertex
+        for start in range(N):
+            D[start][start] = 0
+            queue = deque([(start, 0)])
+            visited = {start}
+            
+            while queue:
+                node, dist = queue.popleft()
+                for neighbor in adj[node]:
+                    if neighbor not in visited:
+                        visited.add(neighbor)
+                        D[start][neighbor] = dist + 1
+                        queue.append((neighbor, dist + 1))
+        
+        return D*3 #multiply by 3 to make it CNOT cost instead of SWAP cost
+
+    def _compute_smart_initial_layout(self, circuit, N, D):
+
+        # Count interactions between qubits
+        interaction_count = defaultdict(int)
+        gates = circuit.get_Gates()
+        
+        for gate in gates:
+            if gate.get_Control_Qbit() != -1:
+                q1 = gate.get_Target_Qbit()
+                q2 = gate.get_Control_Qbit()
+                if q1 < N and q2 < N:
+                    key = (min(q1, q2), max(q1, q2))
+                    interaction_count[key] += 1
+        
+        if not interaction_count:
+            # No 2-qubit gates, use trivial mapping
+            return np.arange(N)
+        
+        # Find most interacting qubit pair
+        most_connected = max(interaction_count.items(), key=lambda x: x[1])
+        q1, q2 = most_connected[0]
+        
+        # Find physical qubits that are connected
+        # Start with an arbitrary connected pair
+        for edge in self.config['topology']:
+            p1, p2 = edge
+            break  # Just take first edge
+        
+        # Initialize mapping
+        pi = np.arange(N)
+        
+        # Place most interacting qubits on connected physical qubits
+        pi[q1] = p1
+        pi[q2] = p2
+        
+        # Place other qubits using greedy approach
+        placed_logical = {q1, q2}
+        placed_physical = {p1, p2}
+        
+        # For each remaining logical qubit, find where to place it
+        remaining_logical = [q for q in range(N) if q not in placed_logical]
+        
+        # Sort by how much they interact with already placed qubits
+        def interaction_score(q):
+            score = 0
+            for placed_q in placed_logical:
+                key = (min(q, placed_q), max(q, placed_q))
+                score += interaction_count.get(key, 0)
+            return score
+        
+        remaining_logical.sort(key=interaction_score, reverse=True)
+        
+        # Place them near their interacting partners
+        for logical_q in remaining_logical:
+            # Find best physical location
+            best_physical = None
+            best_score = float('inf')
+            
+            for physical_q in range(N):
+                if physical_q not in placed_physical:
+                    # Calculate average distance to interacting qubits
+                    total_dist = 0
+                    count = 0
+                    for other_q in placed_logical:
+                        key = (min(logical_q, other_q), max(logical_q, other_q))
+                        weight = interaction_count.get(key, 0)
+                        if weight > 0:
+                            other_physical = pi[other_q]
+                            total_dist += D[physical_q][other_physical] * weight
+                            count += weight
+                    
+                    if count > 0:
+                        avg_dist = total_dist / count
+                    else:
+                        avg_dist = 0
+                    
+                    if avg_dist < best_score:
+                        best_score = avg_dist
+                        best_physical = physical_q
+            
+            if best_physical is not None:
+                pi[logical_q] = best_physical
+                placed_logical.add(logical_q)
+                placed_physical.add(best_physical)
+        
+        return pi

From 99d2f4de5f7fa5f9da7d7d40625d101163824605 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 3 Nov 2025 07:59:14 +0100
Subject: [PATCH 015/232] Move auxiliary functions to PartAM utils

---
 squander/synthesis/PartAM.py       | 330 +++--------------------------
 squander/synthesis/PartAM_utils.py | 233 ++++++++++++++++++++
 2 files changed, 267 insertions(+), 296 deletions(-)
 create mode 100644 squander/synthesis/PartAM_utils.py

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 7251f68f0..90d64747b 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -27,258 +27,7 @@
 from squander.partitioning.tools import get_qubits
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from itertools import product
-
-def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
-    """
-    Call to decompose a partition
-    """
-    strategy = config["strategy"]
-    if strategy == "TreeSearch":
-        cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
-    elif strategy == "TabuSearch":
-        cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
-    elif strategy == "Adaptive":
-        cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
-    else:
-        raise Exception(f"Unsupported decomposition type: {strategy}")
-    cDecompose.set_Verbose( config["verbosity"] )
-    cDecompose.set_Cost_Function_Variant( 3 )	
-    cDecompose.set_Optimization_Tolerance( config["tolerance"] )
-    cDecompose.set_Optimizer( config["optimizer"] )
-    cDecompose.Start_Decomposition()
-    squander_circuit = cDecompose.get_Circuit()
-    parameters       = cDecompose.get_Optimized_Parameters()
-    return squander_circuit, parameters
-
-def get_all_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
-    """
-    Find ALL connected subtopologies with exactly k qubits using DFS.
-    
-    Args:
-        edges: List of edges representing the quantum hardware topology
-        k: Number of qubits in the desired subtopologies
-    
-    Returns:
-        List of all subtopologies, where each subtopology is a list of edges
-    """
-    if k <= 0:
-        return []
-    
-    # Build adjacency list
-    adj_list = {}
-    for u, v in edges:
-        if u not in adj_list:
-            adj_list[u] = set()
-        if v not in adj_list:
-            adj_list[v] = set()
-        adj_list[u].add(v)
-        adj_list[v].add(u)
-    
-    all_qubits = sorted(adj_list.keys())
-    
-    if k == 1:
-        return [[] for _ in all_qubits]
-    
-    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
-        induced = []
-        for edge in edges:
-            if edge[0] in qubit_subset and edge[1] in qubit_subset:
-                induced.append(edge)
-        return induced
-    
-    subtopologies = []
-    seen = set()
-    
-    def dfs(current_qubits: Set[int], candidates: Set[int]):
-        """Enumerate connected subgraphs using DFS."""
-        if len(current_qubits) == k:
-            frozen = frozenset(current_qubits)
-            if frozen not in seen:
-                seen.add(frozen)
-                subtopologies.append(get_induced_edges(current_qubits))
-            return
-        
-        # Prune if we can't reach k qubits
-        if len(current_qubits) + len(candidates) < k:
-            return
-        
-        for node in sorted(candidates):
-            # Add node and explore
-            new_qubits = current_qubits | {node}
-            
-            # New candidates: neighbors of new_qubits not yet included
-            new_candidates = set()
-            for q in new_qubits:
-                for neighbor in adj_list[q]:
-                    if neighbor not in new_qubits and neighbor > node:
-                        new_candidates.add(neighbor)
-            
-            dfs(new_qubits, new_candidates)
-    
-    # Start DFS from each qubit
-    for start in all_qubits:
-        candidates = {n for n in adj_list[start] if n > start}
-        dfs({start}, candidates)
-    
-    return subtopologies
-
-
-def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
-    """
-    Convert a subgraph to canonical form for isomorphism checking.
-    Relabels nodes as 0,1,2,...,k-1 and returns the lexicographically smallest edge set.
-    """
-    qubits = sorted(qubit_subset)
-    n = len(qubits)
-    
-    # Try all permutations and find lexicographically smallest
-    best_edges = None
-    
-    for perm in permutations(range(n)):
-        # Create mapping: qubits[i] -> perm[i]
-        mapping = {qubits[i]: perm[i] for i in range(n)}
-        
-        # Relabel edges
-        relabeled = []
-        for u, v in induced_edges:
-            new_u, new_v = mapping[u], mapping[v]
-            # Normalize edge direction
-            relabeled.append(tuple(sorted([new_u, new_v])))
-        
-        relabeled = tuple(sorted(relabeled))
-        
-        if best_edges is None or relabeled < best_edges:
-            best_edges = relabeled
-    
-    return frozenset(best_edges)
-
-
-def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
-    """
-    Find all UNIQUE subtopology structures with k qubits using DFS.
-    Returns one example of each non-isomorphic connected subgraph.
-    
-    Args:
-        edges: List of edges representing the quantum hardware topology
-        k: Number of qubits in the desired subtopologies
-    
-    Returns:
-        List of unique subtopologies (one representative per isomorphism class)
-    """
-    if k <= 0:
-        return []
-    
-    # Build adjacency list
-    adj_list = {}
-    for u, v in edges:
-        if u not in adj_list:
-            adj_list[u] = set()
-        if v not in adj_list:
-            adj_list[v] = set()
-        adj_list[u].add(v)
-        adj_list[v].add(u)
-    
-    all_qubits = sorted(adj_list.keys())
-    
-    if k == 1:
-        return [[]]  # Single qubit has no edges
-    
-    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
-        induced = []
-        for edge in edges:
-            if edge[0] in qubit_subset and edge[1] in qubit_subset:
-                induced.append(edge)
-        return induced
-    
-    # Track unique canonical forms and their examples
-    canonical_forms = {}
-    seen = set()
-    
-    def dfs(current_qubits: Set[int], candidates: Set[int]):
-        """Enumerate connected subgraphs using DFS."""
-        if len(current_qubits) == k:
-            frozen = frozenset(current_qubits)
-            if frozen not in seen:
-                seen.add(frozen)
-                induced = get_induced_edges(current_qubits)
-                
-                # Get canonical form
-                canonical = get_canonical_form(current_qubits, induced)
-                
-                # Store first example of each canonical form
-                if canonical not in canonical_forms:
-                    canonical_forms[canonical] = induced
-            return
-        
-        # Prune if we can't reach k qubits
-        if len(current_qubits) + len(candidates) < k:
-            return
-        
-        for node in sorted(candidates):
-            # Add node and explore
-            new_qubits = current_qubits | {node}
-            
-            # New candidates: neighbors of new_qubits not yet included
-            new_candidates = set()
-            for q in new_qubits:
-                for neighbor in adj_list[q]:
-                    if neighbor not in new_qubits and neighbor > node:
-                        new_candidates.add(neighbor)
-            
-            dfs(new_qubits, new_candidates)
-    
-    # Start DFS from each qubit
-    for start in all_qubits:
-        candidates = {n for n in adj_list[start] if n > start}
-        dfs({start}, candidates)
-    
-    return list(canonical_forms.values())
-
-
-def extract_subtopology(involved_qbits, qbit_map, config ):
-    mini_topology = []
-    for edge in config["topology"]:
-        if edge[0] in involved_qbits and edge[1] in involved_qbits:
-            mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
-    return mini_topology
-
-class SingleQubitPartitionResult:
-    def __init__(self,circuit_in,parameters_in):
-        self.circuit = circuit_in
-        self.parameters = parameters_in
-    def get_partition_synthesis_score(self):
-        return 0
-
-class PartitionSynthesisResult:
-    def __init__(self, N , mini_topologies, involved_qbits, qubit_map):
-        self.mini_topologies = mini_topologies
-        self.topology_count = len(mini_topologies)
-        self.N = N
-        self.permutations_pairs = [[] for _ in range(len(mini_topologies))]
-        self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
-        self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
-        self.cnot_counts = [[] for _ in range(len(mini_topologies))]
-        self.involved_qbits = involved_qbits
-        self.qubit_map = qubit_map
-    def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
-        self.permutations_pairs[topology_idx].append(permutations_pair)
-        self.synthesised_circuits[topology_idx].append(synthesised_circuit)
-        self.synthesised_parameters[topology_idx].append(synthesised_parameters)
-        self.cnot_counts[topology_idx].append(synthesised_circuit.get_Gate_Nums().get('CNOT', 0))
-    
-    def get_best_result(self, topology_idx):
-        best_index = np.argmin(self.cnot_counts[topology_idx])
-        return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
-    
-    def get_partition_synthesis_score(self):
-        score = 0
-        for topology_idx in range(self.topology_count):
-            cnot_count_topology = np.mean(self.cnot_counts[topology_idx])*0.1 + np.min(self.cnot_counts[topology_idx])*0.9
-            if len(self.mini_topologies[topology_idx]) == self.N*(self.N-1)/2:
-                score += cnot_count_topology*0.3/self.topology_count
-            else:
-                score += cnot_count_topology*0.7/self.topology_count
-        return score 
+from squander.synthesis.PartAM_utils import get_all_subtopologies, get_unique_subtopologies, SingleQubitPartitionResult, PartitionSynthesisResult
 
 class qgd_Partition_Aware_Mapping:
 
@@ -300,6 +49,7 @@ def __init__(self, config):
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
             raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
+
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
         """
@@ -414,73 +164,61 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
         weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+        structures = [optimized_results[i] for i in L_parts]
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
         L = topo_sort_partitions(circ, self.max_partition_size, parts)
-        from squander.partitioning.kahn import kahn_partition_preparts
-        from squander.partitioning.tools import translate_param_order
-        partitioned_circuit, param_order, _ = kahn_partition_preparts(circ, self.max_partition_size, [parts[i] for i in L])
-        parameters = translate_param_order(orig_parameters, param_order)
 
-        subcircuits = partitioned_circuit.get_Gates()
-
-        # the list of optimized subcircuits
-        optimized_results = [None] * len(subcircuits)
-
-        with Pool(processes=mp.cpu_count()) as pool:
-            for partition_idx, subcircuit in enumerate( subcircuits ):
-
-                start_idx = subcircuit.get_Parameter_Start_Index()
-                end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
-                subcircuit_parameters = parameters[ start_idx:end_idx ]
-                k = subcircuit.get_Qbit_Num()
-                qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
-                involved_qbits = subcircuit.get_Qbits()
-
-                qbit_num = len( involved_qbits )
-                mini_topologies = get_unique_subtopologies(self.topology, qbit_num)
-                qbit_map = {}
-                for idx in range( len(involved_qbits) ):
-                    qbit_map[ involved_qbits[idx] ] = idx
-                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
-                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
-
-            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis") ):
-                optimized_results[partition_idx] = optimized_results[partition_idx].get()
-        return optimized_results
+        return [structures[i] for i in L], [parts[i] for i in L]
 
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
-        optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
+        optimized_partitions, preparation_parts = self.SynthesizeWideCircuit(circ, orig_parameters)
+        print(preparation_parts)
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
-        
+
+    def get_initial_layer(self, IDAG, N):
+        initial_layer = []
+        active_qbits = list(range(N))
+        for idx in range(len(IDAG)):
+            if len(IDAG[idx][1]) == 0:
+                initial_layer.append(idx)
+                for qbit in IDAG[idx][0].involved_qbits:
+                    active_qbits.remove(qbit)
+            if len(active_qbits) == 0:
+                break
+        return initial_layer
             
     def construct_DAG_and_IDAG(self, optimized_partitions):
         DAG = []
         IDAG = []
         for idx in range(len(optimized_partitions)):
+            parents = []
+            children = []
             if idx != len(optimized_partitions)-1:
-                Involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
+                involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
                 for next_idx in range(idx+1, len(optimized_partitions)):
-                    Involved_qbits_next = optimized_partitions[next_idx].involved_qbits
-                    intersection = [i for i in Involved_qbits_current if i in Involved_qbits_next]
+                    involved_qbits_next = optimized_partitions[next_idx].involved_qbits
+                    intersection = [i for i in involved_qbits_current if i in involved_qbits_next]
                     if len(intersection) > 0:
-                        DAG.append((idx, next_idx))
+                        children.append(next_idx)
                     for intersection_qbit in intersection:
-                        Involved_qbits_current.remove(intersection_qbit)
-                    if len(Involved_qbits_current) == 0:
+                        involved_qbits_current.remove(intersection_qbit)
+                    if len(involved_qbits_current) == 0:
                         break
             if idx != 0:
-                Involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
+                involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
                 for prev_idx in range(idx-1, -1, -1):
-                    Involved_qbits_prev = optimized_partitions[prev_idx].involved_qbits
-                    intersection = [i for i in Involved_qbits_current if i in Involved_qbits_prev]
+                    involved_qbits_prev = optimized_partitions[prev_idx].involved_qbits
+                    intersection = [i for i in involved_qbits_current if i in involved_qbits_prev]
                     if len(intersection) > 0:
-                        IDAG.append((prev_idx, idx))
+                        parents.append(prev_idx)
                     for intersection_qbit in intersection:
-                        Involved_qbits_current.remove(intersection_qbit)
-                    if len(Involved_qbits_current) == 0:
+                        involved_qbits_current.remove(intersection_qbit)
+                    if len(involved_qbits_current) == 0:
                         break
+            DAG.append([idx, children])
+            IDAG.append([idx, parents])
         return DAG, IDAG
 
     def compute_distances_bfs(self, N):
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
new file mode 100644
index 000000000..ecabdf913
--- /dev/null
+++ b/squander/synthesis/PartAM_utils.py
@@ -0,0 +1,233 @@
+import numpy as np
+from typing import List, Tuple, Set, FrozenSet
+from itertools import permutations
+
+def get_all_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
+    """
+    Find ALL connected subtopologies with exactly k qubits using DFS.
+    
+    Args:
+        edges: List of edges representing the quantum hardware topology
+        k: Number of qubits in the desired subtopologies
+    
+    Returns:
+        List of all subtopologies, where each subtopology is a list of edges
+    """
+    if k <= 0:
+        return []
+    
+    # Build adjacency list
+    adj_list = {}
+    for u, v in edges:
+        if u not in adj_list:
+            adj_list[u] = set()
+        if v not in adj_list:
+            adj_list[v] = set()
+        adj_list[u].add(v)
+        adj_list[v].add(u)
+    
+    all_qubits = sorted(adj_list.keys())
+    
+    if k == 1:
+        return [[] for _ in all_qubits]
+    
+    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
+        induced = []
+        for edge in edges:
+            if edge[0] in qubit_subset and edge[1] in qubit_subset:
+                induced.append(edge)
+        return induced
+    
+    subtopologies = []
+    seen = set()
+    
+    def dfs(current_qubits: Set[int], candidates: Set[int]):
+        """Enumerate connected subgraphs using DFS."""
+        if len(current_qubits) == k:
+            frozen = frozenset(current_qubits)
+            if frozen not in seen:
+                seen.add(frozen)
+                subtopologies.append(get_induced_edges(current_qubits))
+            return
+        
+        # Prune if we can't reach k qubits
+        if len(current_qubits) + len(candidates) < k:
+            return
+        
+        for node in sorted(candidates):
+            # Add node and explore
+            new_qubits = current_qubits | {node}
+            
+            # New candidates: neighbors of new_qubits not yet included
+            new_candidates = set()
+            for q in new_qubits:
+                for neighbor in adj_list[q]:
+                    if neighbor not in new_qubits and neighbor > node:
+                        new_candidates.add(neighbor)
+            
+            dfs(new_qubits, new_candidates)
+    
+    # Start DFS from each qubit
+    for start in all_qubits:
+        candidates = {n for n in adj_list[start] if n > start}
+        dfs({start}, candidates)
+    
+    return subtopologies
+
+
+def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
+    """
+    Convert a subgraph to canonical form for isomorphism checking.
+    Relabels nodes as 0,1,2,...,k-1 and returns the lexicographically smallest edge set.
+    """
+    qubits = sorted(qubit_subset)
+    n = len(qubits)
+    
+    # Try all permutations and find lexicographically smallest
+    best_edges = None
+    
+    for perm in permutations(range(n)):
+        # Create mapping: qubits[i] -> perm[i]
+        mapping = {qubits[i]: perm[i] for i in range(n)}
+        
+        # Relabel edges
+        relabeled = []
+        for u, v in induced_edges:
+            new_u, new_v = mapping[u], mapping[v]
+            # Normalize edge direction
+            relabeled.append(tuple(sorted([new_u, new_v])))
+        
+        relabeled = tuple(sorted(relabeled))
+        
+        if best_edges is None or relabeled < best_edges:
+            best_edges = relabeled
+    
+    return frozenset(best_edges)
+
+
+def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
+    """
+    Find all UNIQUE subtopology structures with k qubits using DFS.
+    Returns one example of each non-isomorphic connected subgraph.
+    
+    Args:
+        edges: List of edges representing the quantum hardware topology
+        k: Number of qubits in the desired subtopologies
+    
+    Returns:
+        List of unique subtopologies (one representative per isomorphism class)
+    """
+    if k <= 0:
+        return []
+    
+    # Build adjacency list
+    adj_list = {}
+    for u, v in edges:
+        if u not in adj_list:
+            adj_list[u] = set()
+        if v not in adj_list:
+            adj_list[v] = set()
+        adj_list[u].add(v)
+        adj_list[v].add(u)
+    
+    all_qubits = sorted(adj_list.keys())
+    
+    if k == 1:
+        return [[]]  # Single qubit has no edges
+    
+    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
+        induced = []
+        for edge in edges:
+            if edge[0] in qubit_subset and edge[1] in qubit_subset:
+                induced.append(edge)
+        return induced
+    
+    # Track unique canonical forms and their examples
+    canonical_forms = {}
+    seen = set()
+    
+    def dfs(current_qubits: Set[int], candidates: Set[int]):
+        """Enumerate connected subgraphs using DFS."""
+        if len(current_qubits) == k:
+            frozen = frozenset(current_qubits)
+            if frozen not in seen:
+                seen.add(frozen)
+                induced = get_induced_edges(current_qubits)
+                
+                # Get canonical form
+                canonical = get_canonical_form(current_qubits, induced)
+                
+                # Store first example of each canonical form
+                if canonical not in canonical_forms:
+                    canonical_forms[canonical] = induced
+            return
+        
+        # Prune if we can't reach k qubits
+        if len(current_qubits) + len(candidates) < k:
+            return
+        
+        for node in sorted(candidates):
+            # Add node and explore
+            new_qubits = current_qubits | {node}
+            
+            # New candidates: neighbors of new_qubits not yet included
+            new_candidates = set()
+            for q in new_qubits:
+                for neighbor in adj_list[q]:
+                    if neighbor not in new_qubits and neighbor > node:
+                        new_candidates.add(neighbor)
+            
+            dfs(new_qubits, new_candidates)
+    
+    # Start DFS from each qubit
+    for start in all_qubits:
+        candidates = {n for n in adj_list[start] if n > start}
+        dfs({start}, candidates)
+    
+    return list(canonical_forms.values())
+
+
+def extract_subtopology(involved_qbits, qbit_map, config ):
+    mini_topology = []
+    for edge in config["topology"]:
+        if edge[0] in involved_qbits and edge[1] in involved_qbits:
+            mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
+    return mini_topology
+
+class SingleQubitPartitionResult:
+    def __init__(self,circuit_in,parameters_in):
+        self.circuit = circuit_in
+        self.parameters = parameters_in
+    def get_partition_synthesis_score(self):
+        return 0
+
+class PartitionSynthesisResult:
+    def __init__(self, N , mini_topologies, involved_qbits, qubit_map):
+        self.mini_topologies = mini_topologies
+        self.topology_count = len(mini_topologies)
+        self.N = N
+        self.permutations_pairs = [[] for _ in range(len(mini_topologies))]
+        self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
+        self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
+        self.cnot_counts = [[] for _ in range(len(mini_topologies))]
+        self.involved_qbits = involved_qbits
+        self.qubit_map = qubit_map
+    def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
+        self.permutations_pairs[topology_idx].append(permutations_pair)
+        self.synthesised_circuits[topology_idx].append(synthesised_circuit)
+        self.synthesised_parameters[topology_idx].append(synthesised_parameters)
+        self.cnot_counts[topology_idx].append(synthesised_circuit.get_Gate_Nums().get('CNOT', 0))
+    
+    def get_best_result(self, topology_idx):
+        best_index = np.argmin(self.cnot_counts[topology_idx])
+        return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
+    
+    def get_partition_synthesis_score(self):
+        score = 0
+        for topology_idx in range(self.topology_count):
+            cnot_count_topology = np.mean(self.cnot_counts[topology_idx])*0.1 + np.min(self.cnot_counts[topology_idx])*0.9
+            if len(self.mini_topologies[topology_idx]) == self.N*(self.N-1)/2:
+                score += cnot_count_topology*0.3/self.topology_count
+            else:
+                score += cnot_count_topology*0.7/self.topology_count
+        return score 
\ No newline at end of file

From c2192cd4695f759d57fe2fa9db04be0e48359a10 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 3 Nov 2025 14:12:57 +0100
Subject: [PATCH 016/232] Move back to two synthesis method for now

---
 squander/synthesis/PartAM.py | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 90d64747b..2c35eced1 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -164,11 +164,39 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
         weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-        structures = [optimized_results[i] for i in L_parts]
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
         L = topo_sort_partitions(circ, self.max_partition_size, parts)
+        from squander.partitioning.kahn import kahn_partition_preparts
+        from squander.partitioning.tools import translate_param_order
+        partitioned_circuit, param_order, _ = kahn_partition_preparts(circ, self.max_partition_size, [parts[i] for i in L])
+        parameters = translate_param_order(orig_parameters, param_order)
 
-        return [structures[i] for i in L], [parts[i] for i in L]
+        subcircuits = partitioned_circuit.get_Gates()
+
+        # the list of optimized subcircuits
+        optimized_results = [None] * len(subcircuits)
+
+        with Pool(processes=mp.cpu_count()) as pool:
+            for partition_idx, subcircuit in enumerate( subcircuits ):
+
+                start_idx = subcircuit.get_Parameter_Start_Index()
+                end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
+                subcircuit_parameters = parameters[ start_idx:end_idx ]
+                k = subcircuit.get_Qbit_Num()
+                qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
+                involved_qbits = subcircuit.get_Qbits()
+
+                qbit_num = len( involved_qbits )
+                mini_topologies = get_unique_subtopologies(self.topology, qbit_num)
+                qbit_map = {}
+                for idx in range( len(involved_qbits) ):
+                    qbit_map[ involved_qbits[idx] ] = idx
+                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
+                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+
+            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis") ):
+                optimized_results[partition_idx] = optimized_results[partition_idx].get()
+        return optimized_results
 
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         optimized_partitions, preparation_parts = self.SynthesizeWideCircuit(circ, orig_parameters)

From e0311824094caff157a9ff75a8b4eff03436f286 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 4 Nov 2025 00:20:05 +0100
Subject: [PATCH 017/232] Expand utils and partial work on PartAM

---
 squander/synthesis/PartAM.py       |  41 ++++-
 squander/synthesis/PartAM_utils.py | 237 +++++++++++------------------
 2 files changed, 124 insertions(+), 154 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2c35eced1..ca8c01652 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -27,7 +27,7 @@
 from squander.partitioning.tools import get_qubits
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from itertools import product
-from squander.synthesis.PartAM_utils import get_all_subtopologies, get_unique_subtopologies, SingleQubitPartitionResult, PartitionSynthesisResult
+from squander.synthesis.PartAM_utils import get_subtopologies_of_type, get_unique_subtopologies, SingleQubitPartitionResult, PartitionSynthesisResult, min_cnots_between_permutations
 
 class qgd_Partition_Aware_Mapping:
 
@@ -165,16 +165,16 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
-        L = topo_sort_partitions(circ, self.max_partition_size, parts)
+        L = topo_sort_partitions(circ, self.config["max_partition_size"], parts)
         from squander.partitioning.kahn import kahn_partition_preparts
         from squander.partitioning.tools import translate_param_order
-        partitioned_circuit, param_order, _ = kahn_partition_preparts(circ, self.max_partition_size, [parts[i] for i in L])
+        partitioned_circuit, param_order, _ = kahn_partition_preparts(circ, self.config["max_partition_size"], [parts[i] for i in L])
         parameters = translate_param_order(orig_parameters, param_order)
 
         subcircuits = partitioned_circuit.get_Gates()
 
         # the list of optimized subcircuits
-        optimized_results = [None] * len(subcircuits)
+        optimized_partitions = [None] * len(subcircuits)
 
         with Pool(processes=mp.cpu_count()) as pool:
             for partition_idx, subcircuit in enumerate( subcircuits ):
@@ -192,19 +192,44 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
-                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+                optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis") ):
-                optimized_results[partition_idx] = optimized_results[partition_idx].get()
-        return optimized_results
+                optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
+        return optimized_partitions
 
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         optimized_partitions, preparation_parts = self.SynthesizeWideCircuit(circ, orig_parameters)
-        print(preparation_parts)
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
 
+    def Heuristic_Search(self, F, pi, DAG, IDAG):
+        resolved_partitions = [False] * len(DAG)
+        partition_order = []
+        execute_partition_list = []
+        E = self.generate_E(F, DAG, IDAG, resolved_partitions)
+        while len(F) != 0:
+        
+        scores = []
+        partition_candidates = self.obtain_partition_candidates(F)
+        if len(partition_candidates) != 0:
+            for partition_candidate in partition_candidates:
+                score = self.score_partition_candidate(partition_candidate, F, E, pi, DAG, IDAG, resolved_partitions)
+                scores.append(score)
+        min_idx = np.argmin(scores)
+        min_partition_candidate = partition_candidates[min_idx]
+
+    def obtain_partition_candidates(self, F, optimized_partitions):
+        partition_candidates = []
+        for partition_idx in F:
+            partition = optimized_partitions[partition_idx]
+            for tdx, mini_topology in partition.mini_topologies:
+                topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                for topology_candidate in topology_candidates:
+                    for pdx, permutation_pair in enumerate(partition.permutation_pairs[tdx]):
+                        partition_candidates.append([PartitionCandidate(partition_idx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],)])
+
     def get_initial_layer(self, IDAG, N):
         initial_layer = []
         active_qbits = list(range(N))
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index ecabdf913..3538db555 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -2,21 +2,7 @@
 from typing import List, Tuple, Set, FrozenSet
 from itertools import permutations
 
-def get_all_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
-    """
-    Find ALL connected subtopologies with exactly k qubits using DFS.
-    
-    Args:
-        edges: List of edges representing the quantum hardware topology
-        k: Number of qubits in the desired subtopologies
-    
-    Returns:
-        List of all subtopologies, where each subtopology is a list of edges
-    """
-    if k <= 0:
-        return []
-    
-    # Build adjacency list
+def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
     adj_list = {}
     for u, v in edges:
         if u not in adj_list:
@@ -25,167 +11,101 @@ def get_all_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tup
             adj_list[v] = set()
         adj_list[u].add(v)
         adj_list[v].add(u)
-    
+    return adj_list
+
+def _get_induced_edges(edges: List[Tuple[int, int]], qubit_subset: Set[int]) -> List[Tuple[int, int]]:
+    return [edge for edge in edges if edge[0] in qubit_subset and edge[1] in qubit_subset]
+
+def _dfs_enumerate(adj_list: dict, k: int, callback: Callable[[Set[int]], None]):
     all_qubits = sorted(adj_list.keys())
-    
-    if k == 1:
-        return [[] for _ in all_qubits]
-    
-    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
-        induced = []
-        for edge in edges:
-            if edge[0] in qubit_subset and edge[1] in qubit_subset:
-                induced.append(edge)
-        return induced
-    
-    subtopologies = []
     seen = set()
-    
     def dfs(current_qubits: Set[int], candidates: Set[int]):
-        """Enumerate connected subgraphs using DFS."""
         if len(current_qubits) == k:
             frozen = frozenset(current_qubits)
             if frozen not in seen:
                 seen.add(frozen)
-                subtopologies.append(get_induced_edges(current_qubits))
+                callback(current_qubits)
             return
-        
-        # Prune if we can't reach k qubits
         if len(current_qubits) + len(candidates) < k:
             return
-        
         for node in sorted(candidates):
-            # Add node and explore
             new_qubits = current_qubits | {node}
-            
-            # New candidates: neighbors of new_qubits not yet included
-            new_candidates = set()
-            for q in new_qubits:
-                for neighbor in adj_list[q]:
-                    if neighbor not in new_qubits and neighbor > node:
-                        new_candidates.add(neighbor)
-            
+            new_candidates = {neighbor for q in new_qubits for neighbor in adj_list[q] 
+                            if neighbor not in new_qubits and neighbor > node}
             dfs(new_qubits, new_candidates)
-    
-    # Start DFS from each qubit
     for start in all_qubits:
-        candidates = {n for n in adj_list[start] if n > start}
-        dfs({start}, candidates)
-    
-    return subtopologies
-
+        dfs({start}, {n for n in adj_list[start] if n > start})
 
 def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
-    """
-    Convert a subgraph to canonical form for isomorphism checking.
-    Relabels nodes as 0,1,2,...,k-1 and returns the lexicographically smallest edge set.
-    """
     qubits = sorted(qubit_subset)
     n = len(qubits)
-    
-    # Try all permutations and find lexicographically smallest
     best_edges = None
-    
     for perm in permutations(range(n)):
-        # Create mapping: qubits[i] -> perm[i]
         mapping = {qubits[i]: perm[i] for i in range(n)}
-        
-        # Relabel edges
-        relabeled = []
-        for u, v in induced_edges:
-            new_u, new_v = mapping[u], mapping[v]
-            # Normalize edge direction
-            relabeled.append(tuple(sorted([new_u, new_v])))
-        
-        relabeled = tuple(sorted(relabeled))
-        
+        relabeled = tuple(sorted([tuple(sorted([mapping[u], mapping[v]])) for u, v in induced_edges]))
         if best_edges is None or relabeled < best_edges:
             best_edges = relabeled
-    
     return frozenset(best_edges)
 
-
 def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
-    """
-    Find all UNIQUE subtopology structures with k qubits using DFS.
-    Returns one example of each non-isomorphic connected subgraph.
-    
-    Args:
-        edges: List of edges representing the quantum hardware topology
-        k: Number of qubits in the desired subtopologies
-    
-    Returns:
-        List of unique subtopologies (one representative per isomorphism class)
-    """
     if k <= 0:
         return []
-    
-    # Build adjacency list
-    adj_list = {}
-    for u, v in edges:
-        if u not in adj_list:
-            adj_list[u] = set()
-        if v not in adj_list:
-            adj_list[v] = set()
-        adj_list[u].add(v)
-        adj_list[v].add(u)
-    
-    all_qubits = sorted(adj_list.keys())
-    
+    adj_list = _build_adj_list(edges)
     if k == 1:
-        return [[]]  # Single qubit has no edges
-    
-    def get_induced_edges(qubit_subset: Set[int]) -> List[Tuple[int, int]]:
-        induced = []
-        for edge in edges:
-            if edge[0] in qubit_subset and edge[1] in qubit_subset:
-                induced.append(edge)
-        return induced
-    
-    # Track unique canonical forms and their examples
+        return [[]]
     canonical_forms = {}
-    seen = set()
-    
-    def dfs(current_qubits: Set[int], candidates: Set[int]):
-        """Enumerate connected subgraphs using DFS."""
-        if len(current_qubits) == k:
-            frozen = frozenset(current_qubits)
-            if frozen not in seen:
-                seen.add(frozen)
-                induced = get_induced_edges(current_qubits)
-                
-                # Get canonical form
-                canonical = get_canonical_form(current_qubits, induced)
-                
-                # Store first example of each canonical form
-                if canonical not in canonical_forms:
-                    canonical_forms[canonical] = induced
-            return
-        
-        # Prune if we can't reach k qubits
-        if len(current_qubits) + len(candidates) < k:
-            return
-        
-        for node in sorted(candidates):
-            # Add node and explore
-            new_qubits = current_qubits | {node}
-            
-            # New candidates: neighbors of new_qubits not yet included
-            new_candidates = set()
-            for q in new_qubits:
-                for neighbor in adj_list[q]:
-                    if neighbor not in new_qubits and neighbor > node:
-                        new_candidates.add(neighbor)
-            
-            dfs(new_qubits, new_candidates)
-    
-    # Start DFS from each qubit
-    for start in all_qubits:
-        candidates = {n for n in adj_list[start] if n > start}
-        dfs({start}, candidates)
-    
+    def process(qubits):
+        induced = _get_induced_edges(edges, qubits)
+        canonical = get_canonical_form(qubits, induced)
+        if canonical not in canonical_forms:
+            canonical_forms[canonical] = induced
+    _dfs_enumerate(adj_list, k, process)
     return list(canonical_forms.values())
 
+def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]:
+    target_qubits = set()
+    for u, v in target_topology:
+        target_qubits.add(u)
+        target_qubits.add(v)
+    k = len(target_qubits) if target_qubits else 1
+    if k <= 0:
+        return []
+    adj_list = _build_adj_list(edges)
+    if k == 1:
+        return [[] for _ in adj_list.keys()]
+    target_canonical = get_canonical_form(target_qubits, target_topology)
+    matches = []
+    def process(qubits):
+        induced = _get_induced_edges(edges, qubits)
+        canonical = get_canonical_form(qubits, induced)
+        if canonical == target_canonical:
+            matches.append(induced)
+    _dfs_enumerate(adj_list, k, process)
+    return matches
+
+def min_cnots_between_permutations(A, B):
+    n = len(A)
+    inv_B = [0] * n
+    for pos, qubit in enumerate(B):
+        inv_B[qubit] = pos
+    
+    P = [inv_B[A[i]] for i in range(n)]
+    visited = [False] * n
+    total_cnots = 0
+    
+    for i in range(n):
+        if not visited[i]:
+            cycle_len = 0
+            j = i
+            while not visited[j]:
+                visited[j] = True
+                j = P[j]
+                cycle_len += 1
+            if cycle_len >= 2:
+                total_cnots += 2 * cycle_len - 3
+    
+    return total_cnots
+
 
 def extract_subtopology(involved_qbits, qbit_map, config ):
     mini_topology = []
@@ -195,13 +115,16 @@ def extract_subtopology(involved_qbits, qbit_map, config ):
     return mini_topology
 
 class SingleQubitPartitionResult:
+    
     def __init__(self,circuit_in,parameters_in):
         self.circuit = circuit_in
         self.parameters = parameters_in
+    
     def get_partition_synthesis_score(self):
         return 0
 
 class PartitionSynthesisResult:
+    
     def __init__(self, N , mini_topologies, involved_qbits, qubit_map):
         self.mini_topologies = mini_topologies
         self.topology_count = len(mini_topologies)
@@ -210,14 +133,25 @@ def __init__(self, N , mini_topologies, involved_qbits, qubit_map):
         self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
         self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
         self.cnot_counts = [[] for _ in range(len(mini_topologies))]
+        self.circuit_structures = [[] for _ in range(len(mini_topologies))]
         self.involved_qbits = involved_qbits
         self.qubit_map = qubit_map
+    
     def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
         self.permutations_pairs[topology_idx].append(permutations_pair)
         self.synthesised_circuits[topology_idx].append(synthesised_circuit)
         self.synthesised_parameters[topology_idx].append(synthesised_parameters)
         self.cnot_counts[topology_idx].append(synthesised_circuit.get_Gate_Nums().get('CNOT', 0))
-    
+        self.circuit_structures[topology_idx].append(self.extract_circuit_structure(synthesised_circuit))
+    
+    def extract_circuit_structure(self, circuit):
+        circuit_structure = []
+        for gate in circuit.get_Gates():
+            gate.get_involved_qubits()
+            if len(involved_qbits) != 1:
+                circuit_structure.append(involved_qbits)
+        return circuit_structure
+
     def get_best_result(self, topology_idx):
         best_index = np.argmin(self.cnot_counts[topology_idx])
         return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
@@ -230,4 +164,15 @@ def get_partition_synthesis_score(self):
                 score += cnot_count_topology*0.3/self.topology_count
             else:
                 score += cnot_count_topology*0.7/self.topology_count
-        return score 
\ No newline at end of file
+        return score 
+
+class PartitionCandidate:
+    
+    def __init__(self, partition_idx, circuit_structure, P_i, P_o, topology, qbit_map, involved_qbits):
+        self.partition_idx = partition_idx
+        self.circuit_structure = circuit_structure
+        self.P_i = P_i
+        self.P_o = P_o
+        self.topology = topology
+        self.qbit_map = qbit_map
+        self.involved_qbits = involved_qbits
\ No newline at end of file

From fb739f997ef6439923702784451c0aeb340a972a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 5 Nov 2025 14:45:32 +0100
Subject: [PATCH 018/232] Generate partition candidates

---
 squander/synthesis/PartAM.py       | 29 +++++++------
 squander/synthesis/PartAM_utils.py | 70 +++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 16 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ca8c01652..843c59c0f 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -27,7 +27,9 @@
 from squander.partitioning.tools import get_qubits
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from itertools import product
-from squander.synthesis.PartAM_utils import get_subtopologies_of_type, get_unique_subtopologies, SingleQubitPartitionResult, PartitionSynthesisResult, min_cnots_between_permutations
+from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, 
+SingleQubitPartitionResult, PartitionSynthesisResult, min_cnots_between_permutations, 
+PartitionCandidate, get_node_mapping)
 
 class qgd_Partition_Aware_Mapping:
 
@@ -147,7 +149,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 start_idx = subcircuit.get_Parameter_Start_Index()
                 end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
                 subcircuit_parameters = parameters[ start_idx:end_idx ]
-                k = subcircuit.get_Qbit_Num()
                 qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
                 involved_qbits = subcircuit.get_Qbits()
 
@@ -207,18 +208,17 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
     def Heuristic_Search(self, F, pi, DAG, IDAG):
         resolved_partitions = [False] * len(DAG)
         partition_order = []
-        execute_partition_list = []
+        execute_gate_list = []
         E = self.generate_E(F, DAG, IDAG, resolved_partitions)
         while len(F) != 0:
-        
-        scores = []
-        partition_candidates = self.obtain_partition_candidates(F)
-        if len(partition_candidates) != 0:
-            for partition_candidate in partition_candidates:
-                score = self.score_partition_candidate(partition_candidate, F, E, pi, DAG, IDAG, resolved_partitions)
-                scores.append(score)
-        min_idx = np.argmin(scores)
-        min_partition_candidate = partition_candidates[min_idx]
+            scores = []
+            partition_candidates = self.obtain_partition_candidates(F)
+            if len(partition_candidates) != 0:
+                for partition_candidate in partition_candidates:
+                    score = self.score_partition_candidate(partition_candidate, F, E, pi, DAG, IDAG, resolved_partitions)
+                    scores.append(score)
+            min_idx = np.argmin(scores)
+            min_partition_candidate = partition_candidates[min_idx]
 
     def obtain_partition_candidates(self, F, optimized_partitions):
         partition_candidates = []
@@ -228,8 +228,9 @@ def obtain_partition_candidates(self, F, optimized_partitions):
                 topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutation_pairs[tdx]):
-                        partition_candidates.append([PartitionCandidate(partition_idx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],)])
-
+                        partition_candidates.append([PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qbit_map,partition.involved_qbits)])
+        return partition_candidates
+        
     def get_initial_layer(self, IDAG, N):
         initial_layer = []
         active_qbits = list(range(N))
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 3538db555..8b8af2fa2 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -83,6 +83,30 @@ def process(qubits):
     _dfs_enumerate(adj_list, k, process)
     return matches
 
+def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int, int]]) -> dict:
+    qubits1 = set()
+    for u, v in topology1:
+        qubits1.add(u)
+        qubits1.add(v)
+    qubits2 = set()
+    for u, v in topology2:
+        qubits2.add(u)
+        qubits2.add(v)
+    if len(qubits1) != len(qubits2):
+        return {}
+    sorted_qubits1 = sorted(qubits1)
+    sorted_qubits2 = sorted(qubits2)
+    n = len(sorted_qubits1)
+    for perm in permutations(range(n)):
+        mapping = {sorted_qubits1[i]: sorted_qubits2[perm[i]] for i in range(n)}
+        mapped_edges = set()
+        for u, v in topology1:
+            mapped_edges.add(tuple(sorted([mapping[u], mapping[v]])))
+        original_edges = set(tuple(sorted([u, v])) for u, v in topology2)
+        if mapped_edges == original_edges:
+            return mapping
+    return {}
+
 def min_cnots_between_permutations(A, B):
     n = len(A)
     inv_B = [0] * n
@@ -106,6 +130,35 @@ def min_cnots_between_permutations(A, B):
     
     return total_cnots
 
+def find_best_permutation_with_constraints(A, constraints, strategy='greedy'):
+    n = len(A)
+    B = [None] * n
+    used_qubits = set()
+    
+    # Apply constraints
+    for pos, qubit in constraints.items():
+        B[pos] = qubit
+        used_qubits.add(qubit)
+    
+    # Fill unconstrained positions
+    available_qubits = [q for q in range(n) if q not in used_qubits]
+    unconstrained_positions = [i for i in range(n) if B[i] is None]
+
+    for pos in unconstrained_positions:
+        if A[pos] in available_qubits:
+            B[pos] = A[pos]
+            available_qubits.remove(A[pos])
+    
+    # Fill remaining positions with remaining qubits
+    j = 0
+    for pos in unconstrained_positions:
+        if B[pos] is None:
+            B[pos] = available_qubits[j]
+            j += 1
+    
+    
+    return B
+
 
 def extract_subtopology(involved_qbits, qbit_map, config ):
     mini_topology = []
@@ -168,11 +221,24 @@ def get_partition_synthesis_score(self):
 
 class PartitionCandidate:
     
-    def __init__(self, partition_idx, circuit_structure, P_i, P_o, topology, qbit_map, involved_qbits):
+    def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits):
         self.partition_idx = partition_idx
+        self.topology_idx = topology_idx
+        self.permutation_idx = permutation_idx
         self.circuit_structure = circuit_structure
         self.P_i = P_i
         self.P_o = P_o
         self.topology = topology
+        self.mini_topology = mini_topology
         self.qbit_map = qbit_map
-        self.involved_qbits = involved_qbits
\ No newline at end of file
+        self.involved_qbits = involved_qbits
+        self.node_mapping = get_node_mapping(mini_topology, topology)
+
+    def transform_pi_input(self, pi):
+
+        qbit_map_swapped = {self.node_mapping[self.P_i.index(v)]: k for k, v in self.qbit_map.items()}
+        return find_best_permutation_with_constraints(pi, qbit_map_swapped)
+
+    def transform_pi_output(self, pi):
+        qbit_map_swapped = {self.node_mapping[self.P_o.index(v)]: k for k, v in self.qbit_map.items()}
+        return find_best_permutation_with_constraints(pi, qbit_map_swapped)
\ No newline at end of file

From e6ad7f963ada7301444883d4232d4d8cce2e3823 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 14 Nov 2025 15:34:16 +0100
Subject: [PATCH 019/232] Create heuristic search with basic cost function

---
 squander/synthesis/PartAM.py       | 66 +++++++++++++++++++---
 squander/synthesis/PartAM_utils.py | 89 +++++++++++++++++++++++++++---
 2 files changed, 138 insertions(+), 17 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 843c59c0f..b968d2b85 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -29,7 +29,7 @@
 from itertools import product
 from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, 
 SingleQubitPartitionResult, PartitionSynthesisResult, min_cnots_between_permutations, 
-PartitionCandidate, get_node_mapping)
+PartitionCandidate, get_node_mapping, permutation_to_cnot_circuit)
 
 class qgd_Partition_Aware_Mapping:
 
@@ -60,7 +60,7 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
         N = Partition_circuit.get_Qbit_Num()
         if N !=1:
             perumations_all = list(permutations(range(N)))
-            result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map)
+            result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
             # Sequential permutation search
             for topology_idx in range(len(topologies)):
                 mini_topology = topologies[topology_idx]
@@ -183,7 +183,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 start_idx = subcircuit.get_Parameter_Start_Index()
                 end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
                 subcircuit_parameters = parameters[ start_idx:end_idx ]
-                k = subcircuit.get_Qbit_Num()
                 qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
                 involved_qbits = subcircuit.get_Qbits()
 
@@ -200,25 +199,74 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         return optimized_partitions
 
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
-        optimized_partitions, preparation_parts = self.SynthesizeWideCircuit(circ, orig_parameters)
+        optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
+        F = self.get_initial_layer(IDAG, circ.get_Qbit_Num())
+        partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,optimized_partitions,D)
+        final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions)
+        return final_circuit, final_parameters, pi, pi_final
 
-    def Heuristic_Search(self, F, pi, DAG, IDAG):
+    def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
         resolved_partitions = [False] * len(DAG)
         partition_order = []
-        execute_gate_list = []
-        E = self.generate_E(F, DAG, IDAG, resolved_partitions)
         while len(F) != 0:
             scores = []
-            partition_candidates = self.obtain_partition_candidates(F)
+            partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
             if len(partition_candidates) != 0:
                 for partition_candidate in partition_candidates:
-                    score = self.score_partition_candidate(partition_candidate, F, E, pi, DAG, IDAG, resolved_partitions)
+                    score = self.score_partition_candidate(partition_candidate, F, pi, D)
                     scores.append(score)
             min_idx = np.argmin(scores)
             min_partition_candidate = partition_candidates[min_idx]
+            F.remove(min_partition_candidate.partition_idx)
+            resolved_partitions[min_partition_candidate.partition_idx] = True
+            partition_order.append(permutation_to_cnot_circuit(pi, min_partition_candidate.transform_pi_input(pi)))
+            pi = min_partition_candidate.transform_pi_input(pi)
+            partition_order.append(min_partition_candidate)
+            pi = min_partition_candidate.transform_pi_output(pi)
+            children = DAG[min_partition_candidate.partition_idx][1]
+            while len(children) != 0:
+                child = children.pop(0)
+                if not resolved_partitions[child] and child not in F:
+                    if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                        child_partition = optimized_partitions[child]
+                        qubit = child_partition.circuit.get_Qbits()[0]
+                        child_partition.circuit.map_circuit({qubit: pi[qubit]})
+                        partition_order.append(child_partition.circuit)
+                        children.append(DAG[child][1])
+                    else:
+                        F.append(child)
+        return partition_order, pi
+
+    def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
+        final_circuit = Circuit(N)
+        final_parameters = []
+        for part in partition_order:
+            if isinstance(part, Circuit):
+                final_circuit.add_Circuit(part)
+            elif isinstance(part, SingleQubitPartitionResult):
+                final_circuit.add_Circuit(part.circuit)
+                final_parameters.append(part.parameters)
+            else:
+                part_circ, part_parameters = part.get_final_circuit(optimized_partitions,N)
+                final_circuit.add_Circuit(part_circ)
+                final_parameters.append(part_parameters)
+        final_parameters = np.concatenate(final_parameters,axis=0)
+        return final_circuit, final_parameters
+
+    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions):
+        score = 0 
+        input_perm = partition_candidate.transform_pi_input(pi)
+        output_perm = partition_candidate.transform_pi_output(input_perm)
+        score += permutation_to_cnot_circuit(pi,input_perm)
+        score += len(partition_candidate.circuit_structure)
+        for partition_idx in F:
+            partition_structure = optimized_partitions[partition_idx].get_original_circuit_structure()
+            for qbits in partition_structure:
+                score += self.D[output_perm[qbits[0]]][output_perm[qbits[1]]]
+        return score
 
     def obtain_partition_candidates(self, F, optimized_partitions):
         partition_candidates = []
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 8b8af2fa2..0bf3cdf6e 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -1,7 +1,7 @@
 import numpy as np
 from typing import List, Tuple, Set, FrozenSet
 from itertools import permutations
-
+from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
 def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
     adj_list = {}
     for u, v in edges:
@@ -16,7 +16,7 @@ def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
 def _get_induced_edges(edges: List[Tuple[int, int]], qubit_subset: Set[int]) -> List[Tuple[int, int]]:
     return [edge for edge in edges if edge[0] in qubit_subset and edge[1] in qubit_subset]
 
-def _dfs_enumerate(adj_list: dict, k: int, callback: Callable[[Set[int]], None]):
+def _dfs_enumerate(adj_list: dict, k: int, callback):
     all_qubits = sorted(adj_list.keys())
     seen = set()
     def dfs(current_qubits: Set[int], candidates: Set[int]):
@@ -130,6 +130,38 @@ def min_cnots_between_permutations(A, B):
     
     return total_cnots
 
+def permutation_to_cnot_circuit(A, B):
+    n = len(A)
+    inv_B = {qubit: pos for pos, qubit in enumerate(B)}
+    P = [inv_B[A[i]] for i in range(n)]
+    
+    visited = [False] * n
+    cnot_circuit = Circuit(n)
+    
+    for i in range(n):
+        if not visited[i]:
+            # Extract cycle
+            cycle = []
+            j = i
+            while not visited[j]:
+                visited[j] = True
+                cycle.append(j)
+                j = P[j]
+            
+            # Convert cycle to CNOTs
+            k = len(cycle)
+            if k == 2:
+                cnot_circuit.add_CNOT(cycle[1], cycle[0])
+            elif k >= 3:
+                # Forward pass
+                for idx in range(k - 1):
+                    cnot_circuit.add_CNOT(cycle[idx + 1], cycle[idx])
+                # Backward pass
+                for idx in range(k - 2, 0, -1):
+                    cnot_circuit.add_CNOT(cycle[idx + 1], cycle[idx])
+    
+    return cnot_circuit
+
 def find_best_permutation_with_constraints(A, constraints, strategy='greedy'):
     n = len(A)
     B = [None] * n
@@ -175,21 +207,31 @@ def __init__(self,circuit_in,parameters_in):
     
     def get_partition_synthesis_score(self):
         return 0
-
+# Virtual qubits q, reduced virtual qubits (the remapped circuit only up to partition_size) q*
+# Physical qubits Q, reduced physical qubits Q* 
 class PartitionSynthesisResult:
     
-    def __init__(self, N , mini_topologies, involved_qbits, qubit_map):
+    def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circuit):
+        #The physical mini_topology of the partition q*
         self.mini_topologies = mini_topologies
+        #number of topologies
         self.topology_count = len(mini_topologies)
+        #Qubit num of the partition
         self.N = N
+        # P_o and P_i pairs q*->Q*
         self.permutations_pairs = [[] for _ in range(len(mini_topologies))]
+        # results of synthesis
         self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
         self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
         self.cnot_counts = [[] for _ in range(len(mini_topologies))]
         self.circuit_structures = [[] for _ in range(len(mini_topologies))]
+        # Involved q qubits on the circuit
         self.involved_qbits = involved_qbits
+        # q->q*
         self.qubit_map = qubit_map
-    
+        # the original circuit
+        self.original_circuit = original_circuit
+
     def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
         self.permutations_pairs[topology_idx].append(permutations_pair)
         self.synthesised_circuits[topology_idx].append(synthesised_circuit)
@@ -200,7 +242,7 @@ def add_result(self, permutations_pair, synthesised_circuit, synthesised_paramet
     def extract_circuit_structure(self, circuit):
         circuit_structure = []
         for gate in circuit.get_Gates():
-            gate.get_involved_qubits()
+            involved_qbits = gate.get_involved_qubits()
             if len(involved_qbits) != 1:
                 circuit_structure.append(involved_qbits)
         return circuit_structure
@@ -209,6 +251,17 @@ def get_best_result(self, topology_idx):
         best_index = np.argmin(self.cnot_counts[topology_idx])
         return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
     
+    #get the circuit structure in q 
+    def get_original_circuit_structure(self):
+        #q*->q
+        qbit_map_inverse = {v:k for k,v in self.qubit_map.items()}
+        circuit_structure = []
+        for gate in self.original_circuit.get_Gates():
+            involved_qbits = gate.get_involved_qubits()
+            if len(involved_qbits) != 1:
+                circuit_structure.append(qbit_map_inverse[involved_qbits[0]],qbit_map_inverse[involved_qbits[1]])
+        return circuit_structure
+        
     def get_partition_synthesis_score(self):
         score = 0
         for topology_idx in range(self.topology_count):
@@ -222,23 +275,43 @@ def get_partition_synthesis_score(self):
 class PartitionCandidate:
     
     def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits):
+        #Which partition does this belong to
         self.partition_idx = partition_idx
+        #the index of the q* topology
         self.topology_idx = topology_idx
+        #the index of the P_i and P_o pair
         self.permutation_idx = permutation_idx
+        # the structure of the circuit in q*
         self.circuit_structure = circuit_structure
+        # permutations in q*->Q*
         self.P_i = P_i
         self.P_o = P_o
+        #The mini_topology in Q
         self.topology = topology
+        #The mini topology in q*
         self.mini_topology = mini_topology
+        # q->q*
         self.qbit_map = qbit_map
+        # q belonging to the original circuit
         self.involved_qbits = involved_qbits
+        # q->Q*
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
     def transform_pi_input(self, pi):
-
+        #Q->q
         qbit_map_swapped = {self.node_mapping[self.P_i.index(v)]: k for k, v in self.qbit_map.items()}
         return find_best_permutation_with_constraints(pi, qbit_map_swapped)
 
     def transform_pi_output(self, pi):
+        #Q->q
         qbit_map_swapped = {self.node_mapping[self.P_o.index(v)]: k for k, v in self.qbit_map.items()}
-        return find_best_permutation_with_constraints(pi, qbit_map_swapped)
\ No newline at end of file
+        return find_best_permutation_with_constraints(pi, qbit_map_swapped)
+
+    def get_final_circuit(self,optimized_partitions,N):
+        partition = optimized_partitions[self.partition_idx]
+        part_parameters = partition.synthesized_parameters[self.topology_idx][self.permutation_idx]
+        part_circuit = partition.synthesized_circuits[self.topology_idx][self.permutation_idx]
+        qbit_map_swapped = {k : self.node_mapping[self.P_i.index(v)] for k, v in self.qbit_map.items()}
+        part_circuit.Remap_Qbits(qbit_map_swapped,N)
+        return part_circuit, part_parameters
+    
\ No newline at end of file

From 0207216667377655df62d1f6ee67489a12c225e0 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Fri, 14 Nov 2025 16:17:14 +0100
Subject: [PATCH 020/232] Refactor PartAM_example and PartAM to improve
 partition mapping and update circuit file paths

---
 examples/decomposition/PartAM_example.py |  9 +++------
 squander/synthesis/PartAM.py             | 25 +++++++++++-------------
 squander/synthesis/PartAM_utils.py       |  6 +++---
 3 files changed, 17 insertions(+), 23 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index ed44c59c8..6dd908a5e 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -35,21 +35,18 @@
             'test_subcircuits': True,
             'test_final_circuit': True,
             'max_partition_size': 3,
-            'minimum_partition_size': 2,  # Minimum qubits per partition (default: 2)
     }
 
-    filename = "benchmarks/qfast/5q/vqe.qasm"
+    filename = "benchmarks/qfast/4q/adder_q4.qasm"
     start_time = time.time()
 
     # load the circuit from a file
     circ, parameters = utils.qasm_to_squander_circuit(filename)
     config['topology'] = [
-    (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-    (8, 9), (8, 10), (8, 11), (8, 12), (8, 13), (8, 14), (8, 15),
-    (0, 8),
+    (0, 1), (0, 2), (0, 3), 
     ]
     wide_circuit_optimizer = Partition_Aware_Mapping( config )
-    wide_circuit_optimizer.SynthesizeWideCircuit( circ, parameters )
+    wide_circuit_optimizer.Partition_Aware_Mapping( circ, parameters )
 
     print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index b968d2b85..2db0df2df 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -23,13 +23,10 @@
 from collections import deque, defaultdict
 import numpy as np
 
-from squander.partitioning.partition import PartitionCircuit
-from squander.partitioning.tools import get_qubits
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
-from itertools import product
 from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, 
-SingleQubitPartitionResult, PartitionSynthesisResult, min_cnots_between_permutations, 
-PartitionCandidate, get_node_mapping, permutation_to_cnot_circuit)
+SingleQubitPartitionResult, PartitionSynthesisResult, 
+PartitionCandidate, permutation_to_cnot_circuit, min_cnots_between_permutations)
 
 class qgd_Partition_Aware_Mapping:
 
@@ -203,7 +200,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
-        F = self.get_initial_layer(IDAG, circ.get_Qbit_Num())
+        F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
         partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,optimized_partitions,D)
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions)
         return final_circuit, final_parameters, pi, pi_final
@@ -216,7 +213,7 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
             if len(partition_candidates) != 0:
                 for partition_candidate in partition_candidates:
-                    score = self.score_partition_candidate(partition_candidate, F, pi, D)
+                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, D)
                     scores.append(score)
             min_idx = np.argmin(scores)
             min_partition_candidate = partition_candidates[min_idx]
@@ -256,11 +253,11 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         final_parameters = np.concatenate(final_parameters,axis=0)
         return final_circuit, final_parameters
 
-    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions):
+    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, D):
         score = 0 
         input_perm = partition_candidate.transform_pi_input(pi)
         output_perm = partition_candidate.transform_pi_output(input_perm)
-        score += permutation_to_cnot_circuit(pi,input_perm)
+        score += min_cnots_between_permutations(pi,input_perm)
         score += len(partition_candidate.circuit_structure)
         for partition_idx in F:
             partition_structure = optimized_partitions[partition_idx].get_original_circuit_structure()
@@ -272,20 +269,20 @@ def obtain_partition_candidates(self, F, optimized_partitions):
         partition_candidates = []
         for partition_idx in F:
             partition = optimized_partitions[partition_idx]
-            for tdx, mini_topology in partition.mini_topologies:
+            for tdx, mini_topology in enumerate(partition.mini_topologies):
                 topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
                 for topology_candidate in topology_candidates:
-                    for pdx, permutation_pair in enumerate(partition.permutation_pairs[tdx]):
-                        partition_candidates.append([PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qbit_map,partition.involved_qbits)])
+                    for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
+                        partition_candidates.append(PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits))
         return partition_candidates
         
-    def get_initial_layer(self, IDAG, N):
+    def get_initial_layer(self, IDAG, N, optimized_partitions):
         initial_layer = []
         active_qbits = list(range(N))
         for idx in range(len(IDAG)):
             if len(IDAG[idx][1]) == 0:
                 initial_layer.append(idx)
-                for qbit in IDAG[idx][0].involved_qbits:
+                for qbit in optimized_partitions[IDAG[idx][0]].involved_qbits:
                     active_qbits.remove(qbit)
             if len(active_qbits) == 0:
                 break
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 0bf3cdf6e..9b58be581 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -242,7 +242,7 @@ def add_result(self, permutations_pair, synthesised_circuit, synthesised_paramet
     def extract_circuit_structure(self, circuit):
         circuit_structure = []
         for gate in circuit.get_Gates():
-            involved_qbits = gate.get_involved_qubits()
+            involved_qbits = gate.get_Involved_Qbits()
             if len(involved_qbits) != 1:
                 circuit_structure.append(involved_qbits)
         return circuit_structure
@@ -257,9 +257,9 @@ def get_original_circuit_structure(self):
         qbit_map_inverse = {v:k for k,v in self.qubit_map.items()}
         circuit_structure = []
         for gate in self.original_circuit.get_Gates():
-            involved_qbits = gate.get_involved_qubits()
+            involved_qbits = gate.get_Involved_Qbits()
             if len(involved_qbits) != 1:
-                circuit_structure.append(qbit_map_inverse[involved_qbits[0]],qbit_map_inverse[involved_qbits[1]])
+                circuit_structure.append((qbit_map_inverse[involved_qbits[0]],qbit_map_inverse[involved_qbits[1]]))
         return circuit_structure
         
     def get_partition_synthesis_score(self):

From 33a15e826d477edf8eddf5a4157840654f682ab4 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 14 Nov 2025 16:36:53 +0100
Subject: [PATCH 021/232] Rework scoring

---
 squander/synthesis/PartAM.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2db0df2df..6f1bc2072 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -213,7 +213,7 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
             if len(partition_candidates) != 0:
                 for partition_candidate in partition_candidates:
-                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, D)
+                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions)
                     scores.append(score)
             min_idx = np.argmin(scores)
             min_partition_candidate = partition_candidates[min_idx]
@@ -253,16 +253,23 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         final_parameters = np.concatenate(final_parameters,axis=0)
         return final_circuit, final_parameters
 
-    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, D):
+    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions):
         score = 0 
         input_perm = partition_candidate.transform_pi_input(pi)
         output_perm = partition_candidate.transform_pi_output(input_perm)
         score += min_cnots_between_permutations(pi,input_perm)
         score += len(partition_candidate.circuit_structure)
         for partition_idx in F:
-            partition_structure = optimized_partitions[partition_idx].get_original_circuit_structure()
-            for qbits in partition_structure:
-                score += self.D[output_perm[qbits[0]]][output_perm[qbits[1]]]
+            partition = optimized_partitions[partition_idx]
+            mini_scores = []
+            for tdx, mini_topology in enumerate(partition.mini_topologies):
+                topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                for topology_candidate in topology_candidates:
+                    for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
+                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
+                        mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+            score += min(mini_scores)
+
         return score
 
     def obtain_partition_candidates(self, F, optimized_partitions):

From 6136c6fa18d8e22c44190a443edf3d6028e38575 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Fri, 14 Nov 2025 17:20:15 +0100
Subject: [PATCH 022/232] Fix minor bugs

---
 squander/synthesis/PartAM.py       | 4 ++--
 squander/synthesis/PartAM_utils.py | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6f1bc2072..768a0be87 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -202,7 +202,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
         partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,optimized_partitions,D)
-        final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions)
+        final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
         return final_circuit, final_parameters, pi, pi_final
 
     def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
@@ -267,7 +267,7 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                        mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                        mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
             score += min(mini_scores)
 
         return score
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 9b58be581..cff896bd6 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -308,10 +308,11 @@ def transform_pi_output(self, pi):
         return find_best_permutation_with_constraints(pi, qbit_map_swapped)
 
     def get_final_circuit(self,optimized_partitions,N):
+        print(self.node_mapping,self.qbit_map,self.involved_qbits)
         partition = optimized_partitions[self.partition_idx]
-        part_parameters = partition.synthesized_parameters[self.topology_idx][self.permutation_idx]
-        part_circuit = partition.synthesized_circuits[self.topology_idx][self.permutation_idx]
-        qbit_map_swapped = {k : self.node_mapping[self.P_i.index(v)] for k, v in self.qbit_map.items()}
+        part_parameters = partition.synthesised_parameters[self.topology_idx][self.permutation_idx]
+        part_circuit = partition.synthesised_circuits[self.topology_idx][self.permutation_idx]
+        qbit_map_swapped = {v : self.node_mapping[self.P_i.index(v)] for k, v in self.qbit_map.items()}
         part_circuit.Remap_Qbits(qbit_map_swapped,N)
         return part_circuit, part_parameters
     
\ No newline at end of file

From ecb52fcc6319dab29272edac8fc30bea7322a479 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 20 Nov 2025 17:03:26 +0100
Subject: [PATCH 023/232] Add device checking and extended gate set

---
 squander/synthesis/PartAM.py       | 87 ++++++++++++++++++++++++++----
 squander/synthesis/PartAM_utils.py | 15 +++++-
 2 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6f1bc2072..574f944b5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -26,7 +26,7 @@
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, 
 SingleQubitPartitionResult, PartitionSynthesisResult, 
-PartitionCandidate, permutation_to_cnot_circuit, min_cnots_between_permutations)
+PartitionCandidate, permutation_to_cnot_circuit, min_cnots_between_permutations, check_circuit_compatibility)
 
 class qgd_Partition_Aware_Mapping:
 
@@ -203,6 +203,8 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
         partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,optimized_partitions,D)
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions)
+        if not check_circuit_compatibility(final_circuit, self.topology):
+            raise Exception("The final circuit is not compatible with the topology.")
         return final_circuit, final_parameters, pi, pi_final
 
     def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
@@ -213,7 +215,7 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
             if len(partition_candidates) != 0:
                 for partition_candidate in partition_candidates:
-                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions)
+                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, DAG)
                     scores.append(score)
             min_idx = np.argmin(scores)
             min_partition_candidate = partition_candidates[min_idx]
@@ -253,12 +255,47 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         final_parameters = np.concatenate(final_parameters,axis=0)
         return final_circuit, final_parameters
 
-    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions):
-        score = 0 
+    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, DAG):
+        score_F = 0
+        score_E = 0
+        E_visited_partitions = []
         input_perm = partition_candidate.transform_pi_input(pi)
         output_perm = partition_candidate.transform_pi_output(input_perm)
-        score += min_cnots_between_permutations(pi,input_perm)
-        score += len(partition_candidate.circuit_structure)
+        score_F += min_cnots_between_permutations(pi,input_perm)
+        score_F += len(partition_candidate.circuit_structure)
+        for partition_idx in DAG[partition_candidate.partition_idx][1]:
+            if not isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
+                if partition_idx in E_visited_partitions:
+                    continue
+                E_visited_partitions.append(partition_idx)
+                mini_scores = []
+                for tdx, mini_topology in enumerate(optimized_partitions[partition_idx].mini_topologies):
+                    topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                    for topology_candidate in topology_candidates:
+                        for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx].permutations_pairs[tdx]):
+                            new_cand = PartitionCandidate(partition_idx,tdx,pdx,optimized_partitions[partition_idx].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx].qubit_map,optimized_partitions[partition_idx].involved_qbits)
+                            mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                score_E += min(mini_scores)
+            else:
+                is_single_qubit = True
+                partition_idx_new = DAG[partition_idx][1]
+                while is_single_qubit and len(partition_idx_new) != 0:
+                    is_single_qubit = isinstance(optimized_partitions[partition_idx_new[0]], SingleQubitPartitionResult)
+                    partition_idx_new = DAG[partition_idx_new[0]][1]
+                if len(partition_idx_new) != 0 and not is_single_qubit:
+                    partition_idx_new = partition_idx_new[0]
+                    if partition_idx_new in E_visited_partitions:
+                        continue
+                    E_visited_partitions.append(partition_idx_new)
+                    mini_scores = []
+                    for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_new].mini_topologies):
+                        topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                        for topology_candidate in topology_candidates:
+                            for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_new].permutations_pairs[tdx]):
+                                new_cand = PartitionCandidate(partition_idx_new,tdx,pdx,optimized_partitions[partition_idx_new].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_new].qubit_map,optimized_partitions[partition_idx_new].involved_qbits)
+                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                    score_E += min(mini_scores)
+                
         for partition_idx in F:
             partition = optimized_partitions[partition_idx]
             mini_scores = []
@@ -268,9 +305,41 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
                         mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
-            score += min(mini_scores)
-
-        return score
+            score_F += min(mini_scores)
+            for partition_idx_E in DAG[partition_idx][1]:
+                if not isinstance(optimized_partitions[partition_idx_E], SingleQubitPartitionResult):
+                    if partition_idx_E in E_visited_partitions:
+                        continue
+                    E_visited_partitions.append(partition_idx_E)
+                    mini_scores = []
+                    for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_E].mini_topologies):
+                        topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                        for topology_candidate in topology_candidates:
+                            for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
+                                new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
+                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                    score_E += min(mini_scores)
+                else:
+                    is_single_qubit = True
+                    partition_idx_new = DAG[partition_idx_E][1]
+                    while is_single_qubit and len(partition_idx_new) != 0:
+                        is_single_qubit = isinstance(optimized_partitions[partition_idx_new[0]], SingleQubitPartitionResult)
+                        partition_idx_new = DAG[partition_idx_new[0]][1]
+                    if len(partition_idx_new) != 0 and not is_single_qubit:
+                        partition_idx_new = partition_idx_new[0]
+                        if partition_idx_E in E_visited_partitions:
+                            continue
+                        E_visited_partitions.append(partition_idx_E)
+                        mini_scores = []
+                        for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_E].mini_topologies):
+                            topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                            for topology_candidate in topology_candidates:
+                                for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
+                                    new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
+                                    mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                        score_E += min(mini_scores)
+
+        return 0.2*score_E/len(E_visited_partitions) + score_F/len(F)
 
     def obtain_partition_candidates(self, F, optimized_partitions):
         partition_candidates = []
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 9b58be581..ce4e94990 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -314,4 +314,17 @@ def get_final_circuit(self,optimized_partitions,N):
         qbit_map_swapped = {k : self.node_mapping[self.P_i.index(v)] for k, v in self.qbit_map.items()}
         part_circuit.Remap_Qbits(qbit_map_swapped,N)
         return part_circuit, part_parameters
-    
\ No newline at end of file
+
+def check_circuit_compatibility(circuit: Circuit, topology):
+    circuit_topology = []
+    gates = circuit.get_Gates()
+    for gate in gates:
+        qubits = gate.get_Involved_Qbits()
+        if len(qubits) != 1:
+            qubits = tuple(qubits)
+            if qubits not in circuit_topology and qubits[::-1] not in circuit_topology:
+                circuit_topology.append(qubits)
+    for qubits in circuit_topology:
+        if qubits not in topology and qubits[::-1] not in topology:
+            return False
+    return True
\ No newline at end of file

From 3a74492990e1e531eb502dc67a04d1eba3f6409a Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 20 Nov 2025 17:07:13 +0100
Subject: [PATCH 024/232] rework example

---
 examples/decomposition/PartAM_example.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 6dd908a5e..96769f4df 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -46,8 +46,8 @@
     (0, 1), (0, 2), (0, 3), 
     ]
     wide_circuit_optimizer = Partition_Aware_Mapping( config )
-    wide_circuit_optimizer.Partition_Aware_Mapping( circ, parameters )
-
+    circ, params, _,_ = wide_circuit_optimizer.Partition_Aware_Mapping( circ, parameters )
+    print(Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params))
     print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
 
 

From d2f477b5c19a07b3b232e921d1490b600c230f3b Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 20 Nov 2025 17:14:17 +0100
Subject: [PATCH 025/232] Resolve merge error

---
 squander/synthesis/PartAM.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index f8a244b62..6f779a136 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -274,7 +274,7 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                     for topology_candidate in topology_candidates:
                         for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx].permutations_pairs[tdx]):
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,optimized_partitions[partition_idx].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx].qubit_map,optimized_partitions[partition_idx].involved_qbits)
-                            mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                            mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
                 score_E += min(mini_scores)
             else:
                 is_single_qubit = True
@@ -293,7 +293,7 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                         for topology_candidate in topology_candidates:
                             for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_new].permutations_pairs[tdx]):
                                 new_cand = PartitionCandidate(partition_idx_new,tdx,pdx,optimized_partitions[partition_idx_new].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_new].qubit_map,optimized_partitions[partition_idx_new].involved_qbits)
-                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
                     score_E += min(mini_scores)
                 
         for partition_idx in F:
@@ -317,7 +317,7 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                         for topology_candidate in topology_candidates:
                             for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
                                 new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
-                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
                     score_E += min(mini_scores)
                 else:
                     is_single_qubit = True
@@ -336,7 +336,7 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                             for topology_candidate in topology_candidates:
                                 for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
                                     new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
-                                    mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(min_cnots_between_permutations))+len(new_cand.circuit_structure))
+                                    mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
                         score_E += min(mini_scores)
 
         return 0.2*score_E/len(E_visited_partitions) + score_F/len(F)

From db30f89c6f038ccfe15575fcdf56c5ea4579bf12 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 20 Nov 2025 17:47:06 +0100
Subject: [PATCH 026/232] Fix PartAM utils and add consistent naming to
 qgd_Circuit

---
 squander/gates/qgd_Circuit.py      |  3 +++
 squander/synthesis/PartAM_utils.py | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/squander/gates/qgd_Circuit.py b/squander/gates/qgd_Circuit.py
index bd0824cfc..c3b58adf2 100644
--- a/squander/gates/qgd_Circuit.py
+++ b/squander/gates/qgd_Circuit.py
@@ -512,6 +512,9 @@ def get_Qbits(self):
     
         return super().get_Qbits()
 
+    def get_Involved_Qbits(self):
+
+        return super().get_Qbits()
 
 #@brief Call to set hte min fusion in the circuit
 #@param Input arguments: min_fusion
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 2d113db52..0886f03d9 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -308,7 +308,6 @@ def transform_pi_output(self, pi):
         return find_best_permutation_with_constraints(pi, qbit_map_swapped)
 
     def get_final_circuit(self,optimized_partitions,N):
-        print(self.node_mapping,self.qbit_map,self.involved_qbits)
         partition = optimized_partitions[self.partition_idx]
         part_parameters = partition.synthesised_parameters[self.topology_idx][self.permutation_idx]
         part_circuit = partition.synthesised_circuits[self.topology_idx][self.permutation_idx]
@@ -321,10 +320,21 @@ def check_circuit_compatibility(circuit: Circuit, topology):
     gates = circuit.get_Gates()
     for gate in gates:
         qubits = gate.get_Involved_Qbits()
-        if len(qubits) != 1:
+        if len(qubits) == 1:
+            continue
+        elif len(qubits) == 2:
             qubits = tuple(qubits)
             if qubits not in circuit_topology and qubits[::-1] not in circuit_topology:
                 circuit_topology.append(qubits)
+        else:
+            gates_new = gate.get_Gates()
+            for gate_new in gates_new:
+                qubits_new = gate_new.get_Involved_Qbits()
+                if len(qubits_new)==1:
+                    continue
+                qubits_new = tuple(qubits_new)
+                if qubits_new not in circuit_topology and qubits_new[::-1] not in circuit_topology:
+                    circuit_topology.append(qubits_new)
     for qubits in circuit_topology:
         if qubits not in topology and qubits[::-1] not in topology:
             return False

From 01737edfc51f5c725323d89d9595641bbd91d0f4 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 20 Nov 2025 17:47:17 +0100
Subject: [PATCH 027/232] Add state checking to example

---
 examples/decomposition/PartAM_example.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 96769f4df..7e8b0c086 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -41,13 +41,31 @@
     start_time = time.time()
 
     # load the circuit from a file
-    circ, parameters = utils.qasm_to_squander_circuit(filename)
+    circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
     config['topology'] = [
     (0, 1), (0, 2), (0, 3), 
     ]
     wide_circuit_optimizer = Partition_Aware_Mapping( config )
-    circ, params, _,_ = wide_circuit_optimizer.Partition_Aware_Mapping( circ, parameters )
-    print(Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params))
+    circ, params, input_perm,output_perm = wide_circuit_optimizer.Partition_Aware_Mapping( circ_orig, parameters_orig )
+    #print(Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params))
+    num_qubits = circ.get_Qbit_Num() 
+    matrix_size = 1 << num_qubits 
+    initial_state_real = np.random.uniform(-1.0,1.0, (matrix_size,) )
+    initial_state_imag = np.random.uniform(-1.0,1.0, (matrix_size,) )
+    initial_state = initial_state_real + initial_state_imag*1j
+    initial_state = initial_state/np.linalg.norm(initial_state)
+    original_state = initial_state.copy()
+    circ_orig.apply_to(parameters_orig,original_state)
+    circ_Final = Circuit(circ.get_Qbit_Num() )
+    circ_Final.add_Permutation(input_perm)
+    circ_Final.add_Circuit(circ)
+    output_perm_T = [0]* circ.get_Qbit_Num() 
+    for i, j in enumerate(output_perm):
+        output_perm_T[j] = i
+    circ_Final.add_Permutation(output_perm)
+    PartAM_state = initial_state.copy()
+    circ_Final.apply_to(params,PartAM_state)
+    print(f"Decomposition error on random state: {1-abs(np.vdot(PartAM_state,original_state))}")
     print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
 
 

From ce7e1c73815e5dbf1b2209b2be8771facb5d9ddd Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 20 Nov 2025 17:51:29 +0100
Subject: [PATCH 028/232] Remake example

---
 examples/decomposition/PartAM_example.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 7e8b0c086..9dddca975 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -57,12 +57,12 @@
     original_state = initial_state.copy()
     circ_orig.apply_to(parameters_orig,original_state)
     circ_Final = Circuit(circ.get_Qbit_Num() )
-    circ_Final.add_Permutation(input_perm)
-    circ_Final.add_Circuit(circ)
     output_perm_T = [0]* circ.get_Qbit_Num() 
     for i, j in enumerate(output_perm):
         output_perm_T[j] = i
-    circ_Final.add_Permutation(output_perm)
+    circ_Final.add_Permutation(input_perm)
+    circ_Final.add_Circuit(circ)
+    circ_Final.add_Permutation(output_perm_T)
     PartAM_state = initial_state.copy()
     circ_Final.apply_to(params,PartAM_state)
     print(f"Decomposition error on random state: {1-abs(np.vdot(PartAM_state,original_state))}")

From 29dd8061574904740f9a051c9becd3289508667f Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Fri, 21 Nov 2025 11:31:51 +0100
Subject: [PATCH 029/232] Rework comments

---
 squander/synthesis/PartAM.py       |  4 ++--
 squander/synthesis/PartAM_utils.py | 20 +++++++++++---------
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6f779a136..5ba9c5de7 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -233,8 +233,8 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
                         child_partition = optimized_partitions[child]
                         qubit = child_partition.circuit.get_Qbits()[0]
                         child_partition.circuit.map_circuit({qubit: pi[qubit]})
-                        partition_order.append(child_partition.circuit)
-                        children.append(DAG[child][1])
+                        partition_order.append(child_partition)
+                        children.extend(DAG[child][1])
                     else:
                         F.append(child)
         return partition_order, pi
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 0886f03d9..e3d46103e 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -218,7 +218,8 @@ def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circ
         self.topology_count = len(mini_topologies)
         #Qubit num of the partition
         self.N = N
-        # P_o and P_i pairs q*->Q*
+        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc 
+        # P_o in Q*->q* permutation pattern [Q*1 Q*0 Q*2] This means that the current output of Q*1 is equal to q*0
         self.permutations_pairs = [[] for _ in range(len(mini_topologies))]
         # results of synthesis
         self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
@@ -227,7 +228,7 @@ def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circ
         self.circuit_structures = [[] for _ in range(len(mini_topologies))]
         # Involved q qubits on the circuit
         self.involved_qbits = involved_qbits
-        # q->q*
+        # {q:q*}
         self.qubit_map = qubit_map
         # the original circuit
         self.original_circuit = original_circuit
@@ -277,24 +278,25 @@ class PartitionCandidate:
     def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits):
         #Which partition does this belong to
         self.partition_idx = partition_idx
-        #the index of the q* topology
+        #the index of the Q* topology
         self.topology_idx = topology_idx
         #the index of the P_i and P_o pair
         self.permutation_idx = permutation_idx
-        # the structure of the circuit in q*
+        # the structure of the circuit in Q*
         self.circuit_structure = circuit_structure
-        # permutations in q*->Q*
+        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc 
         self.P_i = P_i
+        # P_o in Q*->q* permutation pattern [Q*1 Q*0 Q*2] This means that the current output of Q*1 is equal to q*0
         self.P_o = P_o
         #The mini_topology in Q
         self.topology = topology
-        #The mini topology in q*
+        #The mini topology in Q*
         self.mini_topology = mini_topology
-        # q->q*
+        # {q:q*}
         self.qbit_map = qbit_map
         # q belonging to the original circuit
         self.involved_qbits = involved_qbits
-        # q->Q*
+        # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
     def transform_pi_input(self, pi):
@@ -312,7 +314,7 @@ def get_final_circuit(self,optimized_partitions,N):
         part_parameters = partition.synthesised_parameters[self.topology_idx][self.permutation_idx]
         part_circuit = partition.synthesised_circuits[self.topology_idx][self.permutation_idx]
         qbit_map_swapped = {v : self.node_mapping[self.P_i.index(v)] for k, v in self.qbit_map.items()}
-        part_circuit.Remap_Qbits(qbit_map_swapped,N)
+        part_circuit = part_circuit.Remap_Qbits(qbit_map_swapped,N)
         return part_circuit, part_parameters
 
 def check_circuit_compatibility(circuit: Circuit, topology):

From 9366bd4a82b2f52f02fe52358fc9ea4490e4d00e Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 21 Nov 2025 12:40:48 +0100
Subject: [PATCH 030/232] Add temporary diagnostics

---
 examples/decomposition/PartAM_example.py |  60 ++++++-
 squander/synthesis/PartAM.py             | 196 +++++++++++++++++++++--
 2 files changed, 242 insertions(+), 14 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 9dddca975..826a6ae6d 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -35,6 +35,7 @@
             'test_subcircuits': True,
             'test_final_circuit': True,
             'max_partition_size': 3,
+            'diagnostics': True,  # Enable diagnostic output
     }
 
     filename = "benchmarks/qfast/4q/adder_q4.qasm"
@@ -60,12 +61,67 @@
     output_perm_T = [0]* circ.get_Qbit_Num() 
     for i, j in enumerate(output_perm):
         output_perm_T[j] = i
+    # Validate permutation inverse calculation
+    if config.get('diagnostics', False):
+        print(f"\n{'='*70}")
+        print(f"Permutation Validation")
+        print(f"{'='*70}")
+        print(f"input_perm (initial pi): {input_perm}")
+        print(f"output_perm (final pi): {output_perm}")
+        
+        # Compute inverse
+        output_perm_T = [0] * circ.get_Qbit_Num()
+        for i, j in enumerate(output_perm):
+            output_perm_T[j] = i
+        print(f"output_perm_T (inverse): {output_perm_T}")
+        
+        # Verify inverse: output_perm_T[output_perm[i]] should equal i
+        test_inverse = [output_perm_T[output_perm[i]] for i in range(len(output_perm))]
+        if test_inverse != list(range(len(output_perm))):
+            print(f"  ERROR: Inverse calculation is WRONG!")
+            print(f"  Expected: {list(range(len(output_perm)))}")
+            print(f"  Got: {test_inverse}")
+        else:
+            print(f"  Inverse verified: OK")
+    
+    if not config.get('diagnostics', False):
+        output_perm_T = [0] * circ.get_Qbit_Num()
+        for i, j in enumerate(output_perm):
+            output_perm_T[j] = i
+    
     circ_Final.add_Permutation(input_perm)
     circ_Final.add_Circuit(circ)
     circ_Final.add_Permutation(output_perm_T)
+    
+    # Additional matrix validation in example
+    if config.get('diagnostics', False):
+        try:
+            print(f"\n{'='*70}")
+            print(f"Final Circuit Matrix Validation")
+            print(f"{'='*70}")
+            orig_matrix = circ_orig.get_Matrix(parameters_orig)
+            final_matrix = circ_Final.get_Matrix(params)
+            matrix_error = np.linalg.norm(orig_matrix - final_matrix, 'fro')
+            print(f"Original vs Final circuit error: {matrix_error:.2e}")
+            
+            # Test without output permutation
+            circ_test = Circuit(circ.get_Qbit_Num())
+            circ_test.add_Permutation(input_perm)
+            circ_test.add_Circuit(circ)
+            test_matrix = circ_test.get_Matrix(params)
+            test_error = np.linalg.norm(orig_matrix - test_matrix, 'fro')
+            print(f"Without output perm error: {test_error:.2e}")
+        except Exception as e:
+            print(f"Matrix validation error: {e}")
+    
     PartAM_state = initial_state.copy()
-    circ_Final.apply_to(params,PartAM_state)
-    print(f"Decomposition error on random state: {1-abs(np.vdot(PartAM_state,original_state))}")
+    circ_Final.apply_to(params, PartAM_state)
+    state_error = 1 - abs(np.vdot(PartAM_state, original_state))
+    print(f"\n{'='*70}")
+    print(f"State Vector Validation")
+    print(f"{'='*70}")
+    print(f"Decomposition error on random state: {state_error:.10f}")
     print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
+    print(f"{'='*70}\n")
 
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 5ba9c5de7..7f38589a2 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -44,10 +44,16 @@ def __init__(self, config):
         self.config.setdefault('routed', False)
         self.config.setdefault('partition_strategy','ilp')
         self.config.setdefault('optimizer', 'BFGS')
+        self.config.setdefault('diagnostics', False)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
             raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
+    
+    def _diagnostic_print(self, *args, **kwargs):
+        """Print diagnostic information if diagnostics are enabled."""
+        if self.config.get('diagnostics', False):
+            print(*args, **kwargs)
 
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
@@ -159,6 +165,19 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis") ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
+        
+        # Diagnostic: Check first synthesis pass
+        if self.config.get('diagnostics', False):
+            self._diagnostic_print("\n=== Partition Synthesis Diagnostics (First Pass) ===")
+            for idx, opt_part in enumerate(optimized_results):
+                if isinstance(opt_part, SingleQubitPartitionResult):
+                    self._diagnostic_print(f"  Partition {idx}: Single-qubit")
+                else:
+                    self._diagnostic_print(f"  Partition {idx}: Multi-qubit, involved_qubits={opt_part.involved_qbits}")
+                    for tdx in range(opt_part.topology_count):
+                        if opt_part.cnot_counts[tdx]:
+                            min_cnots = min(opt_part.cnot_counts[tdx])
+                            self._diagnostic_print(f"    Topology {tdx}: min_CNOTs={min_cnots}, candidates={len(opt_part.cnot_counts[tdx])}")
 
         weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
@@ -193,23 +212,118 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis") ):
                 optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
+        
+        # Diagnostic: Check partition synthesis errors
+        if self.config.get('diagnostics', False):
+            self._diagnostic_print("\n=== Partition Synthesis Diagnostics (Second Pass) ===")
+            for idx, opt_part in enumerate(optimized_partitions):
+                if isinstance(opt_part, SingleQubitPartitionResult):
+                    self._diagnostic_print(f"  Partition {idx}: Single-qubit, involved_qubits={opt_part.circuit.get_Qbits()}")
+                else:
+                    self._diagnostic_print(f"  Partition {idx}: Multi-qubit, involved_qubits={opt_part.involved_qbits}, qubit_map={opt_part.qubit_map}")
+                    for tdx in range(opt_part.topology_count):
+                        if opt_part.cnot_counts[tdx]:
+                            min_cnots = min(opt_part.cnot_counts[tdx])
+                            avg_cnots = np.mean(opt_part.cnot_counts[tdx])
+                            self._diagnostic_print(f"    Topology {tdx}: min_CNOTs={min_cnots}, avg_CNOTs={avg_cnots:.2f}, candidates={len(opt_part.cnot_counts[tdx])}")
+        
         return optimized_partitions
 
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
+        self._diagnostic_print("\n" + "="*70)
+        self._diagnostic_print("PartAM: Starting Partition Aware Mapping")
+        self._diagnostic_print("="*70)
+        self._diagnostic_print(f"Original circuit: {circ.get_Qbit_Num()} qubits, {len(circ.get_Gates())} gates")
+        
         optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
+        self._diagnostic_print(f"\nSynthesized {len(optimized_partitions)} partitions")
+        
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
+        if self.config.get('diagnostics', False):
+            self._diagnostic_print("\n=== DAG Construction ===")
+            for idx in range(len(DAG)):
+                self._diagnostic_print(f"  Partition {idx}: parents={IDAG[idx][1]}, children={DAG[idx][1]}")
+        
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
+        pi_list = pi.tolist() if hasattr(pi, 'tolist') else list(pi)
+        self._diagnostic_print(f"\nInitial layout (pi): {pi_list}")
+        
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
+        self._diagnostic_print(f"Initial front set (F): {F}")
+        
         partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,optimized_partitions,D)
+        pi_final_list = pi_final.tolist() if hasattr(pi_final, 'tolist') else list(pi_final)
+        self._diagnostic_print(f"\nFinal permutation (pi_final): {pi_final_list}")
+        self._diagnostic_print(f"Partition order length: {len(partition_order)}")
+        
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
+        self._diagnostic_print(f"\nFinal circuit: {len(final_circuit.get_Gates())} gates, {len(final_parameters)} parameters")
+        
         if not check_circuit_compatibility(final_circuit, self.topology):
             raise Exception("The final circuit is not compatible with the topology.")
+        
+        # Matrix-level validation
+        if self.config.get('diagnostics', False):
+            try:
+                original_matrix = circ.get_Matrix(orig_parameters)
+                final_matrix = final_circuit.get_Matrix(final_parameters)
+                matrix_error = np.linalg.norm(original_matrix - final_matrix, 'fro')
+                self._diagnostic_print(f"\n=== Matrix Validation ===")
+                self._diagnostic_print(f"Original circuit matrix: {original_matrix.shape}")
+                self._diagnostic_print(f"Final circuit matrix: {final_matrix.shape}")
+                self._diagnostic_print(f"Matrix Frobenius norm error: {matrix_error:.2e}")
+                
+                # Check if matrices match (accounting for permutation)
+                # The final circuit should match original up to permutation
+                fidelity = abs(np.trace(original_matrix @ final_matrix.conj().T)) / original_matrix.shape[0]
+                self._diagnostic_print(f"Matrix fidelity: {fidelity:.10f}")
+                
+                # Check individual partition matrices
+                self._diagnostic_print(f"\n=== Partition Matrix Validation ===")
+                for idx, opt_part in enumerate(optimized_partitions):
+                    if not isinstance(opt_part, SingleQubitPartitionResult):
+                        # Test the best candidate for each partition
+                        for tdx in range(opt_part.topology_count):
+                            if opt_part.cnot_counts[tdx]:
+                                best_idx = np.argmin(opt_part.cnot_counts[tdx])
+                                test_circuit = opt_part.synthesised_circuits[tdx][best_idx]
+                                test_params = opt_part.synthesised_parameters[tdx][best_idx]
+                                try:
+                                    # Remap original circuit to partition space
+                                    qbit_map = opt_part.qubit_map
+                                    remapped_orig = opt_part.original_circuit.Remap_Qbits(qbit_map, opt_part.N)
+                                    
+                                    # Create test circuit with permutations like in synthesis
+                                    P_i, P_o = opt_part.permutations_pairs[tdx][best_idx]
+                                    test_full = Circuit(opt_part.N)
+                                    test_full.add_Permutation(P_i)
+                                    test_full.add_Circuit(opt_part.original_circuit)
+                                    test_full.add_Permutation(P_o)
+                                    orig_matrix = test_full.get_Matrix(opt_part.synthesised_parameters[tdx][best_idx])
+                                    
+                                    # Compare with synthesized
+                                    synth_matrix = test_circuit.get_Matrix(test_params)
+                                    part_error = np.linalg.norm(orig_matrix - synth_matrix, 'fro')
+                                    if part_error > 1e-6:
+                                        self._diagnostic_print(f"  Partition {idx}, topology {tdx}, candidate {best_idx}: error = {part_error:.2e} (P_i={P_i}, P_o={P_o})")
+                                except Exception as e:
+                                    self._diagnostic_print(f"  Partition {idx}, topology {tdx}: validation error - {e}")
+            except Exception as e:
+                self._diagnostic_print(f"\nMatrix validation failed: {e}")
+                import traceback
+                self._diagnostic_print(traceback.format_exc())
+        
         return final_circuit, final_parameters, pi, pi_final
 
     def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
         resolved_partitions = [False] * len(DAG)
         partition_order = []
+        step = 0
+        
+        if self.config.get('diagnostics', False):
+            self._diagnostic_print("\n=== Heuristic Search ===")
+        
         while len(F) != 0:
             scores = []
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
@@ -217,8 +331,25 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
                 for partition_candidate in partition_candidates:
                     score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, DAG)
                     scores.append(score)
+            if len(scores) == 0:
+                break
             min_idx = np.argmin(scores)
             min_partition_candidate = partition_candidates[min_idx]
+            
+            if self.config.get('diagnostics', False):
+                self._diagnostic_print(f"\n  Step {step}:")
+                self._diagnostic_print(f"    Selected partition: {min_partition_candidate.partition_idx}")
+                # Convert pi to list if it's a numpy array
+                pi_list = pi.tolist() if hasattr(pi, 'tolist') else list(pi)
+                self._diagnostic_print(f"    Current pi: {pi_list}")
+                pi_input = min_partition_candidate.transform_pi_input(pi)
+                pi_input_list = pi_input if isinstance(pi_input, list) else pi_input.tolist()
+                self._diagnostic_print(f"    Required pi_input: {pi_input_list}")
+                pi_output = min_partition_candidate.transform_pi_output(pi_input)
+                pi_output_list = pi_output if isinstance(pi_output, list) else pi_output.tolist()
+                self._diagnostic_print(f"    Result pi_output: {pi_output_list}")
+                self._diagnostic_print(f"    Best score: {scores[min_idx]:.4f}, candidates evaluated: {len(scores)}")
+            
             F.remove(min_partition_candidate.partition_idx)
             resolved_partitions[min_partition_candidate.partition_idx] = True
             partition_order.append(permutation_to_cnot_circuit(pi, min_partition_candidate.transform_pi_input(pi)))
@@ -226,6 +357,7 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
             partition_order.append(min_partition_candidate)
             pi = min_partition_candidate.transform_pi_output(pi)
             children = DAG[min_partition_candidate.partition_idx][1]
+            step += 1
             while len(children) != 0:
                 child = children.pop(0)
                 if not resolved_partitions[child] and child not in F:
@@ -242,17 +374,41 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
     def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         final_circuit = Circuit(N)
         final_parameters = []
+        perm_count = 0
+        partition_count = 0
+        
+        if self.config.get('diagnostics', False):
+            self._diagnostic_print("\n=== Circuit Construction ===")
+        
         for part in partition_order:
             if isinstance(part, Circuit):
                 final_circuit.add_Circuit(part)
+                perm_count += 1
+                if self.config.get('diagnostics', False):
+                    self._diagnostic_print(f"  Added permutation circuit ({perm_count} CNOT gates)")
             elif isinstance(part, SingleQubitPartitionResult):
                 final_circuit.add_Circuit(part.circuit)
                 final_parameters.append(part.parameters)
+                partition_count += 1
+                if self.config.get('diagnostics', False):
+                    self._diagnostic_print(f"  Added single-qubit partition {partition_count}, qubit={part.circuit.get_Qbits()}")
             else:
                 part_circ, part_parameters = part.get_final_circuit(optimized_partitions,N)
                 final_circuit.add_Circuit(part_circ)
                 final_parameters.append(part_parameters)
-        final_parameters = np.concatenate(final_parameters,axis=0)
+                partition_count += 1
+                if self.config.get('diagnostics', False):
+                    cnot_count = part_circ.get_Gate_Nums().get('CNOT', 0)
+                    self._diagnostic_print(f"  Added partition {part.partition_idx} (multi-qubit), CNOTs={cnot_count}, qubits={part.involved_qbits}")
+        
+        if final_parameters:
+            final_parameters = np.concatenate(final_parameters,axis=0)
+        else:
+            final_parameters = np.array([])
+        
+        if self.config.get('diagnostics', False):
+            self._diagnostic_print(f"  Total: {perm_count} permutation circuits, {partition_count} partitions")
+        
         return final_circuit, final_parameters
 
     def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, DAG):
@@ -275,7 +431,8 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                         for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx].permutations_pairs[tdx]):
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,optimized_partitions[partition_idx].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx].qubit_map,optimized_partitions[partition_idx].involved_qbits)
                             mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                score_E += min(mini_scores)
+                if mini_scores:
+                    score_E += min(mini_scores)
             else:
                 is_single_qubit = True
                 partition_idx_new = DAG[partition_idx][1]
@@ -294,7 +451,8 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                             for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_new].permutations_pairs[tdx]):
                                 new_cand = PartitionCandidate(partition_idx_new,tdx,pdx,optimized_partitions[partition_idx_new].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_new].qubit_map,optimized_partitions[partition_idx_new].involved_qbits)
                                 mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                    score_E += min(mini_scores)
+                    if mini_scores:
+                        score_E += min(mini_scores)
                 
         for partition_idx in F:
             partition = optimized_partitions[partition_idx]
@@ -305,7 +463,8 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
                         mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-            score_F += min(mini_scores)
+            if mini_scores:
+                score_F += min(mini_scores)
             for partition_idx_E in DAG[partition_idx][1]:
                 if not isinstance(optimized_partitions[partition_idx_E], SingleQubitPartitionResult):
                     if partition_idx_E in E_visited_partitions:
@@ -318,7 +477,8 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                             for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
                                 new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
                                 mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                    score_E += min(mini_scores)
+                    if mini_scores:
+                        score_E += min(mini_scores)
                 else:
                     is_single_qubit = True
                     partition_idx_new = DAG[partition_idx_E][1]
@@ -327,19 +487,31 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                         partition_idx_new = DAG[partition_idx_new[0]][1]
                     if len(partition_idx_new) != 0 and not is_single_qubit:
                         partition_idx_new = partition_idx_new[0]
-                        if partition_idx_E in E_visited_partitions:
+                        if partition_idx_new in E_visited_partitions:
                             continue
-                        E_visited_partitions.append(partition_idx_E)
+                        E_visited_partitions.append(partition_idx_new)
                         mini_scores = []
-                        for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_E].mini_topologies):
+                        for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_new].mini_topologies):
                             topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
                             for topology_candidate in topology_candidates:
-                                for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
-                                    new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
+                                for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_new].permutations_pairs[tdx]):
+                                    new_cand = PartitionCandidate(partition_idx_new,tdx,pdx,optimized_partitions[partition_idx_new].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_new].qubit_map,optimized_partitions[partition_idx_new].involved_qbits)
                                     mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                        score_E += min(mini_scores)
+                        if mini_scores:
+                            score_E += min(mini_scores)
 
-        return 0.2*score_E/len(E_visited_partitions) + score_F/len(F)
+        # Safety check for division by zero
+        if len(E_visited_partitions) == 0:
+            E_score = 0.0
+        else:
+            E_score = 0.2 * score_E / len(E_visited_partitions)
+        
+        if len(F) == 0:
+            F_score = 0.0
+        else:
+            F_score = score_F / len(F)
+        
+        return E_score + F_score
 
     def obtain_partition_candidates(self, F, optimized_partitions):
         partition_candidates = []

From 080faf9173f3c21e188ea7098996e337692eeba1 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Fri, 21 Nov 2025 17:36:29 +0100
Subject: [PATCH 031/232] Remove unneeded diagnostics

---
 examples/decomposition/PartAM_example.py |  52 +----
 examples/decomposition/example_SABRE.py  |  50 ++---
 squander/synthesis/PartAM.py             | 233 ++++++++---------------
 squander/synthesis/PartAM_utils.py       | 200 ++++++++++---------
 4 files changed, 206 insertions(+), 329 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 826a6ae6d..032838aae 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -60,60 +60,12 @@
     circ_Final = Circuit(circ.get_Qbit_Num() )
     output_perm_T = [0]* circ.get_Qbit_Num() 
     for i, j in enumerate(output_perm):
-        output_perm_T[j] = i
-    # Validate permutation inverse calculation
-    if config.get('diagnostics', False):
-        print(f"\n{'='*70}")
-        print(f"Permutation Validation")
-        print(f"{'='*70}")
-        print(f"input_perm (initial pi): {input_perm}")
-        print(f"output_perm (final pi): {output_perm}")
-        
-        # Compute inverse
-        output_perm_T = [0] * circ.get_Qbit_Num()
-        for i, j in enumerate(output_perm):
-            output_perm_T[j] = i
-        print(f"output_perm_T (inverse): {output_perm_T}")
-        
-        # Verify inverse: output_perm_T[output_perm[i]] should equal i
-        test_inverse = [output_perm_T[output_perm[i]] for i in range(len(output_perm))]
-        if test_inverse != list(range(len(output_perm))):
-            print(f"  ERROR: Inverse calculation is WRONG!")
-            print(f"  Expected: {list(range(len(output_perm)))}")
-            print(f"  Got: {test_inverse}")
-        else:
-            print(f"  Inverse verified: OK")
-    
-    if not config.get('diagnostics', False):
-        output_perm_T = [0] * circ.get_Qbit_Num()
-        for i, j in enumerate(output_perm):
-            output_perm_T[j] = i
-    
+        output_perm_T[j] = i        
     circ_Final.add_Permutation(input_perm)
     circ_Final.add_Circuit(circ)
     circ_Final.add_Permutation(output_perm_T)
     
-    # Additional matrix validation in example
-    if config.get('diagnostics', False):
-        try:
-            print(f"\n{'='*70}")
-            print(f"Final Circuit Matrix Validation")
-            print(f"{'='*70}")
-            orig_matrix = circ_orig.get_Matrix(parameters_orig)
-            final_matrix = circ_Final.get_Matrix(params)
-            matrix_error = np.linalg.norm(orig_matrix - final_matrix, 'fro')
-            print(f"Original vs Final circuit error: {matrix_error:.2e}")
-            
-            # Test without output permutation
-            circ_test = Circuit(circ.get_Qbit_Num())
-            circ_test.add_Permutation(input_perm)
-            circ_test.add_Circuit(circ)
-            test_matrix = circ_test.get_Matrix(params)
-            test_error = np.linalg.norm(orig_matrix - test_matrix, 'fro')
-            print(f"Without output perm error: {test_error:.2e}")
-        except Exception as e:
-            print(f"Matrix validation error: {e}")
-    
+    # Additional matrix validation in example    
     PartAM_state = initial_state.copy()
     circ_Final.apply_to(params, PartAM_state)
     state_error = 1 - abs(np.vdot(PartAM_state, original_state))
diff --git a/examples/decomposition/example_SABRE.py b/examples/decomposition/example_SABRE.py
index aa17184c4..68a235259 100644
--- a/examples/decomposition/example_SABRE.py
+++ b/examples/decomposition/example_SABRE.py
@@ -5,11 +5,6 @@
 
 from qiskit import transpile
 from qiskit import QuantumCircuit
-from qiskit.circuit import CircuitInstruction
-from qiskit.circuit.library import PermutationGate
-from qiskit_aer import AerSimulator
-from qiskit.quantum_info import Operator
-from qiskit import QuantumRegister, ClassicalRegister
 import numpy as np
 parameters = np.array([])
 
@@ -40,16 +35,16 @@
 print("INITIAL CIRCUIT:")
 #print( circuit_qiskit )
 print("mapping (q -> Q):", pi)
-print("Final mapping:", final_pi)
 qubits = list(range(N))
-pi_map = list(np.array(sabre.get_inverse_pi(pi)))
+pi_map = list(np.array(sabre.get_inverse_pi(final_pi)))
+print("Final mapping:", final_pi)
 final_circuit = Circuit(N)
-final_circuit.add_Permutation(list(pi_map))
+final_circuit.add_Permutation(list(pi)) 
 final_circuit.add_Circuit(Squander_remapped_circuit)
-final_circuit.add_Permutation(list(final_pi))
+final_circuit.add_Permutation(list(pi_map))
 Qiskit_circuit = Qiskit_IO.get_Qiskit_Circuit( final_circuit.get_Flat_Circuit(), parameters_remapped_circuit )
 print("CIRCUIT MAPPED WITH SABRE:")
-print( Qiskit_circuit )
+#print( Qiskit_circuit )
 print("SABRE SWAP COUNT:", swap_count)
 # defining the qubit topology/connectivity for Squander
 coupling_map = [
@@ -63,27 +58,14 @@
 print("CIRCUIT MAPPED WITH QISKIT:")
 #print( Qiskit_circuit_mapped )
 print("QISKIT SWAP COUNT:",  dict(Qiskit_circuit_mapped.count_ops())['swap'])
-
-# test the generated squander circuits
-#matrix_size = 1 << Squander_initial_circuit.get_Qbit_Num()
-#unitary_squander_initial = utils.get_unitary_from_qiskit_circuit_operator(circuit_qiskit)
-
-#unitary_squander_remapped_circuit = np.eye( 1 << Squander_initial_circuit.get_Qbit_Num(), dtype=np.complex128 )
-#Squander_remapped_circuit.apply_to( parameters_remapped_circuit, unitary_squander_remapped_circuit)
-"""
-unitary_squander_remapped_circuit = utils.get_unitary_from_qiskit_circuit_operator(Qiskit_circuit)
-
-
-product_matrix = np.dot(unitary_squander_initial.conj().T, unitary_squander_remapped_circuit)
-phase = np.angle(product_matrix[0,0])
-product_matrix = product_matrix*np.exp(-1j*phase)
-
-    
-product_matrix = np.eye(matrix_size)*2 - product_matrix - product_matrix.conj().T
-
-# the error of the decomposition
-decomposition_error =  (np.real(np.trace(product_matrix)))/2
-       
-print('The error of the decomposition is ' + str(decomposition_error))
-
-"""
\ No newline at end of file
+num_qubits = final_circuit.get_Qbit_Num() 
+matrix_size = 1 << num_qubits 
+initial_state_real = np.random.uniform(-1.0,1.0, (matrix_size,) )
+initial_state_imag = np.random.uniform(-1.0,1.0, (matrix_size,) )
+initial_state = initial_state_real + initial_state_imag*1j
+initial_state = initial_state/np.linalg.norm(initial_state)
+original_state = initial_state.copy()
+Squander_initial_circuit.apply_to(parameters_initial,original_state)
+SABRE_state = initial_state.copy()
+final_circuit.apply_to(parameters_remapped_circuit,SABRE_state)
+print(f"ERROR: {1-abs(np.vdot(SABRE_state,original_state))}")
\ No newline at end of file
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 7f38589a2..0c15f51de 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -26,7 +26,7 @@
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, 
 SingleQubitPartitionResult, PartitionSynthesisResult, 
-PartitionCandidate, permutation_to_cnot_circuit, min_cnots_between_permutations, check_circuit_compatibility)
+PartitionCandidate, check_circuit_compatibility, construct_swap_circuit)
 
 class qgd_Partition_Aware_Mapping:
 
@@ -239,10 +239,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         self._diagnostic_print(f"\nSynthesized {len(optimized_partitions)} partitions")
         
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
-        if self.config.get('diagnostics', False):
-            self._diagnostic_print("\n=== DAG Construction ===")
-            for idx in range(len(DAG)):
-                self._diagnostic_print(f"  Partition {idx}: parents={IDAG[idx][1]}, children={DAG[idx][1]}")
+        sDAG = self.construct_sDAG(optimized_partitions)
         
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
@@ -252,7 +249,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
         self._diagnostic_print(f"Initial front set (F): {F}")
         
-        partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,optimized_partitions,D)
+        partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,D, sDAG)
         pi_final_list = pi_final.tolist() if hasattr(pi_final, 'tolist') else list(pi_final)
         self._diagnostic_print(f"\nFinal permutation (pi_final): {pi_final_list}")
         self._diagnostic_print(f"Partition order length: {len(partition_order)}")
@@ -260,63 +257,9 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
         self._diagnostic_print(f"\nFinal circuit: {len(final_circuit.get_Gates())} gates, {len(final_parameters)} parameters")
         
-        if not check_circuit_compatibility(final_circuit, self.topology):
-            raise Exception("The final circuit is not compatible with the topology.")
-        
-        # Matrix-level validation
-        if self.config.get('diagnostics', False):
-            try:
-                original_matrix = circ.get_Matrix(orig_parameters)
-                final_matrix = final_circuit.get_Matrix(final_parameters)
-                matrix_error = np.linalg.norm(original_matrix - final_matrix, 'fro')
-                self._diagnostic_print(f"\n=== Matrix Validation ===")
-                self._diagnostic_print(f"Original circuit matrix: {original_matrix.shape}")
-                self._diagnostic_print(f"Final circuit matrix: {final_matrix.shape}")
-                self._diagnostic_print(f"Matrix Frobenius norm error: {matrix_error:.2e}")
-                
-                # Check if matrices match (accounting for permutation)
-                # The final circuit should match original up to permutation
-                fidelity = abs(np.trace(original_matrix @ final_matrix.conj().T)) / original_matrix.shape[0]
-                self._diagnostic_print(f"Matrix fidelity: {fidelity:.10f}")
-                
-                # Check individual partition matrices
-                self._diagnostic_print(f"\n=== Partition Matrix Validation ===")
-                for idx, opt_part in enumerate(optimized_partitions):
-                    if not isinstance(opt_part, SingleQubitPartitionResult):
-                        # Test the best candidate for each partition
-                        for tdx in range(opt_part.topology_count):
-                            if opt_part.cnot_counts[tdx]:
-                                best_idx = np.argmin(opt_part.cnot_counts[tdx])
-                                test_circuit = opt_part.synthesised_circuits[tdx][best_idx]
-                                test_params = opt_part.synthesised_parameters[tdx][best_idx]
-                                try:
-                                    # Remap original circuit to partition space
-                                    qbit_map = opt_part.qubit_map
-                                    remapped_orig = opt_part.original_circuit.Remap_Qbits(qbit_map, opt_part.N)
-                                    
-                                    # Create test circuit with permutations like in synthesis
-                                    P_i, P_o = opt_part.permutations_pairs[tdx][best_idx]
-                                    test_full = Circuit(opt_part.N)
-                                    test_full.add_Permutation(P_i)
-                                    test_full.add_Circuit(opt_part.original_circuit)
-                                    test_full.add_Permutation(P_o)
-                                    orig_matrix = test_full.get_Matrix(opt_part.synthesised_parameters[tdx][best_idx])
-                                    
-                                    # Compare with synthesized
-                                    synth_matrix = test_circuit.get_Matrix(test_params)
-                                    part_error = np.linalg.norm(orig_matrix - synth_matrix, 'fro')
-                                    if part_error > 1e-6:
-                                        self._diagnostic_print(f"  Partition {idx}, topology {tdx}, candidate {best_idx}: error = {part_error:.2e} (P_i={P_i}, P_o={P_o})")
-                                except Exception as e:
-                                    self._diagnostic_print(f"  Partition {idx}, topology {tdx}: validation error - {e}")
-            except Exception as e:
-                self._diagnostic_print(f"\nMatrix validation failed: {e}")
-                import traceback
-                self._diagnostic_print(traceback.format_exc())
-        
         return final_circuit, final_parameters, pi, pi_final
 
-    def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
+    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
@@ -329,7 +272,7 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
             if len(partition_candidates) != 0:
                 for partition_candidate in partition_candidates:
-                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, DAG)
+                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, sDAG, D)
                     scores.append(score)
             if len(scores) == 0:
                 break
@@ -342,31 +285,28 @@ def Heuristic_Search(self, F, pi, DAG, optimized_partitions, D):
                 # Convert pi to list if it's a numpy array
                 pi_list = pi.tolist() if hasattr(pi, 'tolist') else list(pi)
                 self._diagnostic_print(f"    Current pi: {pi_list}")
-                pi_input = min_partition_candidate.transform_pi_input(pi)
-                pi_input_list = pi_input if isinstance(pi_input, list) else pi_input.tolist()
-                self._diagnostic_print(f"    Required pi_input: {pi_input_list}")
-                pi_output = min_partition_candidate.transform_pi_output(pi_input)
-                pi_output_list = pi_output if isinstance(pi_output, list) else pi_output.tolist()
-                self._diagnostic_print(f"    Result pi_output: {pi_output_list}")
                 self._diagnostic_print(f"    Best score: {scores[min_idx]:.4f}, candidates evaluated: {len(scores)}")
             
             F.remove(min_partition_candidate.partition_idx)
             resolved_partitions[min_partition_candidate.partition_idx] = True
-            partition_order.append(permutation_to_cnot_circuit(pi, min_partition_candidate.transform_pi_input(pi)))
-            pi = min_partition_candidate.transform_pi_input(pi)
+            swap_order, pi = min_partition_candidate.transform_pi(pi, D)
+            if len(swap_order)!=0:
+                partition_order.append(construct_swap_circuit(swap_order, len(pi)))
             partition_order.append(min_partition_candidate)
-            pi = min_partition_candidate.transform_pi_output(pi)
-            children = DAG[min_partition_candidate.partition_idx][1]
+            children = DAG[min_partition_candidate.partition_idx]
             step += 1
             while len(children) != 0:
                 child = children.pop(0)
-                if not resolved_partitions[child] and child not in F:
+                parents_resolved = True
+                for parent in IDAG[child]:
+                    parents_resolved *= resolved_partitions[parent]
+                if (not resolved_partitions[child] and child not in F) and parents_resolved:
                     if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
                         child_partition = optimized_partitions[child]
                         qubit = child_partition.circuit.get_Qbits()[0]
                         child_partition.circuit.map_circuit({qubit: pi[qubit]})
                         partition_order.append(child_partition)
-                        children.extend(DAG[child][1])
+                        children.extend(DAG[child])
                     else:
                         F.append(child)
         return partition_order, pi
@@ -411,49 +351,28 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         
         return final_circuit, final_parameters
 
-    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, DAG):
+    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, sDAG, D):
         score_F = 0
         score_E = 0
         E_visited_partitions = []
-        input_perm = partition_candidate.transform_pi_input(pi)
-        output_perm = partition_candidate.transform_pi_output(input_perm)
-        score_F += min_cnots_between_permutations(pi,input_perm)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D)
+        score_F += len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
-        for partition_idx in DAG[partition_candidate.partition_idx][1]:
-            if not isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
-                if partition_idx in E_visited_partitions:
-                    continue
-                E_visited_partitions.append(partition_idx)
-                mini_scores = []
-                for tdx, mini_topology in enumerate(optimized_partitions[partition_idx].mini_topologies):
-                    topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
-                    for topology_candidate in topology_candidates:
-                        for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx].permutations_pairs[tdx]):
-                            new_cand = PartitionCandidate(partition_idx,tdx,pdx,optimized_partitions[partition_idx].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx].qubit_map,optimized_partitions[partition_idx].involved_qbits)
-                            mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                if mini_scores:
-                    score_E += min(mini_scores)
-            else:
-                is_single_qubit = True
-                partition_idx_new = DAG[partition_idx][1]
-                while is_single_qubit and len(partition_idx_new) != 0:
-                    is_single_qubit = isinstance(optimized_partitions[partition_idx_new[0]], SingleQubitPartitionResult)
-                    partition_idx_new = DAG[partition_idx_new[0]][1]
-                if len(partition_idx_new) != 0 and not is_single_qubit:
-                    partition_idx_new = partition_idx_new[0]
-                    if partition_idx_new in E_visited_partitions:
-                        continue
-                    E_visited_partitions.append(partition_idx_new)
-                    mini_scores = []
-                    for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_new].mini_topologies):
-                        topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
-                        for topology_candidate in topology_candidates:
-                            for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_new].permutations_pairs[tdx]):
-                                new_cand = PartitionCandidate(partition_idx_new,tdx,pdx,optimized_partitions[partition_idx_new].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_new].qubit_map,optimized_partitions[partition_idx_new].involved_qbits)
-                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                    if mini_scores:
-                        score_E += min(mini_scores)
-                
+
+        for partition_idx in sDAG[partition_candidate.partition_idx]:
+            if partition_idx in E_visited_partitions:
+                continue
+            E_visited_partitions.append(partition_idx)
+            mini_scores = []
+            for tdx, mini_topology in enumerate(optimized_partitions[partition_idx].mini_topologies):
+                topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                for topology_candidate in topology_candidates:
+                    for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx].permutations_pairs[tdx]):
+                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,optimized_partitions[partition_idx].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx].qubit_map,optimized_partitions[partition_idx].involved_qbits)
+                        mini_scores.append(len(new_cand.transform_pi(output_perm,D)[0])*3+len(new_cand.circuit_structure))
+            if mini_scores:
+                score_E += min(mini_scores)
+
         for partition_idx in F:
             partition = optimized_partitions[partition_idx]
             mini_scores = []
@@ -462,44 +381,23 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                        mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
+                        mini_scores.append(len(new_cand.transform_pi(output_perm,D)[0])*3+len(new_cand.circuit_structure))
             if mini_scores:
                 score_F += min(mini_scores)
-            for partition_idx_E in DAG[partition_idx][1]:
-                if not isinstance(optimized_partitions[partition_idx_E], SingleQubitPartitionResult):
-                    if partition_idx_E in E_visited_partitions:
-                        continue
-                    E_visited_partitions.append(partition_idx_E)
-                    mini_scores = []
-                    for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_E].mini_topologies):
-                        topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
-                        for topology_candidate in topology_candidates:
-                            for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
-                                new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
-                                mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                    if mini_scores:
-                        score_E += min(mini_scores)
-                else:
-                    is_single_qubit = True
-                    partition_idx_new = DAG[partition_idx_E][1]
-                    while is_single_qubit and len(partition_idx_new) != 0:
-                        is_single_qubit = isinstance(optimized_partitions[partition_idx_new[0]], SingleQubitPartitionResult)
-                        partition_idx_new = DAG[partition_idx_new[0]][1]
-                    if len(partition_idx_new) != 0 and not is_single_qubit:
-                        partition_idx_new = partition_idx_new[0]
-                        if partition_idx_new in E_visited_partitions:
-                            continue
-                        E_visited_partitions.append(partition_idx_new)
-                        mini_scores = []
-                        for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_new].mini_topologies):
-                            topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
-                            for topology_candidate in topology_candidates:
-                                for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_new].permutations_pairs[tdx]):
-                                    new_cand = PartitionCandidate(partition_idx_new,tdx,pdx,optimized_partitions[partition_idx_new].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_new].qubit_map,optimized_partitions[partition_idx_new].involved_qbits)
-                                    mini_scores.append(min_cnots_between_permutations(output_perm,new_cand.transform_pi_input(output_perm))+len(new_cand.circuit_structure))
-                        if mini_scores:
-                            score_E += min(mini_scores)
 
+            for partition_idx_E in sDAG[partition_idx]:
+                if partition_idx_E in E_visited_partitions:
+                    continue
+                E_visited_partitions.append(partition_idx_E)
+                mini_scores = []
+                for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_E].mini_topologies):
+                    topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                    for topology_candidate in topology_candidates:
+                        for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
+                            new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
+                            mini_scores.append(len(new_cand.transform_pi(output_perm,D)[0])*3+len(new_cand.circuit_structure))
+                if mini_scores:
+                    score_E += min(mini_scores)
         # Safety check for division by zero
         if len(E_visited_partitions) == 0:
             E_score = 0.0
@@ -528,9 +426,9 @@ def get_initial_layer(self, IDAG, N, optimized_partitions):
         initial_layer = []
         active_qbits = list(range(N))
         for idx in range(len(IDAG)):
-            if len(IDAG[idx][1]) == 0:
+            if len(IDAG[idx]) == 0:
                 initial_layer.append(idx)
-                for qbit in optimized_partitions[IDAG[idx][0]].involved_qbits:
+                for qbit in optimized_partitions[idx].involved_qbits:
                     active_qbits.remove(qbit)
             if len(active_qbits) == 0:
                 break
@@ -564,9 +462,38 @@ def construct_DAG_and_IDAG(self, optimized_partitions):
                         involved_qbits_current.remove(intersection_qbit)
                     if len(involved_qbits_current) == 0:
                         break
-            DAG.append([idx, children])
-            IDAG.append([idx, parents])
+            DAG.append(children)
+            IDAG.append(parents)
         return DAG, IDAG
+    
+    def construct_sDAG(self, optimized_partitions):
+        sDAG = []
+        
+        for idx in range(len(optimized_partitions)):
+            # Skip single-qubit partitions
+            if len(optimized_partitions[idx].involved_qbits) <= 1:
+                continue
+                
+            children = []
+            
+            if idx != len(optimized_partitions)-1:
+                involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
+                for next_idx in range(idx+1, len(optimized_partitions)):
+                    # Skip single-qubit partitions when searching for children
+                    if len(optimized_partitions[next_idx].involved_qbits) <= 1:
+                        continue
+                        
+                    involved_qbits_next = optimized_partitions[next_idx].involved_qbits
+                    intersection = [i for i in involved_qbits_current if i in involved_qbits_next]
+                    if len(intersection) > 0:
+                        children.append(next_idx)
+                        for intersection_qbit in intersection:
+                            involved_qbits_current.remove(intersection_qbit)
+                    if len(involved_qbits_current) == 0:
+                        break                        
+            sDAG.append(children)
+            
+        return sDAG
 
     def compute_distances_bfs(self, N):
         """BFS distance computation - faster than Floyd-Warshall."""
@@ -592,7 +519,7 @@ def compute_distances_bfs(self, N):
                         D[start][neighbor] = dist + 1
                         queue.append((neighbor, dist + 1))
         
-        return D*3 #multiply by 3 to make it CNOT cost instead of SWAP cost
+        return D #multiply by 3 to make it CNOT cost instead of SWAP cost
 
     def _compute_smart_initial_layout(self, circuit, N, D):
 
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index e3d46103e..2bfa9501a 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -2,6 +2,8 @@
 from typing import List, Tuple, Set, FrozenSet
 from itertools import permutations
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+import heapq
+
 def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
     adj_list = {}
     for u, v in edges:
@@ -107,90 +109,99 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
             return mapping
     return {}
 
-def min_cnots_between_permutations(A, B):
-    n = len(A)
-    inv_B = [0] * n
-    for pos, qubit in enumerate(B):
-        inv_B[qubit] = pos
+def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
+    """
+    Find SWAP sequence to route subset of virtual qubits to targets.
     
-    P = [inv_B[A[i]] for i in range(n)]
-    visited = [False] * n
-    total_cnots = 0
+    Args:
+        pi_A: List [Q0, Q1, ...] where pi_A[q] = Q (complete initial mapping)
+        pi_B_dict: Dict {q: Q} specifying only qubits that need routing
+        dist_matrix: Pre-computed distance matrix dist[i][j] between physical qubits
     
-    for i in range(n):
-        if not visited[i]:
-            cycle_len = 0
-            j = i
-            while not visited[j]:
-                visited[j] = True
-                j = P[j]
-                cycle_len += 1
-            if cycle_len >= 2:
-                total_cnots += 2 * cycle_len - 3
+    Returns:
+        swaps: List of (i, j) SWAP operations on adjacent physical qubits
+        final_permutation: List showing final virtual→physical mapping
+    """
+    n = len(pi_A)
     
-    return total_cnots
-
-def permutation_to_cnot_circuit(A, B):
-    n = len(A)
-    inv_B = {qubit: pos for pos, qubit in enumerate(B)}
-    P = [inv_B[A[i]] for i in range(n)]
+    # Build adjacency list from distance matrix
+    adj = [set() for _ in range(n)]
+    for i in range(n):
+        for j in range(i+1, n):
+            if dist_matrix[i][j] == 1:  # Adjacent in topology
+                adj[i].add(j)
+                adj[j].add(i)
     
-    visited = [False] * n
-    cnot_circuit = Circuit(n)
+    # Use physical-to-virtual representation for easier SWAP handling
+    # state[P] = q means physical qubit P contains virtual qubit q
+    def to_phys_to_virt(virt_to_phys):
+        """Convert virtual→physical list to physical→virtual list"""
+        p2v = [0] * n
+        for q in range(n):
+            P = virt_to_phys[q]
+            p2v[P] = q
+        return p2v
     
-    for i in range(n):
-        if not visited[i]:
-            # Extract cycle
-            cycle = []
-            j = i
-            while not visited[j]:
-                visited[j] = True
-                cycle.append(j)
-                j = P[j]
-            
-            # Convert cycle to CNOTs
-            k = len(cycle)
-            if k == 2:
-                cnot_circuit.add_CNOT(cycle[1], cycle[0])
-            elif k >= 3:
-                # Forward pass
-                for idx in range(k - 1):
-                    cnot_circuit.add_CNOT(cycle[idx + 1], cycle[idx])
-                # Backward pass
-                for idx in range(k - 2, 0, -1):
-                    cnot_circuit.add_CNOT(cycle[idx + 1], cycle[idx])
+    def to_virt_to_phys(phys_to_virt):
+        """Convert physical→virtual list to virtual→physical list"""
+        v2p = [0] * n
+        for P in range(n):
+            q = phys_to_virt[P]
+            v2p[q] = P
+        return v2p
     
-    return cnot_circuit
-
-def find_best_permutation_with_constraints(A, constraints, strategy='greedy'):
-    n = len(A)
-    B = [None] * n
-    used_qubits = set()
+    start_state = tuple(to_phys_to_virt(pi_A))
     
-    # Apply constraints
-    for pos, qubit in constraints.items():
-        B[pos] = qubit
-        used_qubits.add(qubit)
+    def is_goal(state):
+        """Check if target qubits are in correct physical positions"""
+        for q, target_P in pi_B_dict.items():
+            if state[target_P] != q:  # Physical position target_P should contain virtual q
+                return False
+        return True
     
-    # Fill unconstrained positions
-    available_qubits = [q for q in range(n) if q not in used_qubits]
-    unconstrained_positions = [i for i in range(n) if B[i] is None]
-
-    for pos in unconstrained_positions:
-        if A[pos] in available_qubits:
-            B[pos] = A[pos]
-            available_qubits.remove(A[pos])
+    def heuristic(state):
+        """Lower bound: sum of distances for qubits needing routing"""
+        total = 0
+        for q, target_P in pi_B_dict.items():
+            # Find where virtual qubit q currently is
+            current_P = state.index(q)
+            total += dist_matrix[current_P][target_P]
+        return total // 2  # Optimistic: each SWAP helps 2 qubits
     
-    # Fill remaining positions with remaining qubits
-    j = 0
-    for pos in unconstrained_positions:
-        if B[pos] is None:
-            B[pos] = available_qubits[j]
-            j += 1
+    # A* search
+    heap = [(heuristic(start_state), 0, start_state, [])]
+    visited = {start_state: 0}
     
+    while heap:
+        f, g, current, path = heapq.heappop(heap)
+        
+        if is_goal(current):
+            # Convert final state back to virtual→physical mapping
+            final_permutation = to_virt_to_phys(current)
+            return path, final_permutation
+        
+        if visited.get(current, float('inf')) < g:
+            continue
+        
+        # Try all valid SWAPs on adjacent physical qubits
+        current_list = list(current)
+        for i in range(n):
+            for j in adj[i]:
+                if i < j:  # Avoid duplicate (i,j) and (j,i)
+                    # SWAP physical qubits i and j
+                    new_state = current_list[:]
+                    new_state[i], new_state[j] = new_state[j], new_state[i]
+                    new_state_tuple = tuple(new_state)
+                    
+                    new_g = g + 1
+                    
+                    if visited.get(new_state_tuple, float('inf')) > new_g:
+                        visited[new_state_tuple] = new_g
+                        new_f = new_g + heuristic(new_state_tuple)
+                        new_path = path + [(i, j)]
+                        heapq.heappush(heap, (new_f, new_g, new_state_tuple, new_path))
     
-    return B
-
+    return None, None  # No solution found
 
 def extract_subtopology(involved_qbits, qbit_map, config ):
     mini_topology = []
@@ -266,12 +277,9 @@ def get_original_circuit_structure(self):
     def get_partition_synthesis_score(self):
         score = 0
         for topology_idx in range(self.topology_count):
-            cnot_count_topology = np.mean(self.cnot_counts[topology_idx])*0.1 + np.min(self.cnot_counts[topology_idx])*0.9
-            if len(self.mini_topologies[topology_idx]) == self.N*(self.N-1)/2:
-                score += cnot_count_topology*0.3/self.topology_count
-            else:
-                score += cnot_count_topology*0.7/self.topology_count
-        return score 
+            cnot_count_topology = np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
+            score += cnot_count_topology/self.topology_count
+        return score
 
 class PartitionCandidate:
     
@@ -299,22 +307,22 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi_input(self, pi):
-        #Q->q
-        qbit_map_swapped = {self.node_mapping[self.P_i.index(v)]: k for k, v in self.qbit_map.items()}
-        return find_best_permutation_with_constraints(pi, qbit_map_swapped)
-
-    def transform_pi_output(self, pi):
-        #Q->q
-        qbit_map_swapped = {self.node_mapping[self.P_o.index(v)]: k for k, v in self.qbit_map.items()}
-        return find_best_permutation_with_constraints(pi, qbit_map_swapped)
-
+    def transform_pi(self, pi, D):
+        qbit_map_input = {k : self.node_mapping[self.P_i[v]] for k,v in self.qbit_map.items()}
+        print(qbit_map_input)
+        swaps, pi_init = find_constrained_swaps_partial(pi, qbit_map_input,D)
+        print(swaps, pi_init)
+        pi_output = pi_init.copy()
+        node_mapping_reversed = {v : k for k,v in self.node_mapping.items()}
+        for v in self.node_mapping.values():
+            pi_output[pi_init.index(v)] = self.node_mapping[self.P_o[node_mapping_reversed[v]]]
+        return swaps, pi_output
+    
     def get_final_circuit(self,optimized_partitions,N):
         partition = optimized_partitions[self.partition_idx]
         part_parameters = partition.synthesised_parameters[self.topology_idx][self.permutation_idx]
         part_circuit = partition.synthesised_circuits[self.topology_idx][self.permutation_idx]
-        qbit_map_swapped = {v : self.node_mapping[self.P_i.index(v)] for k, v in self.qbit_map.items()}
-        part_circuit = part_circuit.Remap_Qbits(qbit_map_swapped,N)
+        part_circuit = part_circuit.Remap_Qbits(self.node_mapping, N)
         return part_circuit, part_parameters
 
 def check_circuit_compatibility(circuit: Circuit, topology):
@@ -340,4 +348,12 @@ def check_circuit_compatibility(circuit: Circuit, topology):
     for qubits in circuit_topology:
         if qubits not in topology and qubits[::-1] not in topology:
             return False
-    return True
\ No newline at end of file
+    return True
+
+def construct_swap_circuit(swap_order, N):
+    swap_circ = Circuit(N)
+    for swap in swap_order:
+        swap_circ.add_CNOT(swap[0],swap[1])
+        swap_circ.add_CNOT(swap[1],swap[0])
+        swap_circ.add_CNOT(swap[0],swap[1])
+    return swap_circ
\ No newline at end of file

From f4ba3756370c316d2a0acfd62cbaf214c031ce28 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 27 Nov 2025 15:29:29 +0100
Subject: [PATCH 032/232] Fix error....

---
 examples/decomposition/PartAM_example.py |  9 ++++++--
 squander/synthesis/PartAM.py             |  8 +++----
 squander/synthesis/PartAM_utils.py       | 27 ++++++++++++++++++------
 3 files changed, 31 insertions(+), 13 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 032838aae..13f52710d 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -61,14 +61,19 @@
     output_perm_T = [0]* circ.get_Qbit_Num() 
     for i, j in enumerate(output_perm):
         output_perm_T[j] = i        
-    circ_Final.add_Permutation(input_perm)
+    # Convert numpy arrays/ints to plain Python lists for add_Permutation
+    input_perm_list = [int(x) for x in input_perm]
+    circ_Final.add_Permutation(input_perm_list)
     circ_Final.add_Circuit(circ)
     circ_Final.add_Permutation(output_perm_T)
     
-    # Additional matrix validation in example    
+    # Additional matrix validation in example     
     PartAM_state = initial_state.copy()
     circ_Final.apply_to(params, PartAM_state)
     state_error = 1 - abs(np.vdot(PartAM_state, original_state))
+    print(Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params))
+    print(f"PartAM_state probability: {np.abs(PartAM_state)**2}")
+    print(f"original_state probability: {np.abs(original_state)**2}")
     print(f"\n{'='*70}")
     print(f"State Vector Validation")
     print(f"{'='*70}")
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 0c15f51de..a2ececf59 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -70,18 +70,18 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
                 P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
                 for P_i in perumations_all:
                     Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(P_i)
+                    Partition_circuit_tmp.add_Permutation(list(P_i))  # Must convert tuple to list
                     Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(P_o_initial)
+                    Partition_circuit_tmp.add_Permutation(list(P_o_initial))  # Must convert tuple to list
                     synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
                     result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
 
                 P_i_best, _ = result.get_best_result(topology_idx)[0]
                 for P_o in perumations_all:
                     Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(P_i_best)
+                    Partition_circuit_tmp.add_Permutation(list(P_i_best))  # Must convert tuple to list
                     Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(P_o)
+                    Partition_circuit_tmp.add_Permutation(list(P_o))  # Must convert tuple to list
                     synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
                     result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
         else:
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 2bfa9501a..96b7ca57f 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -308,14 +308,27 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
     def transform_pi(self, pi, D):
-        qbit_map_input = {k : self.node_mapping[self.P_i[v]] for k,v in self.qbit_map.items()}
-        print(qbit_map_input)
-        swaps, pi_init = find_constrained_swaps_partial(pi, qbit_map_input,D)
-        print(swaps, pi_init)
+        # Fixed: Use P_i^{-1} instead of P_i for input routing
+        # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
+        # For Original to see logical qubit q* at partition position q*, we need:
+        # - After P_i, position q* should have logical qubit q*'s data
+        # - Before P_i (= input to S), position P_i^{-1}[q*] should have logical qubit q*'s data
+        # So we route logical qubit k (with qbit_map[k] = q*) to partition position P_i^{-1}[q*]
+        P_i_inv = [self.P_i.index(i) for i in range(len(self.P_i))]  # Compute inverse
+        qbit_map_input = {k : self.node_mapping[P_i_inv[v]] for k,v in self.qbit_map.items()}
+        # Convert pi to plain Python list of ints (may contain np.int64)
+        pi_list = [int(x) for x in pi]
+        swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+        
         pi_output = pi_init.copy()
-        node_mapping_reversed = {v : k for k,v in self.node_mapping.items()}
-        for v in self.node_mapping.values():
-            pi_output[pi_init.index(v)] = self.node_mapping[self.P_o[node_mapping_reversed[v]]]
+        # Fixed: P_o should be indexed by partition virtual index q*, not physical index Q*
+        # After the circuit, logical qubit k with qbit_map[k] = q* ends up at 
+        # physical position node_mapping[P_o[q*]]
+        qbit_map_inverse = {v: k for k, v in self.qbit_map.items()}
+        for q_star in range(len(self.P_o)):
+            if q_star in qbit_map_inverse:
+                k = qbit_map_inverse[q_star]
+                pi_output[k] = self.node_mapping[self.P_o[q_star]]
         return swaps, pi_output
     
     def get_final_circuit(self,optimized_partitions,N):

From 87dfee607cff656d9385cf1e72eed724a1037841 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 27 Nov 2025 16:14:42 +0100
Subject: [PATCH 033/232] Add in caching and progressbar

---
 squander/synthesis/PartAM.py       | 319 +++++++++++++++++++++--------
 squander/synthesis/PartAM_utils.py |  58 +++++-
 2 files changed, 285 insertions(+), 92 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index a2ececf59..2c113b0dc 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -24,10 +24,121 @@
 import numpy as np
 
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
-from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, 
+from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, get_canonical_form,
 SingleQubitPartitionResult, PartitionSynthesisResult, 
 PartitionCandidate, check_circuit_compatibility, construct_swap_circuit)
 
+def _score_partition_candidate_helper(partition_candidate, F, pi, optimized_partitions, sDAG, D, swap_cache, topology_cache, topology):
+    """
+    Helper function for parallel candidate scoring.
+    This is a module-level function to enable pickling for multiprocessing.
+    """
+    score_F = 0
+    score_E = 0
+    E_visited_partitions = set()
+    
+    # Use swap cache if provided
+    swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
+    score_F += len(swaps)*3
+    score_F += len(partition_candidate.circuit_structure)
+
+    # Cache for transform_pi results to avoid redundant computation
+    transform_cache = {}
+
+    # Helper function to get topology candidates
+    def get_topology_candidates_cached(mini_topology):
+        if topology_cache is not None:
+            from squander.synthesis.PartAM_utils import get_canonical_form
+            target_qubits = set()
+            for u, v in mini_topology:
+                target_qubits.add(u)
+                target_qubits.add(v)
+            if target_qubits:
+                canonical_key = get_canonical_form(target_qubits, mini_topology)
+                if canonical_key in topology_cache:
+                    return topology_cache[canonical_key]
+                else:
+                    candidates = get_subtopologies_of_type(topology, mini_topology)
+                    topology_cache[canonical_key] = candidates
+                    return candidates
+        return get_subtopologies_of_type(topology, mini_topology)
+
+    for partition_idx in sDAG[partition_candidate.partition_idx]:
+        if partition_idx in E_visited_partitions:
+            continue
+        E_visited_partitions.add(partition_idx)
+        mini_scores = []
+        partition_result = optimized_partitions[partition_idx]
+        for tdx, mini_topology in enumerate(partition_result.mini_topologies):
+            if hasattr(partition_result, 'get_topology_candidates'):
+                topology_candidates = partition_result.get_topology_candidates(tdx)
+            else:
+                topology_candidates = get_topology_candidates_cached(mini_topology)
+            for topology_candidate in topology_candidates:
+                for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
+                    cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                    if cache_key not in transform_cache:
+                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
+                        swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
+                        transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                    mini_scores.append(transform_cache[cache_key])
+        if mini_scores:
+            score_E += min(mini_scores)
+
+    for partition_idx in F:
+        partition = optimized_partitions[partition_idx]
+        mini_scores = []
+        for tdx, mini_topology in enumerate(partition.mini_topologies):
+            if hasattr(partition, 'get_topology_candidates'):
+                topology_candidates = partition.get_topology_candidates(tdx)
+            else:
+                topology_candidates = get_topology_candidates_cached(mini_topology)
+            for topology_candidate in topology_candidates:
+                for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
+                    cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                    if cache_key not in transform_cache:
+                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
+                        swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
+                        transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                    mini_scores.append(transform_cache[cache_key])
+        if mini_scores:
+            score_F += min(mini_scores)
+
+        for partition_idx_E in sDAG[partition_idx]:
+            if partition_idx_E in E_visited_partitions:
+                continue
+            E_visited_partitions.add(partition_idx_E)
+            mini_scores = []
+            partition_result_E = optimized_partitions[partition_idx_E]
+            for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
+                if hasattr(partition_result_E, 'get_topology_candidates'):
+                    topology_candidates = partition_result_E.get_topology_candidates(tdx)
+                else:
+                    topology_candidates = get_topology_candidates_cached(mini_topology)
+                for topology_candidate in topology_candidates:
+                    for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
+                        cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                        if cache_key not in transform_cache:
+                            new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
+                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
+                            transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                        mini_scores.append(transform_cache[cache_key])
+            if mini_scores:
+                score_E += min(mini_scores)
+    
+    # Safety check for division by zero
+    if len(E_visited_partitions) == 0:
+        E_score = 0.0
+    else:
+        E_score = 0.2 * score_E / len(E_visited_partitions)
+    
+    if len(F) == 0:
+        F_score = 0.0
+    else:
+        F_score = score_F / len(F)
+    
+    return E_score + F_score
+
 class qgd_Partition_Aware_Mapping:
 
     def __init__(self, config):
@@ -44,16 +155,37 @@ def __init__(self, config):
         self.config.setdefault('routed', False)
         self.config.setdefault('partition_strategy','ilp')
         self.config.setdefault('optimizer', 'BFGS')
-        self.config.setdefault('diagnostics', False)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
             raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
-    
-    def _diagnostic_print(self, *args, **kwargs):
-        """Print diagnostic information if diagnostics are enabled."""
-        if self.config.get('diagnostics', False):
-            print(*args, **kwargs)
+        
+        # Initialize caches for performance optimization
+        self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
+        self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
+
+    def _get_subtopologies_of_type_cached(self, mini_topology):
+        """
+        Cached version of get_subtopologies_of_type.
+        Uses canonical form of mini_topology as cache key.
+        """
+        from squander.synthesis.PartAM_utils import get_canonical_form
+        
+        # Create canonical form key
+        target_qubits = set()
+        for u, v in mini_topology:
+            target_qubits.add(u)
+            target_qubits.add(v)
+        if not target_qubits:
+            return []
+        
+        # Use canonical form as cache key
+        canonical_key = get_canonical_form(target_qubits, mini_topology)
+        
+        if canonical_key not in self._topology_cache:
+            self._topology_cache[canonical_key] = get_subtopologies_of_type(self.topology, mini_topology)
+        
+        return self._topology_cache[canonical_key]
 
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
@@ -166,19 +298,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis") ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
         
-        # Diagnostic: Check first synthesis pass
-        if self.config.get('diagnostics', False):
-            self._diagnostic_print("\n=== Partition Synthesis Diagnostics (First Pass) ===")
-            for idx, opt_part in enumerate(optimized_results):
-                if isinstance(opt_part, SingleQubitPartitionResult):
-                    self._diagnostic_print(f"  Partition {idx}: Single-qubit")
-                else:
-                    self._diagnostic_print(f"  Partition {idx}: Multi-qubit, involved_qubits={opt_part.involved_qbits}")
-                    for tdx in range(opt_part.topology_count):
-                        if opt_part.cnot_counts[tdx]:
-                            min_cnots = min(opt_part.cnot_counts[tdx])
-                            self._diagnostic_print(f"    Topology {tdx}: min_CNOTs={min_cnots}, candidates={len(opt_part.cnot_counts[tdx])}")
-
         weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
@@ -212,31 +331,19 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis") ):
                 optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
-        
-        # Diagnostic: Check partition synthesis errors
-        if self.config.get('diagnostics', False):
-            self._diagnostic_print("\n=== Partition Synthesis Diagnostics (Second Pass) ===")
-            for idx, opt_part in enumerate(optimized_partitions):
-                if isinstance(opt_part, SingleQubitPartitionResult):
-                    self._diagnostic_print(f"  Partition {idx}: Single-qubit, involved_qubits={opt_part.circuit.get_Qbits()}")
-                else:
-                    self._diagnostic_print(f"  Partition {idx}: Multi-qubit, involved_qubits={opt_part.involved_qbits}, qubit_map={opt_part.qubit_map}")
-                    for tdx in range(opt_part.topology_count):
-                        if opt_part.cnot_counts[tdx]:
-                            min_cnots = min(opt_part.cnot_counts[tdx])
-                            avg_cnots = np.mean(opt_part.cnot_counts[tdx])
-                            self._diagnostic_print(f"    Topology {tdx}: min_CNOTs={min_cnots}, avg_CNOTs={avg_cnots:.2f}, candidates={len(opt_part.cnot_counts[tdx])}")
+                
         
         return optimized_partitions
 
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
-        self._diagnostic_print("\n" + "="*70)
-        self._diagnostic_print("PartAM: Starting Partition Aware Mapping")
-        self._diagnostic_print("="*70)
-        self._diagnostic_print(f"Original circuit: {circ.get_Qbit_Num()} qubits, {len(circ.get_Gates())} gates")
         
         optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
-        self._diagnostic_print(f"\nSynthesized {len(optimized_partitions)} partitions")
+        
+        # Initialize topology candidates in PartitionSynthesisResult objects
+        for partition in optimized_partitions:
+            if isinstance(partition, PartitionSynthesisResult):
+                partition._topology = self.topology
+                partition._topology_cache = self._topology_cache
         
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
         sDAG = self.construct_sDAG(optimized_partitions)
@@ -244,18 +351,13 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
         pi_list = pi.tolist() if hasattr(pi, 'tolist') else list(pi)
-        self._diagnostic_print(f"\nInitial layout (pi): {pi_list}")
         
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
-        self._diagnostic_print(f"Initial front set (F): {F}")
         
         partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,D, sDAG)
         pi_final_list = pi_final.tolist() if hasattr(pi_final, 'tolist') else list(pi_final)
-        self._diagnostic_print(f"\nFinal permutation (pi_final): {pi_final_list}")
-        self._diagnostic_print(f"Partition order length: {len(partition_order)}")
         
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
-        self._diagnostic_print(f"\nFinal circuit: {len(final_circuit.get_Gates())} gates, {len(final_parameters)} parameters")
         
         return final_circuit, final_parameters, pi, pi_final
 
@@ -264,32 +366,42 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
         partition_order = []
         step = 0
         
-        if self.config.get('diagnostics', False):
-            self._diagnostic_print("\n=== Heuristic Search ===")
+        # Determine number of processes for parallel scoring
+        num_processes = self.config.get('parallel_scoring_processes', min(mp.cpu_count(), 4))
+        use_parallel = num_processes > 1 and len(F) > 1
+        
+        # Initialize progress bar
+        total_partitions = len(DAG)
+        pbar = tqdm(total=total_partitions, desc="Heuristic Search", 
+                   bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved', 
+                   disable=self.config.get('verbosity', 0) < 1)
         
         while len(F) != 0:
             scores = []
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
             if len(partition_candidates) != 0:
-                for partition_candidate in partition_candidates:
-                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, sDAG, D)
-                    scores.append(score)
+                if use_parallel and len(partition_candidates) > 1:
+                    # Parallel scoring
+                    with Pool(processes=num_processes) as pool:
+                        score_args = [(cand, F, pi, optimized_partitions, sDAG, D, self._swap_cache, self._topology_cache, self.topology) 
+                                     for cand in partition_candidates]
+                        scores = pool.starmap(_score_partition_candidate_helper, score_args)
+                else:
+                    # Sequential scoring
+                    for partition_candidate in partition_candidates:
+                        score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, sDAG, D)
+                        scores.append(score)
             if len(scores) == 0:
                 break
             min_idx = np.argmin(scores)
             min_partition_candidate = partition_candidates[min_idx]
             
-            if self.config.get('diagnostics', False):
-                self._diagnostic_print(f"\n  Step {step}:")
-                self._diagnostic_print(f"    Selected partition: {min_partition_candidate.partition_idx}")
-                # Convert pi to list if it's a numpy array
-                pi_list = pi.tolist() if hasattr(pi, 'tolist') else list(pi)
-                self._diagnostic_print(f"    Current pi: {pi_list}")
-                self._diagnostic_print(f"    Best score: {scores[min_idx]:.4f}, candidates evaluated: {len(scores)}")
-            
             F.remove(min_partition_candidate.partition_idx)
             resolved_partitions[min_partition_candidate.partition_idx] = True
-            swap_order, pi = min_partition_candidate.transform_pi(pi, D)
+            resolved_count = sum(resolved_partitions)
+            pbar.n = resolved_count
+            pbar.refresh()
+            swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
             if len(swap_order)!=0:
                 partition_order.append(construct_swap_circuit(swap_order, len(pi)))
             partition_order.append(min_partition_candidate)
@@ -306,9 +418,14 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
                         qubit = child_partition.circuit.get_Qbits()[0]
                         child_partition.circuit.map_circuit({qubit: pi[qubit]})
                         partition_order.append(child_partition)
+                        resolved_partitions[child] = True
+                        resolved_count = sum(resolved_partitions)
+                        pbar.n = resolved_count
+                        pbar.refresh()
                         children.extend(DAG[child])
                     else:
                         F.append(child)
+        pbar.close()
         return partition_order, pi
 
     def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
@@ -317,59 +434,60 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         perm_count = 0
         partition_count = 0
         
-        if self.config.get('diagnostics', False):
-            self._diagnostic_print("\n=== Circuit Construction ===")
-        
         for part in partition_order:
             if isinstance(part, Circuit):
                 final_circuit.add_Circuit(part)
                 perm_count += 1
-                if self.config.get('diagnostics', False):
-                    self._diagnostic_print(f"  Added permutation circuit ({perm_count} CNOT gates)")
             elif isinstance(part, SingleQubitPartitionResult):
                 final_circuit.add_Circuit(part.circuit)
                 final_parameters.append(part.parameters)
                 partition_count += 1
-                if self.config.get('diagnostics', False):
-                    self._diagnostic_print(f"  Added single-qubit partition {partition_count}, qubit={part.circuit.get_Qbits()}")
             else:
                 part_circ, part_parameters = part.get_final_circuit(optimized_partitions,N)
                 final_circuit.add_Circuit(part_circ)
                 final_parameters.append(part_parameters)
                 partition_count += 1
-                if self.config.get('diagnostics', False):
-                    cnot_count = part_circ.get_Gate_Nums().get('CNOT', 0)
-                    self._diagnostic_print(f"  Added partition {part.partition_idx} (multi-qubit), CNOTs={cnot_count}, qubits={part.involved_qbits}")
         
         if final_parameters:
             final_parameters = np.concatenate(final_parameters,axis=0)
         else:
             final_parameters = np.array([])
         
-        if self.config.get('diagnostics', False):
-            self._diagnostic_print(f"  Total: {perm_count} permutation circuits, {partition_count} partitions")
-        
         return final_circuit, final_parameters
 
     def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, sDAG, D):
         score_F = 0
         score_E = 0
-        E_visited_partitions = []
-        swaps, output_perm = partition_candidate.transform_pi(pi, D)
+        E_visited_partitions = set()  # Changed to set for O(1) membership checks
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, self._swap_cache)
         score_F += len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
 
+        # Cache for transform_pi results to avoid redundant computation
+        # Key: (partition_idx, topology_idx, permutation_idx, topology_candidate_tuple, output_perm_tuple)
+        transform_cache = {}
+
         for partition_idx in sDAG[partition_candidate.partition_idx]:
             if partition_idx in E_visited_partitions:
                 continue
-            E_visited_partitions.append(partition_idx)
+            E_visited_partitions.add(partition_idx)
             mini_scores = []
-            for tdx, mini_topology in enumerate(optimized_partitions[partition_idx].mini_topologies):
-                topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+            partition_result = optimized_partitions[partition_idx]
+            for tdx, mini_topology in enumerate(partition_result.mini_topologies):
+                # Use pre-computed topology candidates if available, otherwise compute and cache
+                if hasattr(partition_result, 'get_topology_candidates'):
+                    topology_candidates = partition_result.get_topology_candidates(tdx)
+                else:
+                    topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
                 for topology_candidate in topology_candidates:
-                    for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx].permutations_pairs[tdx]):
-                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,optimized_partitions[partition_idx].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx].qubit_map,optimized_partitions[partition_idx].involved_qbits)
-                        mini_scores.append(len(new_cand.transform_pi(output_perm,D)[0])*3+len(new_cand.circuit_structure))
+                    for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
+                        # Create cache key for this candidate's transform_pi result
+                        cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                        if cache_key not in transform_cache:
+                            new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
+                            swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                            transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                        mini_scores.append(transform_cache[cache_key])
             if mini_scores:
                 score_E += min(mini_scores)
 
@@ -377,25 +495,44 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
             partition = optimized_partitions[partition_idx]
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                # Use pre-computed topology candidates if available, otherwise compute and cache
+                if hasattr(partition, 'get_topology_candidates'):
+                    topology_candidates = partition.get_topology_candidates(tdx)
+                else:
+                    topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
-                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                        mini_scores.append(len(new_cand.transform_pi(output_perm,D)[0])*3+len(new_cand.circuit_structure))
+                        # Create cache key for this candidate's transform_pi result
+                        cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                        if cache_key not in transform_cache:
+                            new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
+                            swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                            transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                        mini_scores.append(transform_cache[cache_key])
             if mini_scores:
                 score_F += min(mini_scores)
 
             for partition_idx_E in sDAG[partition_idx]:
                 if partition_idx_E in E_visited_partitions:
                     continue
-                E_visited_partitions.append(partition_idx_E)
+                E_visited_partitions.add(partition_idx_E)
                 mini_scores = []
-                for tdx, mini_topology in enumerate(optimized_partitions[partition_idx_E].mini_topologies):
-                    topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                partition_result_E = optimized_partitions[partition_idx_E]
+                for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
+                    # Use pre-computed topology candidates if available, otherwise compute and cache
+                    if hasattr(partition_result_E, 'get_topology_candidates'):
+                        topology_candidates = partition_result_E.get_topology_candidates(tdx)
+                    else:
+                        topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
                     for topology_candidate in topology_candidates:
-                        for pdx, permutation_pair in enumerate(optimized_partitions[partition_idx_E].permutations_pairs[tdx]):
-                            new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,optimized_partitions[partition_idx_E].circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,optimized_partitions[partition_idx_E].qubit_map,optimized_partitions[partition_idx_E].involved_qbits)
-                            mini_scores.append(len(new_cand.transform_pi(output_perm,D)[0])*3+len(new_cand.circuit_structure))
+                        for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
+                            # Create cache key for this candidate's transform_pi result
+                            cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                            if cache_key not in transform_cache:
+                                new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
+                                swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                                transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                            mini_scores.append(transform_cache[cache_key])
                 if mini_scores:
                     score_E += min(mini_scores)
         # Safety check for division by zero
@@ -416,7 +553,11 @@ def obtain_partition_candidates(self, F, optimized_partitions):
         for partition_idx in F:
             partition = optimized_partitions[partition_idx]
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                topology_candidates = get_subtopologies_of_type(self.topology,mini_topology)
+                # Use pre-computed topology candidates if available, otherwise compute and cache
+                if hasattr(partition, 'get_topology_candidates'):
+                    topology_candidates = partition.get_topology_candidates(tdx)
+                else:
+                    topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         partition_candidates.append(PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits))
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 96b7ca57f..710870711 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -222,7 +222,7 @@ def get_partition_synthesis_score(self):
 # Physical qubits Q, reduced physical qubits Q* 
 class PartitionSynthesisResult:
     
-    def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circuit):
+    def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circuit, topology=None, topology_cache=None):
         #The physical mini_topology of the partition q*
         self.mini_topologies = mini_topologies
         #number of topologies
@@ -243,6 +243,10 @@ def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circ
         self.qubit_map = qubit_map
         # the original circuit
         self.original_circuit = original_circuit
+        # Pre-computed topology candidates for each mini_topology (lazy initialization)
+        self._topology_candidates = [None] * len(mini_topologies)
+        self._topology = topology  # Full topology for computing candidates
+        self._topology_cache = topology_cache  # Cache to use for lookups
 
     def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
         self.permutations_pairs[topology_idx].append(permutations_pair)
@@ -280,6 +284,40 @@ def get_partition_synthesis_score(self):
             cnot_count_topology = np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
             score += cnot_count_topology/self.topology_count
         return score
+    
+    def get_topology_candidates(self, topology_idx):
+        """
+        Get topology candidates for a given topology index, using cache if available.
+        """
+        if self._topology_candidates[topology_idx] is None:
+            mini_topology = self.mini_topologies[topology_idx]
+            if self._topology_cache is not None:
+                # Use cached version if available
+                target_qubits = set()
+                for u, v in mini_topology:
+                    target_qubits.add(u)
+                    target_qubits.add(v)
+                if target_qubits:
+                    canonical_key = get_canonical_form(target_qubits, mini_topology)
+                    if canonical_key in self._topology_cache:
+                        self._topology_candidates[topology_idx] = self._topology_cache[canonical_key]
+                    else:
+                        # Compute and cache
+                        if self._topology is not None:
+                            candidates = get_subtopologies_of_type(self._topology, mini_topology)
+                            self._topology_cache[canonical_key] = candidates
+                            self._topology_candidates[topology_idx] = candidates
+                        else:
+                            self._topology_candidates[topology_idx] = []
+                else:
+                    self._topology_candidates[topology_idx] = []
+            else:
+                # No cache, compute directly
+                if self._topology is not None:
+                    self._topology_candidates[topology_idx] = get_subtopologies_of_type(self._topology, mini_topology)
+                else:
+                    self._topology_candidates[topology_idx] = []
+        return self._topology_candidates[topology_idx]
 
 class PartitionCandidate:
     
@@ -307,7 +345,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi(self, pi, D):
+    def transform_pi(self, pi, D, swap_cache=None):
         # Fixed: Use P_i^{-1} instead of P_i for input routing
         # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
         # For Original to see logical qubit q* at partition position q*, we need:
@@ -318,7 +356,21 @@ def transform_pi(self, pi, D):
         qbit_map_input = {k : self.node_mapping[P_i_inv[v]] for k,v in self.qbit_map.items()}
         # Convert pi to plain Python list of ints (may contain np.int64)
         pi_list = [int(x) for x in pi]
-        swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+        
+        # Check cache if provided
+        cache_key = None
+        if swap_cache is not None:
+            # Create cache key: (pi_tuple, frozenset of qbit_map_input items)
+            pi_tuple = tuple(pi_list)
+            qbit_map_frozen = frozenset(qbit_map_input.items())
+            cache_key = (pi_tuple, qbit_map_frozen)
+            if cache_key in swap_cache:
+                swaps, pi_init = swap_cache[cache_key]
+            else:
+                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+                swap_cache[cache_key] = (swaps, pi_init)
+        else:
+            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
         
         pi_output = pi_init.copy()
         # Fixed: P_o should be indexed by partition virtual index q*, not physical index Q*

From 90d5431365d2c50880f0f198ba80f0b06c638fc7 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 27 Nov 2025 16:17:09 +0100
Subject: [PATCH 034/232] add progressbar to config

---
 squander/synthesis/PartAM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2c113b0dc..9255cb174 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -374,7 +374,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
         total_partitions = len(DAG)
         pbar = tqdm(total=total_partitions, desc="Heuristic Search", 
                    bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved', 
-                   disable=self.config.get('verbosity', 0) < 1)
+                   disable=self.config.get('progressbar', 0) == False)
         
         while len(F) != 0:
             scores = []

From bd356ac460be13c5d49b965bda4a9783ba5e8ef0 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 27 Nov 2025 16:21:38 +0100
Subject: [PATCH 035/232] Remove state diagnostics from example

---
 examples/decomposition/PartAM_example.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 13f52710d..7a560f802 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -35,7 +35,7 @@
             'test_subcircuits': True,
             'test_final_circuit': True,
             'max_partition_size': 3,
-            'diagnostics': True,  # Enable diagnostic output
+            'progressbar': True,  # Enable diagnostic output
     }
 
     filename = "benchmarks/qfast/4q/adder_q4.qasm"
@@ -71,9 +71,6 @@
     PartAM_state = initial_state.copy()
     circ_Final.apply_to(params, PartAM_state)
     state_error = 1 - abs(np.vdot(PartAM_state, original_state))
-    print(Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params))
-    print(f"PartAM_state probability: {np.abs(PartAM_state)**2}")
-    print(f"original_state probability: {np.abs(original_state)**2}")
     print(f"\n{'='*70}")
     print(f"State Vector Validation")
     print(f"{'='*70}")

From b38e48d1b326f3a76ebd62501ddda35ac8cf6440 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 27 Nov 2025 17:04:41 +0100
Subject: [PATCH 036/232] Increase core count for scoring

---
 squander/synthesis/PartAM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 9255cb174..e5ac6f125 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -367,7 +367,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
         step = 0
         
         # Determine number of processes for parallel scoring
-        num_processes = self.config.get('parallel_scoring_processes', min(mp.cpu_count(), 4))
+        num_processes = self.config.get('parallel_scoring_processes', min(mp.cpu_count(), 64))
         use_parallel = num_processes > 1 and len(F) > 1
         
         # Initialize progress bar

From e975b3c511708a36af3a30b4ba627b6903782360 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 27 Nov 2025 17:06:12 +0100
Subject: [PATCH 037/232] Add in progressbar verbosity

---
 squander/synthesis/PartAM.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 9255cb174..07a45b38a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -295,7 +295,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
                 optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
-            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis") ):
+            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
         
         weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
@@ -329,7 +329,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
                 optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
-            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis") ):
+            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
                 
         

From 01e38e722a4179c5e50188bcaa56ec4332a4ba62 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 28 Nov 2025 21:16:37 +0100
Subject: [PATCH 038/232] Remove bad parallelization

---
 squander/synthesis/PartAM.py | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index c96c4c7a4..6fdca8eeb 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -366,9 +366,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
         partition_order = []
         step = 0
         
-        # Determine number of processes for parallel scoring
-        num_processes = self.config.get('parallel_scoring_processes', min(mp.cpu_count(), 64))
-        use_parallel = num_processes > 1 and len(F) > 1
         
         # Initialize progress bar
         total_partitions = len(DAG)
@@ -379,18 +376,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
         while len(F) != 0:
             scores = []
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
-            if len(partition_candidates) != 0:
-                if use_parallel and len(partition_candidates) > 1:
-                    # Parallel scoring
-                    with Pool(processes=num_processes) as pool:
-                        score_args = [(cand, F, pi, optimized_partitions, sDAG, D, self._swap_cache, self._topology_cache, self.topology) 
-                                     for cand in partition_candidates]
-                        scores = pool.starmap(_score_partition_candidate_helper, score_args)
-                else:
-                    # Sequential scoring
-                    for partition_candidate in partition_candidates:
-                        score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, sDAG, D)
-                        scores.append(score)
+            for partition_candidate in partition_candidates:
+                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, sDAG, D)
+                    scores.append(score)
             if len(scores) == 0:
                 break
             min_idx = np.argmin(scores)

From dcecb7416fd0cc26a9718560125ff16b4286b5f8 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 2 Dec 2025 11:27:31 +0100
Subject: [PATCH 039/232] Parallelize cost function evaluation

---
 squander/synthesis/PartAM.py | 143 +++++------------------------------
 1 file changed, 18 insertions(+), 125 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6fdca8eeb..40f67d09a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -28,117 +28,6 @@
 SingleQubitPartitionResult, PartitionSynthesisResult, 
 PartitionCandidate, check_circuit_compatibility, construct_swap_circuit)
 
-def _score_partition_candidate_helper(partition_candidate, F, pi, optimized_partitions, sDAG, D, swap_cache, topology_cache, topology):
-    """
-    Helper function for parallel candidate scoring.
-    This is a module-level function to enable pickling for multiprocessing.
-    """
-    score_F = 0
-    score_E = 0
-    E_visited_partitions = set()
-    
-    # Use swap cache if provided
-    swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
-    score_F += len(swaps)*3
-    score_F += len(partition_candidate.circuit_structure)
-
-    # Cache for transform_pi results to avoid redundant computation
-    transform_cache = {}
-
-    # Helper function to get topology candidates
-    def get_topology_candidates_cached(mini_topology):
-        if topology_cache is not None:
-            from squander.synthesis.PartAM_utils import get_canonical_form
-            target_qubits = set()
-            for u, v in mini_topology:
-                target_qubits.add(u)
-                target_qubits.add(v)
-            if target_qubits:
-                canonical_key = get_canonical_form(target_qubits, mini_topology)
-                if canonical_key in topology_cache:
-                    return topology_cache[canonical_key]
-                else:
-                    candidates = get_subtopologies_of_type(topology, mini_topology)
-                    topology_cache[canonical_key] = candidates
-                    return candidates
-        return get_subtopologies_of_type(topology, mini_topology)
-
-    for partition_idx in sDAG[partition_candidate.partition_idx]:
-        if partition_idx in E_visited_partitions:
-            continue
-        E_visited_partitions.add(partition_idx)
-        mini_scores = []
-        partition_result = optimized_partitions[partition_idx]
-        for tdx, mini_topology in enumerate(partition_result.mini_topologies):
-            if hasattr(partition_result, 'get_topology_candidates'):
-                topology_candidates = partition_result.get_topology_candidates(tdx)
-            else:
-                topology_candidates = get_topology_candidates_cached(mini_topology)
-            for topology_candidate in topology_candidates:
-                for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
-                    cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                    if cache_key not in transform_cache:
-                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
-                        swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                        transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
-                    mini_scores.append(transform_cache[cache_key])
-        if mini_scores:
-            score_E += min(mini_scores)
-
-    for partition_idx in F:
-        partition = optimized_partitions[partition_idx]
-        mini_scores = []
-        for tdx, mini_topology in enumerate(partition.mini_topologies):
-            if hasattr(partition, 'get_topology_candidates'):
-                topology_candidates = partition.get_topology_candidates(tdx)
-            else:
-                topology_candidates = get_topology_candidates_cached(mini_topology)
-            for topology_candidate in topology_candidates:
-                for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
-                    cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                    if cache_key not in transform_cache:
-                        new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                        swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                        transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
-                    mini_scores.append(transform_cache[cache_key])
-        if mini_scores:
-            score_F += min(mini_scores)
-
-        for partition_idx_E in sDAG[partition_idx]:
-            if partition_idx_E in E_visited_partitions:
-                continue
-            E_visited_partitions.add(partition_idx_E)
-            mini_scores = []
-            partition_result_E = optimized_partitions[partition_idx_E]
-            for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
-                if hasattr(partition_result_E, 'get_topology_candidates'):
-                    topology_candidates = partition_result_E.get_topology_candidates(tdx)
-                else:
-                    topology_candidates = get_topology_candidates_cached(mini_topology)
-                for topology_candidate in topology_candidates:
-                    for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
-                        cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                        if cache_key not in transform_cache:
-                            new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
-                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                            transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
-                        mini_scores.append(transform_cache[cache_key])
-            if mini_scores:
-                score_E += min(mini_scores)
-    
-    # Safety check for division by zero
-    if len(E_visited_partitions) == 0:
-        E_score = 0.0
-    else:
-        E_score = 0.2 * score_E / len(E_visited_partitions)
-    
-    if len(F) == 0:
-        F_score = 0.0
-    else:
-        F_score = score_F / len(F)
-    
-    return E_score + F_score
-
 class qgd_Partition_Aware_Mapping:
 
     def __init__(self, config):
@@ -219,6 +108,7 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
         else:
             result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
         return result
+
     @staticmethod
     def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
         """
@@ -374,13 +264,15 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
                    disable=self.config.get('progressbar', 0) == False)
         
         while len(F) != 0:
-            scores = []
             partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
-            for partition_candidate in partition_candidates:
-                    score = self.score_partition_candidate(partition_candidate, F, pi, optimized_partitions, sDAG, D)
-                    scores.append(score)
-            if len(scores) == 0:
+            if len(partition_candidates) == 0:
                 break
+            scores = [None] * len(partition_candidates)
+            with Pool(processes=mp.cpu_count()) as pool:
+                for idx, partition_candidate in enumerate(partition_candidates):
+                    scores[idx] = pool.apply_async(self.score_partition_candidate, (partition_candidate, F, pi, optimized_partitions, sDAG, D, self._swap_cache))
+            for idx, score in enumerate(scores):
+                scores[idx] = scores[idx].get()
             min_idx = np.argmin(scores)
             min_partition_candidate = partition_candidates[min_idx]
             
@@ -442,12 +334,13 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
             final_parameters = np.array([])
         
         return final_circuit, final_parameters
-
-    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, sDAG, D):
+    
+    @staticmethod
+    def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions, sDAG, D, swap_cache):
         score_F = 0
         score_E = 0
         E_visited_partitions = set()  # Changed to set for O(1) membership checks
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, self._swap_cache)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
         score_F += len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
 
@@ -466,14 +359,14 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                 if hasattr(partition_result, 'get_topology_candidates'):
                     topology_candidates = partition_result.get_topology_candidates(tdx)
                 else:
-                    topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
+                    topology_candidates = get_unique_subtopologies(mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
                         # Create cache key for this candidate's transform_pi result
                         cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                         if cache_key not in transform_cache:
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
-                            swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
                             transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
                         mini_scores.append(transform_cache[cache_key])
             if mini_scores:
@@ -487,14 +380,14 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                 if hasattr(partition, 'get_topology_candidates'):
                     topology_candidates = partition.get_topology_candidates(tdx)
                 else:
-                    topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
+                    topology_candidates = get_unique_subtopologies(mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         # Create cache key for this candidate's transform_pi result
                         cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                         if cache_key not in transform_cache:
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                            swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
                             transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
                         mini_scores.append(transform_cache[cache_key])
             if mini_scores:
@@ -511,14 +404,14 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                     if hasattr(partition_result_E, 'get_topology_candidates'):
                         topology_candidates = partition_result_E.get_topology_candidates(tdx)
                     else:
-                        topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
+                        topology_candidates = get_unique_subtopologies(mini_topology)
                     for topology_candidate in topology_candidates:
                         for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
                             # Create cache key for this candidate's transform_pi result
                             cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                             if cache_key not in transform_cache:
                                 new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
-                                swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                                swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
                                 transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
                             mini_scores.append(transform_cache[cache_key])
                 if mini_scores:

From ca361f2a841795a81172f67f326b180119d1898c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 2 Dec 2025 11:34:04 +0100
Subject: [PATCH 040/232] fix non static function calls

---
 squander/synthesis/PartAM.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 40f67d09a..81bb3067c 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -270,7 +270,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
             scores = [None] * len(partition_candidates)
             with Pool(processes=mp.cpu_count()) as pool:
                 for idx, partition_candidate in enumerate(partition_candidates):
-                    scores[idx] = pool.apply_async(self.score_partition_candidate, (partition_candidate, F, pi, optimized_partitions, sDAG, D, self._swap_cache))
+                    scores[idx] = pool.apply_async(qgd_Partition_Aware_Mapping.score_partition_candidate, (partition_candidate, F, pi, optimized_partitions, sDAG, D, self._swap_cache, self.topology))
             for idx, score in enumerate(scores):
                 scores[idx] = scores[idx].get()
             min_idx = np.argmin(scores)
@@ -336,7 +336,7 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         return final_circuit, final_parameters
     
     @staticmethod
-    def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions, sDAG, D, swap_cache):
+    def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions, sDAG, D, swap_cache, topology):
         score_F = 0
         score_E = 0
         E_visited_partitions = set()  # Changed to set for O(1) membership checks
@@ -359,7 +359,7 @@ def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions,
                 if hasattr(partition_result, 'get_topology_candidates'):
                     topology_candidates = partition_result.get_topology_candidates(tdx)
                 else:
-                    topology_candidates = get_unique_subtopologies(mini_topology)
+                    topology_candidates = get_subtopologies_of_type(topology, mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
                         # Create cache key for this candidate's transform_pi result
@@ -380,7 +380,7 @@ def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions,
                 if hasattr(partition, 'get_topology_candidates'):
                     topology_candidates = partition.get_topology_candidates(tdx)
                 else:
-                    topology_candidates = get_unique_subtopologies(mini_topology)
+                    topology_candidates = get_subtopologies_of_type(topology, mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         # Create cache key for this candidate's transform_pi result
@@ -404,7 +404,7 @@ def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions,
                     if hasattr(partition_result_E, 'get_topology_candidates'):
                         topology_candidates = partition_result_E.get_topology_candidates(tdx)
                     else:
-                        topology_candidates = get_unique_subtopologies(mini_topology)
+                        topology_candidates = get_subtopologies_of_type(topology, mini_topology)
                     for topology_candidate in topology_candidates:
                         for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
                             # Create cache key for this candidate's transform_pi result

From b1595dd41a291fb98b6483d53163c00eb15a4b91 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 2 Dec 2025 11:59:21 +0100
Subject: [PATCH 041/232] re-try parallelization

---
 squander/synthesis/PartAM.py | 205 +++++++++++++++++++----------------
 1 file changed, 110 insertions(+), 95 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 81bb3067c..6fb8f63eb 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -7,6 +7,7 @@
     qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
 )
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+from concurrent.futures import ThreadPoolExecutor
 from itertools import permutations
 from squander.partitioning.ilp import get_all_partitions, _get_topo_order, topo_sort_partitions, ilp_global_optimal, recombine_single_qubit_chains
 
@@ -263,48 +264,59 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
                    bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved', 
                    disable=self.config.get('progressbar', 0) == False)
         
-        while len(F) != 0:
-            partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
-            if len(partition_candidates) == 0:
-                break
-            scores = [None] * len(partition_candidates)
-            with Pool(processes=mp.cpu_count()) as pool:
-                for idx, partition_candidate in enumerate(partition_candidates):
-                    scores[idx] = pool.apply_async(qgd_Partition_Aware_Mapping.score_partition_candidate, (partition_candidate, F, pi, optimized_partitions, sDAG, D, self._swap_cache, self.topology))
-            for idx, score in enumerate(scores):
-                scores[idx] = scores[idx].get()
-            min_idx = np.argmin(scores)
-            min_partition_candidate = partition_candidates[min_idx]
-            
-            F.remove(min_partition_candidate.partition_idx)
-            resolved_partitions[min_partition_candidate.partition_idx] = True
-            resolved_count = sum(resolved_partitions)
-            pbar.n = resolved_count
-            pbar.refresh()
-            swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
-            if len(swap_order)!=0:
-                partition_order.append(construct_swap_circuit(swap_order, len(pi)))
-            partition_order.append(min_partition_candidate)
-            children = DAG[min_partition_candidate.partition_idx]
-            step += 1
-            while len(children) != 0:
-                child = children.pop(0)
-                parents_resolved = True
-                for parent in IDAG[child]:
-                    parents_resolved *= resolved_partitions[parent]
-                if (not resolved_partitions[child] and child not in F) and parents_resolved:
-                    if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
-                        child_partition = optimized_partitions[child]
-                        qubit = child_partition.circuit.get_Qbits()[0]
-                        child_partition.circuit.map_circuit({qubit: pi[qubit]})
-                        partition_order.append(child_partition)
-                        resolved_partitions[child] = True
-                        resolved_count = sum(resolved_partitions)
-                        pbar.n = resolved_count
-                        pbar.refresh()
-                        children.extend(DAG[child])
-                    else:
-                        F.append(child)
+        score_workers = max(1, os.cpu_count() or 1)
+        with ThreadPoolExecutor(max_workers=score_workers) as executor:
+            while len(F) != 0:
+                partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
+                if len(partition_candidates) == 0:
+                    break
+                # Parallelize scoring while keeping the underlying logic intact
+                F_snapshot = tuple(F)
+                futures = [
+                    executor.submit(
+                        self.score_partition_candidate,
+                        partition_candidate,
+                        F_snapshot,
+                        pi,
+                        optimized_partitions,
+                        sDAG,
+                        D,
+                    )
+                    for partition_candidate in partition_candidates
+                ]
+                scores = [future.result() for future in futures]
+                min_idx = np.argmin(scores)
+                min_partition_candidate = partition_candidates[min_idx]
+                
+                F.remove(min_partition_candidate.partition_idx)
+                resolved_partitions[min_partition_candidate.partition_idx] = True
+                resolved_count = sum(resolved_partitions)
+                pbar.n = resolved_count
+                pbar.refresh()
+                swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
+                if len(swap_order)!=0:
+                    partition_order.append(construct_swap_circuit(swap_order, len(pi)))
+                partition_order.append(min_partition_candidate)
+                children = DAG[min_partition_candidate.partition_idx]
+                step += 1
+                while len(children) != 0:
+                    child = children.pop(0)
+                    parents_resolved = True
+                    for parent in IDAG[child]:
+                        parents_resolved *= resolved_partitions[parent]
+                    if (not resolved_partitions[child] and child not in F) and parents_resolved:
+                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                            child_partition = optimized_partitions[child]
+                            qubit = child_partition.circuit.get_Qbits()[0]
+                            child_partition.circuit.map_circuit({qubit: pi[qubit]})
+                            partition_order.append(child_partition)
+                            resolved_partitions[child] = True
+                            resolved_count = sum(resolved_partitions)
+                            pbar.n = resolved_count
+                            pbar.refresh()
+                            children.extend(DAG[child])
+                        else:
+                            F.append(child)
         pbar.close()
         return partition_order, pi
 
@@ -335,12 +347,11 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         
         return final_circuit, final_parameters
     
-    @staticmethod
-    def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions, sDAG, D, swap_cache, topology):
+    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, sDAG, D):
         score_F = 0
         score_E = 0
         E_visited_partitions = set()  # Changed to set for O(1) membership checks
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, self._swap_cache)
         score_F += len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
 
@@ -348,29 +359,31 @@ def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions,
         # Key: (partition_idx, topology_idx, permutation_idx, topology_candidate_tuple, output_perm_tuple)
         transform_cache = {}
 
-        for partition_idx in sDAG[partition_candidate.partition_idx]:
-            if partition_idx in E_visited_partitions:
-                continue
-            E_visited_partitions.add(partition_idx)
-            mini_scores = []
-            partition_result = optimized_partitions[partition_idx]
-            for tdx, mini_topology in enumerate(partition_result.mini_topologies):
-                # Use pre-computed topology candidates if available, otherwise compute and cache
-                if hasattr(partition_result, 'get_topology_candidates'):
-                    topology_candidates = partition_result.get_topology_candidates(tdx)
-                else:
-                    topology_candidates = get_subtopologies_of_type(topology, mini_topology)
-                for topology_candidate in topology_candidates:
-                    for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
-                        # Create cache key for this candidate's transform_pi result
-                        cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                        if cache_key not in transform_cache:
-                            new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
-                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                            transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
-                        mini_scores.append(transform_cache[cache_key])
-            if mini_scores:
-                score_E += min(mini_scores)
+        # Safety check: ensure partition_idx is valid for sDAG
+        if partition_candidate.partition_idx < len(sDAG):
+            for partition_idx in sDAG[partition_candidate.partition_idx]:
+                if partition_idx in E_visited_partitions:
+                    continue
+                E_visited_partitions.add(partition_idx)
+                mini_scores = []
+                partition_result = optimized_partitions[partition_idx]
+                for tdx, mini_topology in enumerate(partition_result.mini_topologies):
+                    # Use pre-computed topology candidates if available, otherwise compute and cache
+                    if hasattr(partition_result, 'get_topology_candidates'):
+                        topology_candidates = partition_result.get_topology_candidates(tdx)
+                    else:
+                        topology_candidates = get_subtopologies_of_type(self.topology, mini_topology)
+                    for topology_candidate in topology_candidates:
+                        for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
+                            # Create cache key for this candidate's transform_pi result
+                            cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                            if cache_key not in transform_cache:
+                                new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
+                                swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                                transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                            mini_scores.append(transform_cache[cache_key])
+                if mini_scores:
+                    score_E += min(mini_scores)
 
         for partition_idx in F:
             partition = optimized_partitions[partition_idx]
@@ -380,42 +393,44 @@ def score_partition_candidate(partition_candidate, F,  pi, optimized_partitions,
                 if hasattr(partition, 'get_topology_candidates'):
                     topology_candidates = partition.get_topology_candidates(tdx)
                 else:
-                    topology_candidates = get_subtopologies_of_type(topology, mini_topology)
+                    topology_candidates = get_subtopologies_of_type(self.topology, mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         # Create cache key for this candidate's transform_pi result
                         cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                         if cache_key not in transform_cache:
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
+                            swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
                             transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
                         mini_scores.append(transform_cache[cache_key])
             if mini_scores:
                 score_F += min(mini_scores)
 
-            for partition_idx_E in sDAG[partition_idx]:
-                if partition_idx_E in E_visited_partitions:
-                    continue
-                E_visited_partitions.add(partition_idx_E)
-                mini_scores = []
-                partition_result_E = optimized_partitions[partition_idx_E]
-                for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
-                    # Use pre-computed topology candidates if available, otherwise compute and cache
-                    if hasattr(partition_result_E, 'get_topology_candidates'):
-                        topology_candidates = partition_result_E.get_topology_candidates(tdx)
-                    else:
-                        topology_candidates = get_subtopologies_of_type(topology, mini_topology)
-                    for topology_candidate in topology_candidates:
-                        for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
-                            # Create cache key for this candidate's transform_pi result
-                            cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                            if cache_key not in transform_cache:
-                                new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
-                                swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                                transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
-                            mini_scores.append(transform_cache[cache_key])
-                if mini_scores:
-                    score_E += min(mini_scores)
+            # Safety check: ensure partition_idx is valid for sDAG
+            if partition_idx < len(sDAG):
+                for partition_idx_E in sDAG[partition_idx]:
+                    if partition_idx_E in E_visited_partitions:
+                        continue
+                    E_visited_partitions.add(partition_idx_E)
+                    mini_scores = []
+                    partition_result_E = optimized_partitions[partition_idx_E]
+                    for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
+                        # Use pre-computed topology candidates if available, otherwise compute and cache
+                        if hasattr(partition_result_E, 'get_topology_candidates'):
+                            topology_candidates = partition_result_E.get_topology_candidates(tdx)
+                        else:
+                            topology_candidates = get_subtopologies_of_type(self.topology, mini_topology)
+                        for topology_candidate in topology_candidates:
+                            for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
+                                # Create cache key for this candidate's transform_pi result
+                                cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
+                                if cache_key not in transform_cache:
+                                    new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
+                                    swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                                    transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                                mini_scores.append(transform_cache[cache_key])
+                    if mini_scores:
+                        score_E += min(mini_scores)
         # Safety check for division by zero
         if len(E_visited_partitions) == 0:
             E_score = 0.0
@@ -489,7 +504,7 @@ def construct_DAG_and_IDAG(self, optimized_partitions):
         return DAG, IDAG
     
     def construct_sDAG(self, optimized_partitions):
-        sDAG = []
+        sDAG = [[] for _ in range(len(optimized_partitions))]
         
         for idx in range(len(optimized_partitions)):
             # Skip single-qubit partitions
@@ -513,7 +528,7 @@ def construct_sDAG(self, optimized_partitions):
                             involved_qbits_current.remove(intersection_qbit)
                     if len(involved_qbits_current) == 0:
                         break                        
-            sDAG.append(children)
+            sDAG[idx] = children
             
         return sDAG
 

From 0348423cad251caf1e44c274c7ee01faea45c5a2 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Tue, 2 Dec 2025 12:01:21 +0100
Subject: [PATCH 042/232] Change scoring to minimum count

---
 squander/synthesis/PartAM_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 710870711..cbd09a794 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -281,7 +281,7 @@ def get_original_circuit_structure(self):
     def get_partition_synthesis_score(self):
         score = 0
         for topology_idx in range(self.topology_count):
-            cnot_count_topology = np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
+            cnot_count_topology = np.min(self.cnot_counts[topology_idx])#np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
             score += cnot_count_topology/self.topology_count
         return score
     

From 68251c0901856ed862c3e2ad1f5abb9bc0122402 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 2 Dec 2025 12:17:48 +0100
Subject: [PATCH 043/232] Fix runtime error in partam utils

---
 squander/synthesis/PartAM_utils.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index cbd09a794..cb0a017cf 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -3,6 +3,8 @@
 from itertools import permutations
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
 import heapq
+import math
+import logging
 
 def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
     adj_list = {}
@@ -161,12 +163,20 @@ def is_goal(state):
     
     def heuristic(state):
         """Lower bound: sum of distances for qubits needing routing"""
-        total = 0
+        total = 0.0
         for q, target_P in pi_B_dict.items():
             # Find where virtual qubit q currently is
             current_P = state.index(q)
-            total += dist_matrix[current_P][target_P]
-        return total // 2  # Optimistic: each SWAP helps 2 qubits
+            distance = dist_matrix[current_P][target_P]
+            if np.isinf(distance):
+                logging.warning(
+                    "Encountered unreachable qubit pair (%s, %s) in routing heuristic; returning inf cost.",
+                    current_P,
+                    target_P,
+                )
+                return math.inf
+            total += float(distance)
+        return math.floor(total / 2)  # Optimistic: each SWAP helps 2 qubits
     
     # A* search
     heap = [(heuristic(start_state), 0, start_state, [])]

From ecd1dc5bfca7fb131fcdce9f072a3851cd7a4fc2 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 2 Dec 2025 13:10:19 +0100
Subject: [PATCH 044/232] Rework multi threading again

---
 squander/synthesis/PartAM.py | 230 +++++++++++++++++++++++++++--------
 1 file changed, 182 insertions(+), 48 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6fb8f63eb..988b43164 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -7,27 +7,91 @@
     qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
 )
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
-from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import ProcessPoolExecutor
 from itertools import permutations
-from squander.partitioning.ilp import get_all_partitions, _get_topo_order, topo_sort_partitions, ilp_global_optimal, recombine_single_qubit_chains
+from squander.partitioning.ilp import (
+    get_all_partitions,
+    _get_topo_order,
+    topo_sort_partitions,
+    ilp_global_optimal,
+    recombine_single_qubit_chains,
+)
 
 import numpy as np
 from qiskit import QuantumCircuit
 
-from typing import List, Callable
+from typing import Callable, Dict, List, Optional, Set, Tuple, FrozenSet
+from dataclasses import dataclass
 
 import multiprocessing as mp
 from multiprocessing import Process, Pool
 import os
-from typing import List, Set, Tuple, FrozenSet
+import logging
 from tqdm import tqdm
 from collections import deque, defaultdict
 import numpy as np
 
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
-from squander.synthesis.PartAM_utils import (get_subtopologies_of_type, get_unique_subtopologies, get_canonical_form,
-SingleQubitPartitionResult, PartitionSynthesisResult, 
-PartitionCandidate, check_circuit_compatibility, construct_swap_circuit)
+from squander.synthesis.PartAM_utils import (
+    get_subtopologies_of_type,
+    get_unique_subtopologies,
+    get_canonical_form,
+    SingleQubitPartitionResult,
+    PartitionSynthesisResult,
+    PartitionCandidate,
+    check_circuit_compatibility,
+    construct_swap_circuit,
+)
+
+
+@dataclass(frozen=True)
+class PartitionScoreData:
+    mini_topologies: Tuple[Tuple[Tuple[int, int], ...], ...]
+    topology_candidates: Tuple[Tuple[Tuple[int, int], ...], ...]
+    permutations_pairs: Tuple[
+        Tuple[Tuple[Tuple[int, ...], Tuple[int, ...]], ...], ...
+    ]
+    circuit_structures: Tuple[Tuple[Tuple[int, ...], ...], ...]
+    qubit_map: Dict[int, int]
+    involved_qbits: Tuple[int, ...]
+
+
+_WORKER_SCORING_PARTITIONS: Optional[List[Optional[PartitionScoreData]]] = None
+_WORKER_S_DAG: Optional[List[List[int]]] = None
+_WORKER_DISTANCE_MATRIX: Optional[np.ndarray] = None
+_WORKER_SWAP_CACHE: Optional[Dict] = None
+
+
+def _init_scoring_worker(scoring_partitions, sdag, distance_matrix):
+    """Initializer for process-based scoring workers."""
+    global _WORKER_SCORING_PARTITIONS, _WORKER_S_DAG, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE
+    _WORKER_SCORING_PARTITIONS = scoring_partitions
+    _WORKER_S_DAG = sdag
+    _WORKER_DISTANCE_MATRIX = distance_matrix
+    _WORKER_SWAP_CACHE = {}
+
+
+def _score_candidate_worker(payload):
+    """
+    Worker wrapper that reconstructs scoring inputs from a lightweight payload.
+    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot)
+    """
+    if (
+        _WORKER_SCORING_PARTITIONS is None
+        or _WORKER_S_DAG is None
+        or _WORKER_DISTANCE_MATRIX is None
+    ):
+        raise RuntimeError("Scoring worker not initialized with shared data.")
+    partition_candidate, F_snapshot, pi_snapshot = payload
+    return qgd_Partition_Aware_Mapping.score_partition_candidate(
+        partition_candidate,
+        F_snapshot,
+        pi_snapshot,
+        _WORKER_SCORING_PARTITIONS,
+        _WORKER_S_DAG,
+        _WORKER_DISTANCE_MATRIX,
+        _WORKER_SWAP_CACHE,
+    )
 
 class qgd_Partition_Aware_Mapping:
 
@@ -45,6 +109,7 @@ def __init__(self, config):
         self.config.setdefault('routed', False)
         self.config.setdefault('partition_strategy','ilp')
         self.config.setdefault('optimizer', 'BFGS')
+        self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -77,6 +142,54 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
         
         return self._topology_cache[canonical_key]
 
+    def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[PartitionScoreData]]:
+        """
+        Create lightweight, picklable views of partitions that contain only the
+        data required during heuristic scoring.
+        """
+        scoring_partitions: List[Optional[PartitionScoreData]] = []
+        for partition in optimized_partitions:
+            if isinstance(partition, SingleQubitPartitionResult):
+                scoring_partitions.append(None)
+                continue
+
+            mini_topologies = tuple(
+                tuple(tuple(edge) for edge in mini_topology)
+                for mini_topology in partition.mini_topologies
+            )
+
+            topology_candidates = []
+            for tdx, mini_topology in enumerate(partition.mini_topologies):
+                if hasattr(partition, "get_topology_candidates"):
+                    candidates = partition.get_topology_candidates(tdx)
+                else:
+                    candidates = self._get_subtopologies_of_type_cached(mini_topology)
+                topology_candidates.append(
+                    tuple(tuple(edge) for edge in candidates)
+                )
+
+            permutations_pairs = tuple(
+                tuple((tuple(P_i), tuple(P_o)) for (P_i, P_o) in partition.permutations_pairs[tdx])
+                for tdx in range(len(partition.mini_topologies))
+            )
+
+            circuit_structures = tuple(
+                tuple(tuple(struct) for struct in partition.circuit_structures[tdx])
+                for tdx in range(len(partition.mini_topologies))
+            )
+
+            scoring_partitions.append(
+                PartitionScoreData(
+                    mini_topologies=mini_topologies,
+                    topology_candidates=tuple(topology_candidates),
+                    permutations_pairs=permutations_pairs,
+                    circuit_structures=circuit_structures,
+                    qubit_map=dict(partition.qubit_map),
+                    involved_qbits=tuple(partition.involved_qbits),
+                )
+            )
+        return scoring_partitions
+
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
         """
@@ -244,15 +357,16 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         pi_list = pi.tolist() if hasattr(pi, 'tolist') else list(pi)
         
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
+        scoring_partitions = self._build_scoring_partitions(optimized_partitions)
         
-        partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,D, sDAG)
+        partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,scoring_partitions,D, sDAG)
         pi_final_list = pi_final.tolist() if hasattr(pi_final, 'tolist') else list(pi_final)
         
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
         
         return final_circuit, final_parameters, pi, pi_final
 
-    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
+    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, sDAG):
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
@@ -264,27 +378,49 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
                    bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved', 
                    disable=self.config.get('progressbar', 0) == False)
         
-        score_workers = max(1, os.cpu_count() or 1)
-        with ThreadPoolExecutor(max_workers=score_workers) as executor:
+        configured_workers = self.config.get('hs_score_workers', os.cpu_count() or 1)
+        score_workers = max(1, configured_workers if configured_workers else 1)
+        executor: Optional[ProcessPoolExecutor] = None
+        if score_workers > 1:
+            try:
+                executor = ProcessPoolExecutor(
+                    max_workers=score_workers,
+                    initializer=_init_scoring_worker,
+                    initargs=(scoring_partitions, sDAG, D),
+                )
+            except Exception as exc:
+                logging.warning(
+                    "Falling back to sequential heuristic scoring: %s",
+                    exc,
+                )
+                executor = None
+
+        try:
             while len(F) != 0:
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
                     break
-                # Parallelize scoring while keeping the underlying logic intact
                 F_snapshot = tuple(F)
-                futures = [
-                    executor.submit(
-                        self.score_partition_candidate,
-                        partition_candidate,
-                        F_snapshot,
-                        pi,
-                        optimized_partitions,
-                        sDAG,
-                        D,
-                    )
-                    for partition_candidate in partition_candidates
-                ]
-                scores = [future.result() for future in futures]
+                if executor is not None:
+                    pi_snapshot = tuple(int(x) for x in pi)
+                    payloads = [
+                        (partition_candidate, F_snapshot, pi_snapshot)
+                        for partition_candidate in partition_candidates
+                    ]
+                    scores = list(executor.map(_score_candidate_worker, payloads))
+                else:
+                    scores = [
+                        self.score_partition_candidate(
+                            partition_candidate,
+                            F_snapshot,
+                            pi,
+                            scoring_partitions,
+                            sDAG,
+                            D,
+                            self._swap_cache,
+                        )
+                        for partition_candidate in partition_candidates
+                    ]
                 min_idx = np.argmin(scores)
                 min_partition_candidate = partition_candidates[min_idx]
                 
@@ -317,6 +453,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, D, sDAG):
                             children.extend(DAG[child])
                         else:
                             F.append(child)
+        finally:
+            if executor is not None:
+                executor.shutdown()
         pbar.close()
         return partition_order, pi
 
@@ -347,11 +486,12 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         
         return final_circuit, final_parameters
     
-    def score_partition_candidate(self, partition_candidate, F,  pi, optimized_partitions, sDAG, D):
+    @staticmethod
+    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache):
         score_F = 0
         score_E = 0
         E_visited_partitions = set()  # Changed to set for O(1) membership checks
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, self._swap_cache)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
         score_F += len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
 
@@ -366,41 +506,37 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                     continue
                 E_visited_partitions.add(partition_idx)
                 mini_scores = []
-                partition_result = optimized_partitions[partition_idx]
+                partition_result = scoring_partitions[partition_idx]
+                if partition_result is None:
+                    continue
                 for tdx, mini_topology in enumerate(partition_result.mini_topologies):
-                    # Use pre-computed topology candidates if available, otherwise compute and cache
-                    if hasattr(partition_result, 'get_topology_candidates'):
-                        topology_candidates = partition_result.get_topology_candidates(tdx)
-                    else:
-                        topology_candidates = get_subtopologies_of_type(self.topology, mini_topology)
+                    topology_candidates = partition_result.topology_candidates[tdx]
                     for topology_candidate in topology_candidates:
                         for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
                             # Create cache key for this candidate's transform_pi result
                             cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                             if cache_key not in transform_cache:
                                 new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
-                                swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                                swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
                                 transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
                             mini_scores.append(transform_cache[cache_key])
                 if mini_scores:
                     score_E += min(mini_scores)
 
         for partition_idx in F:
-            partition = optimized_partitions[partition_idx]
+            partition = scoring_partitions[partition_idx]
+            if partition is None:
+                continue
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                # Use pre-computed topology candidates if available, otherwise compute and cache
-                if hasattr(partition, 'get_topology_candidates'):
-                    topology_candidates = partition.get_topology_candidates(tdx)
-                else:
-                    topology_candidates = get_subtopologies_of_type(self.topology, mini_topology)
+                topology_candidates = partition.topology_candidates[tdx]
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         # Create cache key for this candidate's transform_pi result
                         cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                         if cache_key not in transform_cache:
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                            swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
                             transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
                         mini_scores.append(transform_cache[cache_key])
             if mini_scores:
@@ -413,20 +549,18 @@ def score_partition_candidate(self, partition_candidate, F,  pi, optimized_parti
                         continue
                     E_visited_partitions.add(partition_idx_E)
                     mini_scores = []
-                    partition_result_E = optimized_partitions[partition_idx_E]
+                    partition_result_E = scoring_partitions[partition_idx_E]
+                    if partition_result_E is None:
+                        continue
                     for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
-                        # Use pre-computed topology candidates if available, otherwise compute and cache
-                        if hasattr(partition_result_E, 'get_topology_candidates'):
-                            topology_candidates = partition_result_E.get_topology_candidates(tdx)
-                        else:
-                            topology_candidates = get_subtopologies_of_type(self.topology, mini_topology)
+                        topology_candidates = partition_result_E.topology_candidates[tdx]
                         for topology_candidate in topology_candidates:
                             for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
                                 # Create cache key for this candidate's transform_pi result
                                 cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                                 if cache_key not in transform_cache:
                                     new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
-                                    swap_count = len(new_cand.transform_pi(output_perm,D, self._swap_cache)[0])
+                                    swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
                                     transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
                                 mini_scores.append(transform_cache[cache_key])
                     if mini_scores:

From c87cd9256944a4db8eb243b7e04d66006e917012 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 4 Dec 2025 16:37:37 +0100
Subject: [PATCH 045/232] Add permutation tests

---
 squander/gates/gates_Wrapper.cpp | 159 ++++++++--
 tests/gates/test_Permutation.py  | 490 +++++++++++++++++++++++++++++++
 2 files changed, 629 insertions(+), 20 deletions(-)
 create mode 100644 tests/gates/test_Permutation.py

diff --git a/squander/gates/gates_Wrapper.cpp b/squander/gates/gates_Wrapper.cpp
index 4c89f59cd..71a8bf2a1 100644
--- a/squander/gates/gates_Wrapper.cpp
+++ b/squander/gates/gates_Wrapper.cpp
@@ -470,32 +470,89 @@ static PyObject *
         return NULL;
     }
 
-    if (!PyList_Check(pattern_py)) {
-        PyErr_SetString(PyExc_TypeError, "pattern must be a list!");
+    // Convert tuple to list if necessary, or check if it's a list
+    PyObject* pattern_list = NULL;
+    bool created_list = false;
+    if (PyTuple_Check(pattern_py)) {
+        pattern_list = PySequence_List(pattern_py);
+        if (pattern_list == NULL) {
+            PyErr_SetString(PyExc_TypeError, "Failed to convert tuple to list");
+            return NULL;
+        }
+        created_list = true;  // We created it, so we need to DECREF
+    } else if (PyList_Check(pattern_py)) {
+        pattern_list = pattern_py;
+        // We're borrowing the reference, no need to INCREF/DECREF
+    } else {
+        PyErr_SetString(PyExc_TypeError, "pattern must be a list or tuple!");
         return NULL;
     }
 
     std::vector<int> pattern;
-    Py_ssize_t pattern_size = PyList_Size(pattern_py);
+    Py_ssize_t pattern_size = PyList_Size(pattern_list);
+
+    // Check pattern size matches qbit_num
+    if (pattern_size != qbit_num) {
+        if (created_list) {
+            Py_DECREF(pattern_list);
+        }
+        std::string err = "Pattern size " + std::to_string(pattern_size) + 
+                         " does not match qubit number " + std::to_string(qbit_num);
+        PyErr_SetString(PyExc_ValueError, err.c_str());
+        return NULL;
+    }
+
+    // Track which values we've seen to validate it's a permutation
+    std::vector<bool> seen(qbit_num, false);
 
     for (Py_ssize_t i = 0; i < pattern_size; i++) {
-        PyObject* item = PyList_GetItem(pattern_py, i);
+        PyObject* item = PyList_GetItem(pattern_list, i);
         if (!PyLong_Check(item)) {
+            if (created_list) {
+                Py_DECREF(pattern_list);
+            }
             PyErr_SetString(PyExc_TypeError, "pattern must contain integers!");
             return NULL;
         }
         int qbit = PyLong_AsLong(item);
-        if (qbit >= qbit_num) {
-            PyErr_SetString(PyExc_ValueError, "Pattern qubit index out of range!");
+        if (qbit < 0 || qbit >= qbit_num) {
+            if (created_list) {
+                Py_DECREF(pattern_list);
+            }
+            std::string err = "Pattern qubit index " + std::to_string(qbit) + 
+                             " out of range [0, " + std::to_string(qbit_num - 1) + "]";
+            PyErr_SetString(PyExc_ValueError, err.c_str());
+            return NULL;
+        }
+        if (seen[qbit]) {
+            if (created_list) {
+                Py_DECREF(pattern_list);
+            }
+            std::string err = "Pattern contains duplicate value " + std::to_string(qbit);
+            PyErr_SetString(PyExc_ValueError, err.c_str());
             return NULL;
         }
+        seen[qbit] = true;
         pattern.push_back(qbit);
     }
 
+    // Release the pattern_list reference (only if we created it from a tuple)
+    if (created_list) {
+        Py_DECREF(pattern_list);
+    }
+
     Gate_Wrapper *self;
     self = (Gate_Wrapper *) type->tp_alloc(type, 0);
     if (self != NULL) {
-        self->gate = create_permutation_gate(qbit_num, pattern);
+        try {
+            self->gate = create_permutation_gate(qbit_num, pattern);
+        } catch (const std::string& e) {
+            PyErr_SetString(PyExc_ValueError, e.c_str());
+            return NULL;
+        } catch (const std::exception& e) {
+            PyErr_SetString(PyExc_ValueError, e.what());
+            return NULL;
+        }
     }
 
     return (PyObject *) self;
@@ -1320,29 +1377,91 @@ static PyObject * Gate_Wrapper_set_Pattern( Gate_Wrapper *self, PyObject *args )
         PyErr_SetString(PyExc_Exception, err.c_str());
         return NULL;
     }
-    if (!PyList_Check(pattern_py)) {
-        std::string err("Pattern must be a list!");
-        PyErr_SetString(PyExc_Exception, err.c_str());
+    // Convert tuple to list if necessary, or check if it's a list
+    PyObject* pattern_list = NULL;
+    bool created_list = false;
+    if (PyTuple_Check(pattern_py)) {
+        pattern_list = PySequence_List(pattern_py);
+        if (pattern_list == NULL) {
+            PyErr_SetString(PyExc_TypeError, "Failed to convert tuple to list");
+            return NULL;
+        }
+        created_list = true;  // We created it, so we need to DECREF
+    } else if (PyList_Check(pattern_py)) {
+        pattern_list = pattern_py;
+        // We're borrowing the reference, no need to INCREF/DECREF
+    } else {
+        std::string err("Pattern must be a list or tuple!");
+        PyErr_SetString(PyExc_TypeError, err.c_str());
+        return NULL;
+    }
+    
+    // Cast to Permutation* to access pattern methods and get qbit_num
+    Permutation* perm_gate = dynamic_cast<Permutation*>(self->gate);
+    if (perm_gate == nullptr) {
+        if (created_list) {
+            Py_DECREF(pattern_list);
+        }
+        PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate");
         return NULL;
     }
+    
+    int qbit_num = perm_gate->get_qbit_num();
     std::vector<int> pattern;
-    Py_ssize_t pattern_size = PyList_Size(pattern_py);
+    Py_ssize_t pattern_size = PyList_Size(pattern_list);
+    
+    // Check pattern size matches qbit_num
+    if (pattern_size != qbit_num) {
+        if (created_list) {
+            Py_DECREF(pattern_list);
+        }
+        std::string err = "Pattern size " + std::to_string(pattern_size) + 
+                         " does not match qubit number " + std::to_string(qbit_num);
+        PyErr_SetString(PyExc_ValueError, err.c_str());
+        return NULL;
+    }
+    
+    // Track which values we've seen to validate it's a permutation
+    std::vector<bool> seen(qbit_num, false);
+    
     for (Py_ssize_t i = 0; i < pattern_size; i++) {
-        PyObject* item = PyList_GetItem(pattern_py, i);
+        PyObject* item = PyList_GetItem(pattern_list, i);
         if (!PyLong_Check(item)) {
+            if (created_list) {
+                Py_DECREF(pattern_list);
+            }
             std::string err("Pattern must contain integers!");
-            PyErr_SetString(PyExc_Exception, err.c_str());
+            PyErr_SetString(PyExc_TypeError, err.c_str());
             return NULL;
         }
-        pattern.push_back(PyLong_AsLong(item));
-    }
-    try {
-        // Cast to Permutation* to access pattern methods
-        Permutation* perm_gate = dynamic_cast<Permutation*>(self->gate);
-        if (perm_gate == nullptr) {
-            PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate");
+        int qbit = PyLong_AsLong(item);
+        if (qbit < 0 || qbit >= qbit_num) {
+            if (created_list) {
+                Py_DECREF(pattern_list);
+            }
+            std::string err = "Pattern qubit index " + std::to_string(qbit) + 
+                             " out of range [0, " + std::to_string(qbit_num - 1) + "]";
+            PyErr_SetString(PyExc_ValueError, err.c_str());
+            return NULL;
+        }
+        if (seen[qbit]) {
+            if (created_list) {
+                Py_DECREF(pattern_list);
+            }
+            std::string err = "Pattern contains duplicate value " + std::to_string(qbit);
+            PyErr_SetString(PyExc_ValueError, err.c_str());
             return NULL;
         }
+        seen[qbit] = true;
+        pattern.push_back(qbit);
+    }
+    
+    // Release the pattern_list reference (only if we created it from a tuple)
+    if (created_list) {
+        Py_DECREF(pattern_list);
+    }
+    
+    try {
         perm_gate->set_pattern(pattern);
     }
     catch (std::string err) {
diff --git a/tests/gates/test_Permutation.py b/tests/gates/test_Permutation.py
new file mode 100644
index 000000000..4a4ecbb06
--- /dev/null
+++ b/tests/gates/test_Permutation.py
@@ -0,0 +1,490 @@
+'''
+Copyright 2020 Peter Rakyta, Ph.D.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see http://www.gnu.org/licenses/.
+'''
+
+import numpy as np
+import pytest
+from itertools import permutations
+
+from squander.gates.gates_Wrapper import Permutation
+from squander.gates.qgd_Circuit import qgd_Circuit
+
+
+class Test_Permutation:
+    """Test class for Permutation gate"""
+
+    def test_permutation_creation_identity(self):
+        """
+        Test creating identity permutation gates
+        """
+        for qbit_num in range(1, 6):
+            # Identity permutation: [0, 1, 2, ..., n-1]
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            assert perm_gate.get_Parameter_Num() == 0
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_swap(self):
+        """
+        Test creating swap permutation gates
+        """
+        for qbit_num in range(2, 6):
+            # Swap first and last qubits: [n-1, 1, 2, ..., n-2, 0]
+            pattern = list(range(qbit_num))
+            pattern[0], pattern[-1] = pattern[-1], pattern[0]
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_reverse(self):
+        """
+        Test creating reverse permutation gates
+        """
+        for qbit_num in range(1, 6):
+            # Reverse permutation: [n-1, n-2, ..., 1, 0]
+            pattern = list(range(qbit_num))[::-1]
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_random(self):
+        """
+        Test creating random permutation gates
+        """
+        np.random.seed(42)
+        for qbit_num in range(2, 6):
+            # Random permutation
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            pattern_retrieved = perm_gate.get_Pattern()
+            assert pattern_retrieved == pattern
+
+    def test_permutation_creation_invalid_size(self):
+        """
+        Test that creating permutation with wrong pattern size raises error
+        """
+        qbit_num = 3
+        # Pattern too small
+        with pytest.raises(ValueError, match="Pattern size.*does not match"):
+            Permutation(qbit_num, [0, 1])
+        
+        # Pattern too large
+        with pytest.raises(ValueError, match="Pattern size.*does not match"):
+            Permutation(qbit_num, [0, 1, 2, 3])
+
+    def test_permutation_creation_invalid_range(self):
+        """
+        Test that creating permutation with out-of-range indices raises error
+        """
+        qbit_num = 3
+        # Negative index
+        with pytest.raises(ValueError, match="out of range"):
+            Permutation(qbit_num, [-1, 1, 2])
+        
+        # Index too large
+        with pytest.raises(ValueError, match="out of range"):
+            Permutation(qbit_num, [0, 1, 3])
+
+    def test_permutation_creation_duplicates(self):
+        """
+        Test that creating permutation with duplicate values raises error
+        """
+        qbit_num = 3
+        # Duplicate values
+        with pytest.raises(ValueError, match="duplicate"):
+            Permutation(qbit_num, [0, 1, 1])
+        
+        with pytest.raises(ValueError, match="duplicate"):
+            Permutation(qbit_num, [0, 0, 2])
+
+    def test_permutation_creation_invalid_type(self):
+        """
+        Test that creating permutation with invalid type raises error
+        """
+        qbit_num = 3
+        # Tuple should work (converted to list)
+        perm_gate = Permutation(qbit_num, (0, 1, 2))
+        assert perm_gate.get_Pattern() == [0, 1, 2]
+        
+        # Non-integer values
+        with pytest.raises(TypeError, match="pattern must contain integers"):
+            Permutation(qbit_num, [0.0, 1.0, 2.0])
+        
+        with pytest.raises(TypeError, match="pattern must contain integers"):
+            Permutation(qbit_num, ["0", "1", "2"])
+        
+        # Invalid type (not list or tuple)
+        with pytest.raises(TypeError, match="pattern must be a list or tuple"):
+            Permutation(qbit_num, "012")
+
+    def test_permutation_get_pattern(self):
+        """
+        Test getting pattern from permutation gate
+        """
+        for qbit_num in range(1, 5):
+            for pattern_tuple in permutations(range(qbit_num)):
+                pattern = list(pattern_tuple)
+                perm_gate = Permutation(qbit_num, pattern)
+                retrieved_pattern = perm_gate.get_Pattern()
+                assert retrieved_pattern == pattern
+
+    def test_permutation_tuple_conversion(self):
+        """
+        Test that tuples are properly converted to lists
+        """
+        for qbit_num in range(1, 5):
+            for pattern_tuple in permutations(range(qbit_num)):
+                # Create with tuple
+                perm_gate = Permutation(qbit_num, pattern_tuple)
+                retrieved_pattern = perm_gate.get_Pattern()
+                # Should return as list
+                assert retrieved_pattern == list(pattern_tuple)
+                assert isinstance(retrieved_pattern, list)
+                
+                # Set with tuple
+                perm_gate.set_Pattern(pattern_tuple)
+                retrieved_pattern = perm_gate.get_Pattern()
+                assert retrieved_pattern == list(pattern_tuple)
+                assert isinstance(retrieved_pattern, list)
+
+    def test_permutation_set_pattern(self):
+        """
+        Test setting pattern on permutation gate
+        """
+        qbit_num = 4
+        initial_pattern = [0, 1, 2, 3]
+        perm_gate = Permutation(qbit_num, initial_pattern)
+        
+        # Set new pattern
+        new_pattern = [3, 2, 1, 0]
+        perm_gate.set_Pattern(new_pattern)
+        assert perm_gate.get_Pattern() == new_pattern
+        
+        # Set another pattern
+        another_pattern = [1, 0, 3, 2]
+        perm_gate.set_Pattern(another_pattern)
+        assert perm_gate.get_Pattern() == another_pattern
+
+    def test_permutation_set_pattern_invalid(self):
+        """
+        Test that setting invalid pattern raises error
+        """
+        qbit_num = 3
+        perm_gate = Permutation(qbit_num, [0, 1, 2])
+        
+        # Wrong size
+        with pytest.raises(ValueError, match="Pattern size.*does not match"):
+            perm_gate.set_Pattern([0, 1])
+        
+        # Out of range
+        with pytest.raises(ValueError, match="out of range"):
+            perm_gate.set_Pattern([0, 1, 3])
+        
+        # Duplicates
+        with pytest.raises(ValueError, match="duplicate"):
+            perm_gate.set_Pattern([0, 1, 1])
+        
+        # Invalid type (not list or tuple)
+        with pytest.raises(TypeError, match="Pattern must be a list or tuple"):
+            perm_gate.set_Pattern("012")
+        
+        # Tuple should work (converted to list)
+        perm_gate.set_Pattern((0, 1, 2))
+        assert perm_gate.get_Pattern() == [0, 1, 2]
+        
+        # Tuple with different pattern
+        perm_gate.set_Pattern((2, 0, 1))
+        assert perm_gate.get_Pattern() == [2, 0, 1]
+
+    def test_permutation_get_matrix_identity(self):
+        """
+        Test that identity permutation gives identity matrix
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            matrix = perm_gate.get_Matrix()
+            
+            expected = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(matrix - expected)
+            assert error < 1e-10, f"Identity permutation failed for {qbit_num} qubits"
+
+    def test_permutation_get_matrix_swap(self):
+        """
+        Test permutation matrix for swap operation
+        """
+        qbit_num = 2
+        # Swap qubits: [1, 0]
+        pattern = [1, 0]
+        perm_gate = Permutation(qbit_num, pattern)
+        matrix = perm_gate.get_Matrix()
+        
+        # For 2 qubits, swap should exchange |01> and |10>
+        # Identity: |00> -> |00>, |01> -> |01>, |10> -> |10>, |11> -> |11>
+        # Swap:     |00> -> |00>, |01> -> |10>, |10> -> |01>, |11> -> |11>
+        expected = np.array([
+            [1, 0, 0, 0],
+            [0, 0, 1, 0],
+            [0, 1, 0, 0],
+            [0, 0, 0, 1]
+        ], dtype=np.complex128)
+        
+        error = np.linalg.norm(matrix - expected)
+        assert error < 1e-10, "Swap permutation matrix incorrect"
+
+    def test_permutation_get_matrix_unitary(self):
+        """
+        Test that permutation matrices are unitary
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)
+            perm_gate = Permutation(qbit_num, pattern)
+            matrix = perm_gate.get_Matrix()
+            
+            # Check unitarity: U @ U^dagger = I
+            unitary_check = matrix @ matrix.conj().T
+            identity = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(unitary_check - identity)
+            assert error < 1e-10, f"Matrix not unitary for pattern {pattern}"
+
+    def test_permutation_apply_to_identity(self):
+        """
+        Test applying identity permutation to a state
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            # Create random state
+            matrix_size = 2**qbit_num
+            state = np.random.rand(matrix_size) + 1j * np.random.rand(matrix_size)
+            state = state / np.linalg.norm(state)
+            
+            state_copy = state.copy()
+            perm_gate.apply_to(state_copy)
+            
+            # Identity should not change the state
+            error = np.linalg.norm(state_copy - state)
+            assert error < 1e-10, "Identity permutation changed state"
+
+    def test_permutation_apply_to_swap(self):
+        """
+        Test applying swap permutation to a state
+        """
+        qbit_num = 2
+        pattern = [1, 0]  # Swap qubits
+        perm_gate = Permutation(qbit_num, pattern)
+        
+        # Create test state |01> = [0, 1, 0, 0]
+        state = np.array([0, 1, 0, 0], dtype=np.complex128)
+        perm_gate.apply_to(state)
+        
+        # After swap, should be |10> = [0, 0, 1, 0]
+        expected = np.array([0, 0, 1, 0], dtype=np.complex128)
+        error = np.linalg.norm(state - expected)
+        assert error < 1e-10, "Swap permutation incorrect"
+
+    def test_permutation_apply_to_matrix(self):
+        """
+        Test applying permutation to a matrix
+        """
+        qbit_num = 3
+        pattern = [2, 0, 1]  # Rotate: 0->2, 1->0, 2->1
+        perm_gate = Permutation(qbit_num, pattern)
+        
+        # Create test matrix
+        matrix_size = 2**qbit_num
+        test_matrix = np.random.rand(matrix_size, matrix_size) + 1j * np.random.rand(matrix_size, matrix_size)
+        test_matrix = test_matrix / np.linalg.norm(test_matrix)
+        
+        # Apply permutation
+        test_matrix_copy = test_matrix.copy()
+        perm_gate.apply_to(test_matrix_copy)
+        
+        # Check that it's different (unless it's identity)
+        if pattern != list(range(qbit_num)):
+            assert not np.allclose(test_matrix_copy, test_matrix), "Permutation should change matrix"
+
+    def test_permutation_composition(self):
+        """
+        Test that applying two permutations is equivalent to their composition
+        """
+        qbit_num = 3
+        pattern1 = [1, 2, 0]  # Rotate left
+        pattern2 = [2, 0, 1]  # Rotate right
+        
+        perm1 = Permutation(qbit_num, pattern1)
+        perm2 = Permutation(qbit_num, pattern2)
+        
+        # Compose patterns: pattern2(pattern1(x))
+        composed_pattern = [pattern2[pattern1[i]] for i in range(qbit_num)]
+        perm_composed = Permutation(qbit_num, composed_pattern)
+        
+        # Create test state
+        matrix_size = 2**qbit_num
+        state = np.random.rand(matrix_size) + 1j * np.random.rand(matrix_size)
+        state = state / np.linalg.norm(state)
+        
+        # Apply sequentially
+        state_seq = state.copy()
+        perm1.apply_to(state_seq)
+        perm2.apply_to(state_seq)
+        
+        # Apply composed
+        state_comp = state.copy()
+        perm_composed.apply_to(state_comp)
+        
+        error = np.linalg.norm(state_seq - state_comp)
+        assert error < 1e-10, "Composition of permutations incorrect"
+
+    def test_permutation_inverse(self):
+        """
+        Test that applying permutation and its inverse gives identity
+        """
+        for qbit_num in range(2, 5):
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)
+            
+            # Compute inverse permutation
+            inverse_pattern = [0] * qbit_num
+            for i in range(qbit_num):
+                inverse_pattern[pattern[i]] = i
+            
+            perm = Permutation(qbit_num, pattern)
+            perm_inv = Permutation(qbit_num, inverse_pattern)
+            
+            # Create test state
+            matrix_size = 2**qbit_num
+            state = np.random.rand(matrix_size) + 1j * np.random.rand(matrix_size)
+            state = state / np.linalg.norm(state)
+            
+            # Apply permutation then inverse
+            state_transformed = state.copy()
+            perm.apply_to(state_transformed)
+            perm_inv.apply_to(state_transformed)
+            
+            error = np.linalg.norm(state_transformed - state)
+            assert error < 1e-10, f"Inverse permutation failed for pattern {pattern}"
+
+    def test_permutation_circuit_integration(self):
+        """
+        Test adding permutation gate to circuit
+        """
+        qbit_num = 3
+        pattern = [2, 0, 1]
+        
+        circuit = qgd_Circuit(qbit_num)
+        circuit.add_Permutation(pattern)
+        
+        gates = circuit.get_Gates()
+        assert len(gates) == 1
+        
+        gate = gates[0]
+        assert gate.get_Name() == "Permutation"
+        retrieved_pattern = gate.get_Pattern()
+        assert retrieved_pattern == pattern
+
+    def test_permutation_circuit_multiple(self):
+        """
+        Test adding multiple permutation gates to circuit
+        """
+        qbit_num = 3
+        
+        circuit = qgd_Circuit(qbit_num)
+        pattern1 = [1, 2, 0]
+        pattern2 = [2, 0, 1]
+        
+        circuit.add_Permutation(pattern1)
+        circuit.add_Permutation(pattern2)
+        
+        gates = circuit.get_Gates()
+        assert len(gates) == 2
+        
+        assert gates[0].get_Pattern() == pattern1
+        assert gates[1].get_Pattern() == pattern2
+
+    def test_permutation_get_involved_qubits(self):
+        """
+        Test getting involved qubits from permutation gate
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            involved_qbits = perm_gate.get_Involved_Qbits()
+            # Permutation gate involves all qubits
+            assert involved_qbits == list(range(qbit_num))
+
+    def test_permutation_get_target_qubits(self):
+        """
+        Test getting target qubits from permutation gate
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            target_qbits = perm_gate.get_Target_Qbits()
+            # Permutation gate targets all qubits
+            assert target_qbits == list(range(qbit_num))
+
+    def test_permutation_get_control_qubits(self):
+        """
+        Test getting control qubits from permutation gate (should be empty)
+        """
+        for qbit_num in range(1, 5):
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            control_qbits = perm_gate.get_Control_Qbits()
+            # Permutation gate has no control qubits
+            assert control_qbits == []
+
+    def test_permutation_large_patterns(self):
+        """
+        Test permutation gates with larger numbers of qubits
+        """
+        for qbit_num in [5, 6, 7]:
+            # Test identity
+            pattern = list(range(qbit_num))
+            perm_gate = Permutation(qbit_num, pattern)
+            matrix = perm_gate.get_Matrix()
+            
+            expected = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(matrix - expected)
+            assert error < 1e-10, f"Large identity permutation failed for {qbit_num} qubits"
+            
+            # Test random permutation
+            np.random.seed(42)
+            pattern = list(range(qbit_num))
+            np.random.shuffle(pattern)
+            perm_gate = Permutation(qbit_num, pattern)
+            
+            # Check unitarity
+            matrix = perm_gate.get_Matrix()
+            unitary_check = matrix @ matrix.conj().T
+            identity = np.eye(2**qbit_num, dtype=np.complex128)
+            error = np.linalg.norm(unitary_check - identity)
+            assert error < 1e-10, f"Large permutation not unitary for {qbit_num} qubits"
+

From e10c28fac1245ae9044ef15441ac8cac748319de Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 4 Dec 2025 17:28:23 +0100
Subject: [PATCH 046/232] rework cost calculation

---
 squander/synthesis/PartAM.py       | 64 +++++++++++++++++++-----------
 squander/synthesis/PartAM_utils.py |  7 ++++
 2 files changed, 47 insertions(+), 24 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 988b43164..0f52f87cb 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -41,6 +41,7 @@
     PartitionCandidate,
     check_circuit_compatibility,
     construct_swap_circuit,
+    calculate_dist_small
 )
 
 
@@ -223,6 +224,30 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
             result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
         return result
 
+    @staticmethod
+    def DecomposePartition_Full(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
+        """
+        Call to decompose a partition sequentially
+        """
+        N = Partition_circuit.get_Qbit_Num()
+        if N !=1:
+            permutations_all = list(permutations(range(N)))
+            result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
+            # Sequential permutation search
+            for topology_idx in range(len(topologies)):
+                mini_topology = topologies[topology_idx]
+                for P_i in permutations_all:
+                    for P_o in permutations_all:
+                        Partition_circuit_tmp = Circuit(N)
+                        Partition_circuit_tmp.add_Permutation(list(P_i))  # Must convert tuple to list
+                        Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                        Partition_circuit_tmp.add_Permutation(list(P_o))  # Must convert tuple to list
+                        synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                        result.add_result((P_i, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
+        else:
+            result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
+        return result
+
     @staticmethod
     def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
         """
@@ -331,7 +356,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
-                optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+                optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Full, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
@@ -348,6 +373,8 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
             if isinstance(partition, PartitionSynthesisResult):
                 partition._topology = self.topology
                 partition._topology_cache = self._topology_cache
+
+                print(partition.cnot_counts,partition.involved_qbits)
         
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
         sDAG = self.construct_sDAG(optimized_partitions)
@@ -360,7 +387,6 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         scoring_partitions = self._build_scoring_partitions(optimized_partitions)
         
         partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,scoring_partitions,D, sDAG)
-        pi_final_list = pi_final.tolist() if hasattr(pi_final, 'tolist') else list(pi_final)
         
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
         
@@ -510,16 +536,10 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                 if partition_result is None:
                     continue
                 for tdx, mini_topology in enumerate(partition_result.mini_topologies):
-                    topology_candidates = partition_result.topology_candidates[tdx]
-                    for topology_candidate in topology_candidates:
-                        for pdx, permutation_pair in enumerate(partition_result.permutations_pairs[tdx]):
-                            # Create cache key for this candidate's transform_pi result
-                            cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                            if cache_key not in transform_cache:
-                                new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition_result.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result.qubit_map,partition_result.involved_qbits)
-                                swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                                transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
-                            mini_scores.append(transform_cache[cache_key])
+                    dist_placeholder = calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
+                    circuit_length = min([len(circ) for circ in partition_result.circuit_structures[tdx]])
+                    score = dist_placeholder + circuit_length
+                    mini_scores.append(score)
                 if mini_scores:
                     score_E += min(mini_scores)
 
@@ -553,28 +573,24 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                     if partition_result_E is None:
                         continue
                     for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
-                        topology_candidates = partition_result_E.topology_candidates[tdx]
-                        for topology_candidate in topology_candidates:
-                            for pdx, permutation_pair in enumerate(partition_result_E.permutations_pairs[tdx]):
-                                # Create cache key for this candidate's transform_pi result
-                                cache_key = (partition_idx_E, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                                if cache_key not in transform_cache:
-                                    new_cand = PartitionCandidate(partition_idx_E,tdx,pdx,partition_result_E.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition_result_E.qubit_map,partition_result_E.involved_qbits)
-                                    swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                                    transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
-                                mini_scores.append(transform_cache[cache_key])
+                        dist_placeholder = calculate_dist_small(mini_topology,partition_result_E.qubit_map,D,output_perm)
+                        circuit_length = min([len(circ) for circ in partition_result_E.circuit_structures[tdx]])
+                        score = dist_placeholder + circuit_length
+                        mini_scores.append(score)
                     if mini_scores:
                         score_E += min(mini_scores)
         # Safety check for division by zero
+        coeff_F = 0.6
         if len(E_visited_partitions) == 0:
             E_score = 0.0
+            coeff_F = 1.
         else:
-            E_score = 0.2 * score_E / len(E_visited_partitions)
+            E_score = (1-coeff_F) * score_E / len(E_visited_partitions)
         
         if len(F) == 0:
             F_score = 0.0
         else:
-            F_score = score_F / len(F)
+            F_score = coeff_F*score_F / len(F)
         
         return E_score + F_score
 
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index cb0a017cf..801cce7d1 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -213,6 +213,13 @@ def heuristic(state):
     
     return None, None  # No solution found
 
+def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
+    dist_placeholder = 0
+    qbit_map_inv = { k:v for v,k in qbit_map.items()}
+    for u,v in mini_topology:
+        dist_placeholder += dist_matrix[pi[qbit_map_inv[u]]][pi[qbit_map_inv[v]]]-3
+    return dist_placeholder
+
 def extract_subtopology(involved_qbits, qbit_map, config ):
     mini_topology = []
     for edge in config["topology"]:

From 0d04d13bc0790a371520fb74a2dbcdeeddd4ca96 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 5 Dec 2025 12:59:16 +0100
Subject: [PATCH 047/232] Add less dependence for initial layout

---
 examples/decomposition/PartAM_example.py |  1 +
 squander/synthesis/PartAM.py             | 68 ++++++++++++++++++------
 squander/synthesis/PartAM_utils.py       |  2 +-
 3 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 7a560f802..cffdda844 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -77,5 +77,6 @@
     print(f"Decomposition error on random state: {state_error:.10f}")
     print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
     print(f"{'='*70}\n")
+    print(circ_Final.get_Gate_Nums())
 
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 0f52f87cb..2cd726e24 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -61,15 +61,51 @@ class PartitionScoreData:
 _WORKER_S_DAG: Optional[List[List[int]]] = None
 _WORKER_DISTANCE_MATRIX: Optional[np.ndarray] = None
 _WORKER_SWAP_CACHE: Optional[Dict] = None
+_WORKER_PI_INITIAL: Optional[Tuple[int, ...]] = None
 
 
-def _init_scoring_worker(scoring_partitions, sdag, distance_matrix):
+def _init_scoring_worker(scoring_partitions, sdag, distance_matrix, pi_initial):
     """Initializer for process-based scoring workers."""
-    global _WORKER_SCORING_PARTITIONS, _WORKER_S_DAG, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE
+    global _WORKER_SCORING_PARTITIONS, _WORKER_S_DAG, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE, _WORKER_PI_INITIAL
     _WORKER_SCORING_PARTITIONS = scoring_partitions
     _WORKER_S_DAG = sdag
     _WORKER_DISTANCE_MATRIX = distance_matrix
     _WORKER_SWAP_CACHE = {}
+    _WORKER_PI_INITIAL = pi_initial
+
+
+def _calculate_swap_cost(swaps, current_pi, pi_initial):
+    """
+    Calculate swap cost with discount for swaps where both qubits are at initial positions.
+    """
+    cost = 0
+    temp_pi = list(current_pi)
+    # Build inverse map for O(1) lookup: physical -> logical
+    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
+
+    for p1, p2 in swaps:
+        l1 = phys_to_logical[p1]
+        l2 = phys_to_logical[p2]
+
+        # Check if both logical qubits are at their initial positions
+        # Note: pi_initial maps logical -> physical
+        is_l1_initial = (temp_pi[l1] == pi_initial[l1])
+        is_l2_initial = (temp_pi[l2] == pi_initial[l2])
+
+        if is_l1_initial and is_l2_initial:
+            step_cost = 0
+        else:
+            step_cost = 3
+
+        cost += step_cost
+
+        # Update state
+        temp_pi[l1] = p2
+        temp_pi[l2] = p1
+        phys_to_logical[p1] = l2
+        phys_to_logical[p2] = l1
+
+    return cost
 
 
 def _score_candidate_worker(payload):
@@ -81,6 +117,7 @@ def _score_candidate_worker(payload):
         _WORKER_SCORING_PARTITIONS is None
         or _WORKER_S_DAG is None
         or _WORKER_DISTANCE_MATRIX is None
+        or _WORKER_PI_INITIAL is None
     ):
         raise RuntimeError("Scoring worker not initialized with shared data.")
     partition_candidate, F_snapshot, pi_snapshot = payload
@@ -92,6 +129,7 @@ def _score_candidate_worker(payload):
         _WORKER_S_DAG,
         _WORKER_DISTANCE_MATRIX,
         _WORKER_SWAP_CACHE,
+        _WORKER_PI_INITIAL,
     )
 
 class qgd_Partition_Aware_Mapping:
@@ -374,7 +412,6 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 partition._topology = self.topology
                 partition._topology_cache = self._topology_cache
 
-                print(partition.cnot_counts,partition.involved_qbits)
         
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
         sDAG = self.construct_sDAG(optimized_partitions)
@@ -393,6 +430,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         return final_circuit, final_parameters, pi, pi_final
 
     def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, sDAG):
+        pi_initial = pi.copy()
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
@@ -412,7 +450,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 executor = ProcessPoolExecutor(
                     max_workers=score_workers,
                     initializer=_init_scoring_worker,
-                    initargs=(scoring_partitions, sDAG, D),
+                    initargs=(scoring_partitions, sDAG, D, pi_initial),
                 )
             except Exception as exc:
                 logging.warning(
@@ -444,6 +482,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             sDAG,
                             D,
                             self._swap_cache,
+                            pi_initial,
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -513,12 +552,12 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         return final_circuit, final_parameters
     
     @staticmethod
-    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache):
+    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, pi_initial):
         score_F = 0
         score_E = 0
         E_visited_partitions = set()  # Changed to set for O(1) membership checks
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
-        score_F += len(swaps)*3
+        score_F += _calculate_swap_cost(swaps, pi, pi_initial)
         score_F += len(partition_candidate.circuit_structure)
 
         # Cache for transform_pi results to avoid redundant computation
@@ -545,7 +584,7 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
 
         for partition_idx in F:
             partition = scoring_partitions[partition_idx]
-            if partition is None:
+            if partition is None or partition_idx == partition_candidate.partition_idx:
                 continue
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
@@ -556,8 +595,9 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                         cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
                         if cache_key not in transform_cache:
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                            swap_count = len(new_cand.transform_pi(output_perm,D, swap_cache)[0])
-                            transform_cache[cache_key] = swap_count * 3 + len(new_cand.circuit_structure)
+                            swaps_next, transformed_pi = new_cand.transform_pi(output_perm,D, swap_cache)
+                            swap_cost = _calculate_swap_cost(swaps_next, transformed_pi, pi_initial)
+                            transform_cache[cache_key] = swap_cost + len(new_cand.circuit_structure)
                         mini_scores.append(transform_cache[cache_key])
             if mini_scores:
                 score_F += min(mini_scores)
@@ -580,17 +620,13 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                     if mini_scores:
                         score_E += min(mini_scores)
         # Safety check for division by zero
-        coeff_F = 0.6
+        coeff_E = 0.5
         if len(E_visited_partitions) == 0:
             E_score = 0.0
-            coeff_F = 1.
         else:
-            E_score = (1-coeff_F) * score_E / len(E_visited_partitions)
+            E_score = coeff_E * score_E / len(E_visited_partitions)
         
-        if len(F) == 0:
-            F_score = 0.0
-        else:
-            F_score = coeff_F*score_F / len(F)
+        F_score = score_F / len(F)
         
         return E_score + F_score
 
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 801cce7d1..3d9f76bc7 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -300,7 +300,7 @@ def get_partition_synthesis_score(self):
         for topology_idx in range(self.topology_count):
             cnot_count_topology = np.min(self.cnot_counts[topology_idx])#np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
             score += cnot_count_topology/self.topology_count
-        return score
+        return score/len(self.involved_qbits)
     
     def get_topology_candidates(self, topology_idx):
         """

From d54d577e5a5a5e6fd2338c165f4c08b97fdf7ba1 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 5 Dec 2025 13:04:20 +0100
Subject: [PATCH 048/232] Revert scoring mistake

---
 squander/synthesis/PartAM_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 3d9f76bc7..801cce7d1 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -300,7 +300,7 @@ def get_partition_synthesis_score(self):
         for topology_idx in range(self.topology_count):
             cnot_count_topology = np.min(self.cnot_counts[topology_idx])#np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
             score += cnot_count_topology/self.topology_count
-        return score/len(self.involved_qbits)
+        return score
     
     def get_topology_candidates(self, topology_idx):
         """

From 488c98995d267e497285d36f1b855566915b131c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 5 Dec 2025 14:06:45 +0100
Subject: [PATCH 049/232] Rework used qubits feature

---
 squander/synthesis/PartAM.py       |  84 +++++++++---------------
 squander/synthesis/PartAM_utils.py | 102 ++++++++++++++++++++++++++++-
 2 files changed, 132 insertions(+), 54 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2cd726e24..5b61c9a90 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -41,7 +41,9 @@
     PartitionCandidate,
     check_circuit_compatibility,
     construct_swap_circuit,
-    calculate_dist_small
+    calculate_dist_small,
+    calculate_swap_cost,
+    filter_required_swaps
 )
 
 
@@ -61,66 +63,29 @@ class PartitionScoreData:
 _WORKER_S_DAG: Optional[List[List[int]]] = None
 _WORKER_DISTANCE_MATRIX: Optional[np.ndarray] = None
 _WORKER_SWAP_CACHE: Optional[Dict] = None
-_WORKER_PI_INITIAL: Optional[Tuple[int, ...]] = None
 
 
-def _init_scoring_worker(scoring_partitions, sdag, distance_matrix, pi_initial):
+def _init_scoring_worker(scoring_partitions, sdag, distance_matrix):
     """Initializer for process-based scoring workers."""
-    global _WORKER_SCORING_PARTITIONS, _WORKER_S_DAG, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE, _WORKER_PI_INITIAL
+    global _WORKER_SCORING_PARTITIONS, _WORKER_S_DAG, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE
     _WORKER_SCORING_PARTITIONS = scoring_partitions
     _WORKER_S_DAG = sdag
     _WORKER_DISTANCE_MATRIX = distance_matrix
     _WORKER_SWAP_CACHE = {}
-    _WORKER_PI_INITIAL = pi_initial
-
-
-def _calculate_swap_cost(swaps, current_pi, pi_initial):
-    """
-    Calculate swap cost with discount for swaps where both qubits are at initial positions.
-    """
-    cost = 0
-    temp_pi = list(current_pi)
-    # Build inverse map for O(1) lookup: physical -> logical
-    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
-
-    for p1, p2 in swaps:
-        l1 = phys_to_logical[p1]
-        l2 = phys_to_logical[p2]
-
-        # Check if both logical qubits are at their initial positions
-        # Note: pi_initial maps logical -> physical
-        is_l1_initial = (temp_pi[l1] == pi_initial[l1])
-        is_l2_initial = (temp_pi[l2] == pi_initial[l2])
-
-        if is_l1_initial and is_l2_initial:
-            step_cost = 0
-        else:
-            step_cost = 3
-
-        cost += step_cost
-
-        # Update state
-        temp_pi[l1] = p2
-        temp_pi[l2] = p1
-        phys_to_logical[p1] = l2
-        phys_to_logical[p2] = l1
-
-    return cost
 
 
 def _score_candidate_worker(payload):
     """
     Worker wrapper that reconstructs scoring inputs from a lightweight payload.
-    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot)
+    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot, used_qubits)
     """
     if (
         _WORKER_SCORING_PARTITIONS is None
         or _WORKER_S_DAG is None
         or _WORKER_DISTANCE_MATRIX is None
-        or _WORKER_PI_INITIAL is None
     ):
         raise RuntimeError("Scoring worker not initialized with shared data.")
-    partition_candidate, F_snapshot, pi_snapshot = payload
+    partition_candidate, F_snapshot, pi_snapshot, used_qubits = payload
     return qgd_Partition_Aware_Mapping.score_partition_candidate(
         partition_candidate,
         F_snapshot,
@@ -129,7 +94,7 @@ def _score_candidate_worker(payload):
         _WORKER_S_DAG,
         _WORKER_DISTANCE_MATRIX,
         _WORKER_SWAP_CACHE,
-        _WORKER_PI_INITIAL,
+        used_qubits,
     )
 
 class qgd_Partition_Aware_Mapping:
@@ -423,14 +388,15 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
         scoring_partitions = self._build_scoring_partitions(optimized_partitions)
         
-        partition_order, pi_final = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,scoring_partitions,D, sDAG)
+        partition_order, pi, pi_initial = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,scoring_partitions,D, sDAG)
         
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
         
-        return final_circuit, final_parameters, pi, pi_final
+        return final_circuit, final_parameters, pi_initial, pi
 
     def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, sDAG):
         pi_initial = pi.copy()
+        used_qubits = set()
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
@@ -450,7 +416,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 executor = ProcessPoolExecutor(
                     max_workers=score_workers,
                     initializer=_init_scoring_worker,
-                    initargs=(scoring_partitions, sDAG, D, pi_initial),
+                    initargs=(scoring_partitions, sDAG, D),
                 )
             except Exception as exc:
                 logging.warning(
@@ -468,7 +434,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 if executor is not None:
                     pi_snapshot = tuple(int(x) for x in pi)
                     payloads = [
-                        (partition_candidate, F_snapshot, pi_snapshot)
+                        (partition_candidate, F_snapshot, pi_snapshot, used_qubits)
                         for partition_candidate in partition_candidates
                     ]
                     scores = list(executor.map(_score_candidate_worker, payloads))
@@ -482,7 +448,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             sDAG,
                             D,
                             self._swap_cache,
-                            pi_initial,
+                            used_qubits,
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -494,9 +460,15 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 resolved_count = sum(resolved_partitions)
                 pbar.n = resolved_count
                 pbar.refresh()
+                pi_prev = pi # Save previous pi state for filtering
                 swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
                 if len(swap_order)!=0:
-                    partition_order.append(construct_swap_circuit(swap_order, len(pi)))
+                    filtered_swap_order, pi_initial = filter_required_swaps(swap_order, pi_prev, pi_initial, used_qubits)
+                    partition_order.append(construct_swap_circuit(filtered_swap_order, len(pi)))
+                
+                # Add involved qubits to used set
+                used_qubits.update(min_partition_candidate.involved_qbits)
+                
                 partition_order.append(min_partition_candidate)
                 children = DAG[min_partition_candidate.partition_idx]
                 step += 1
@@ -511,6 +483,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             qubit = child_partition.circuit.get_Qbits()[0]
                             child_partition.circuit.map_circuit({qubit: pi[qubit]})
                             partition_order.append(child_partition)
+                            # Update used qubits for single qubit partition
+                            used_qubits.add(qubit) 
+                            
                             resolved_partitions[child] = True
                             resolved_count = sum(resolved_partitions)
                             pbar.n = resolved_count
@@ -522,7 +497,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
             if executor is not None:
                 executor.shutdown()
         pbar.close()
-        return partition_order, pi
+        return partition_order, pi, pi_initial
 
     def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         final_circuit = Circuit(N)
@@ -552,17 +527,20 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         return final_circuit, final_parameters
     
     @staticmethod
-    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, pi_initial):
+    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, used_qubits):
         score_F = 0
         score_E = 0
         E_visited_partitions = set()  # Changed to set for O(1) membership checks
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
-        score_F += _calculate_swap_cost(swaps, pi, pi_initial)
+        score_F += calculate_swap_cost(swaps, pi, used_qubits)
         score_F += len(partition_candidate.circuit_structure)
 
         # Cache for transform_pi results to avoid redundant computation
         # Key: (partition_idx, topology_idx, permutation_idx, topology_candidate_tuple, output_perm_tuple)
         transform_cache = {}
+        
+        # Qubits used by current partition are now 'used' for lookahead
+        next_used_qubits = used_qubits.union(partition_candidate.involved_qbits)
 
         # Safety check: ensure partition_idx is valid for sDAG
         if partition_candidate.partition_idx < len(sDAG):
@@ -596,7 +574,7 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                         if cache_key not in transform_cache:
                             new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
                             swaps_next, transformed_pi = new_cand.transform_pi(output_perm,D, swap_cache)
-                            swap_cost = _calculate_swap_cost(swaps_next, transformed_pi, pi_initial)
+                            swap_cost = calculate_swap_cost(swaps_next, transformed_pi, next_used_qubits) # Use next_used_qubits
                             transform_cache[cache_key] = swap_cost + len(new_cand.circuit_structure)
                         mini_scores.append(transform_cache[cache_key])
             if mini_scores:
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 801cce7d1..59ad6c04e 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -226,7 +226,28 @@ def extract_subtopology(involved_qbits, qbit_map, config ):
         if edge[0] in involved_qbits and edge[1] in involved_qbits:
             mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
     return mini_topology
+def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
+    P_i_inv = [P_i.index(i) for i in range(len(P_i))]  # Compute inverse
+    qbit_map_input = {k : node_mapping[P_i_inv[v]] for k,v in qbit_map.items()}
+    # Convert pi to plain Python list of ints (may contain np.int64)
+    pi_list = [int(x) for x in pi]
 
+    # Check cache if provided
+    cache_key = None
+    if swap_cache is not None:
+        # Create cache key: (pi_tuple, frozenset of qbit_map_input items)
+        pi_tuple = tuple(pi_list)
+        qbit_map_frozen = frozenset(qbit_map_input.items())
+        cache_key = (pi_tuple, qbit_map_frozen)
+        cache_key = (pi_tuple, qbit_map_frozen)
+        if cache_key in swap_cache:
+            swaps, pi_init = swap_cache[cache_key]
+        else:
+            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+            swap_cache[cache_key] = (swaps, pi_init)
+    else:
+        swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+    return len(swaps)
 class SingleQubitPartitionResult:
     
     def __init__(self,circuit_in,parameters_in):
@@ -336,6 +357,8 @@ def get_topology_candidates(self, topology_idx):
                     self._topology_candidates[topology_idx] = []
         return self._topology_candidates[topology_idx]
 
+
+
 class PartitionCandidate:
     
     def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits):
@@ -432,10 +455,87 @@ def check_circuit_compatibility(circuit: Circuit, topology):
             return False
     return True
 
+def calculate_swap_cost(swaps, current_pi, used_qubits):
+    """
+    Calculate swap cost. Swaps involving unused qubits are costless (0).
+    unused qubits are those not in used_qubits set.
+    """
+    cost = 0
+    temp_pi = list(current_pi)
+    # Build inverse map for O(1) lookup: physical -> logical
+    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
+
+    for p1, p2 in swaps:
+        l1 = phys_to_logical[p1]
+        l2 = phys_to_logical[p2]
+
+        is_l1_unused = (l1 not in used_qubits)
+        is_l2_unused = (l2 not in used_qubits)
+
+        if is_l1_unused and is_l2_unused:
+            step_cost = 0
+        else:
+            step_cost = 3
+
+        cost += step_cost
+
+        # Update state
+        temp_pi[l1] = p2
+        temp_pi[l2] = p1
+        phys_to_logical[p1] = l2
+        phys_to_logical[p2] = l1
+
+    return cost
+
+def filter_required_swaps(swaps, current_pi, pi_initial, used_qubits):
+    """
+    Filter swaps that are effectively 'costless' (involve unused qubits).
+    Returns filtered swaps and the updated pi_initial.
+    """
+    required_swaps = []
+    temp_pi = list(current_pi)
+    
+    # pi_initial might be numpy array, convert to list for mutation if needed, 
+    # but we'll return a new list/array to be safe.
+    updated_pi_initial = list(pi_initial)
+    
+    # Build inverse map for O(1) lookup: physical -> logical
+    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
+
+    for p1, p2 in swaps:
+        l1 = phys_to_logical[p1]
+        l2 = phys_to_logical[p2]
+
+        is_l1_unused = (l1 not in used_qubits)
+        is_l2_unused = (l2 not in used_qubits)
+
+        if not (is_l1_unused and is_l2_unused):
+            required_swaps.append((p1, p2))
+        else:
+            # If unused, we update the initial mapping to reflect this swap
+            # effectively retconning that they started in these positions.
+            # pi_initial maps logical -> physical.
+            # We swap the physical locations for these logical qubits.
+            # Note: updated_pi_initial[l1] should track where l1 'started'.
+            # If we swap l1 and l2 physically, and it's costless, 
+            # it means l1 is now 'initially' at p2, and l2 at p1.
+            # But wait, temp_pi[l1] is currently p1. After swap it is p2.
+            # So we update pi_initial to match the new temp_pi.
+            updated_pi_initial[l1] = p2
+            updated_pi_initial[l2] = p1
+
+        # Always update the tracking state
+        temp_pi[l1] = p2
+        temp_pi[l2] = p1
+        phys_to_logical[p1] = l2
+        phys_to_logical[p2] = l1
+
+    return required_swaps, updated_pi_initial
+
 def construct_swap_circuit(swap_order, N):
     swap_circ = Circuit(N)
     for swap in swap_order:
         swap_circ.add_CNOT(swap[0],swap[1])
         swap_circ.add_CNOT(swap[1],swap[0])
         swap_circ.add_CNOT(swap[0],swap[1])
-    return swap_circ
\ No newline at end of file
+    return swap_circ

From fabba27dc1f0a45ede0cd185956d8404cba42cfe Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 5 Dec 2025 15:34:53 +0100
Subject: [PATCH 050/232] Add error handling and circuit check

---
 .../qgd_N_Qubit_Decompositions_Wrapper.cpp    | 16 +++++++++-
 squander/gates/gates_Wrapper.cpp              | 30 +++++++++++++++++--
 squander/gates/qgd_Circuit_Wrapper.cpp        | 18 +++++++++--
 squander/synthesis/PartAM.py                  |  3 +-
 squander/synthesis/PartAM_utils.py            |  1 +
 5 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp b/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
index 12f3b8f83..7d54036a6 100644
--- a/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
+++ b/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
@@ -1145,7 +1145,21 @@ qgd_N_Qubit_Decomposition_Wrapper_get_Matrix(qgd_N_Qubit_Decomposition_Wrapper *
 
     Matrix unitary_mtx;
 
-    unitary_mtx = self->decomp->get_matrix(parameters_mtx);
+    try {
+        unitary_mtx = self->decomp->get_matrix(parameters_mtx);
+    }
+    catch (std::string err) {
+        Py_DECREF(parameters_arr);
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        std::cout << err << std::endl;
+        return NULL;
+    }
+    catch(...) {
+        Py_DECREF(parameters_arr);
+        std::string err( "Invalid pointer to decomposition class or error in get_matrix");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
 
     // convert to numpy array
     unitary_mtx.set_owner(false);
diff --git a/squander/gates/gates_Wrapper.cpp b/squander/gates/gates_Wrapper.cpp
index 71a8bf2a1..1d8f1dc9a 100644
--- a/squander/gates/gates_Wrapper.cpp
+++ b/squander/gates/gates_Wrapper.cpp
@@ -610,7 +610,19 @@ Gate_Wrapper_get_Matrix( Gate_Wrapper *self, PyObject *args, PyObject *kwds ) {
         }
 
         int parallel = 1;
-        gate_mtx = gate->get_matrix( parallel );
+        try {
+            gate_mtx = gate->get_matrix( parallel );
+        }
+        catch (std::string err) {
+            PyErr_SetString(PyExc_Exception, err.c_str());
+            std::cout << err << std::endl;
+            return NULL;
+        }
+        catch(...) {
+            std::string err( "Invalid pointer to gate class or error in get_matrix");
+            PyErr_SetString(PyExc_Exception, err.c_str());
+            return NULL;
+        }
 
     }
     else if( gate->get_parameter_num() > 0 ) {
@@ -638,7 +650,21 @@ Gate_Wrapper_get_Matrix( Gate_Wrapper *self, PyObject *args, PyObject *kwds ) {
         Matrix_real&& parameters_mtx = numpy2matrix_real( parameters_arr );
 
         int parallel = 1;
-        gate_mtx = self->gate->get_matrix( parameters_mtx, parallel );
+        try {
+            gate_mtx = self->gate->get_matrix( parameters_mtx, parallel );
+        }
+        catch (std::string err) {
+            Py_DECREF(parameters_arr);
+            PyErr_SetString(PyExc_Exception, err.c_str());
+            std::cout << err << std::endl;
+            return NULL;
+        }
+        catch(...) {
+            Py_DECREF(parameters_arr);
+            std::string err( "Invalid pointer to gate class or error in get_matrix");
+            PyErr_SetString(PyExc_Exception, err.c_str());
+            return NULL;
+        }
 
         Py_DECREF(parameters_arr);
 
diff --git a/squander/gates/qgd_Circuit_Wrapper.cpp b/squander/gates/qgd_Circuit_Wrapper.cpp
index 9c3acbfef..80e79aa3e 100644
--- a/squander/gates/qgd_Circuit_Wrapper.cpp
+++ b/squander/gates/qgd_Circuit_Wrapper.cpp
@@ -626,8 +626,22 @@ qgd_Circuit_Wrapper_get_Matrix( qgd_Circuit_Wrapper *self, PyObject *args ) {
     // get the C++ wrapper around the data
     Matrix_real&& parameters_mtx = numpy2matrix_real( parameters_arr );
 
-
-    Matrix mtx = self->circuit->get_matrix( parameters_mtx );
+    Matrix mtx;
+    try {
+        mtx = self->circuit->get_matrix( parameters_mtx );
+    }
+    catch (std::string err) {
+        Py_DECREF(parameters_arr);
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        std::cout << err << std::endl;
+        return NULL;
+    }
+    catch(...) {
+        Py_DECREF(parameters_arr);
+        std::string err( "Invalid pointer to circuit class or error in get_matrix");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
     
     // convert to numpy array
     mtx.set_owner(false);
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 5b61c9a90..af7d2bf21 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -523,7 +523,8 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
             final_parameters = np.concatenate(final_parameters,axis=0)
         else:
             final_parameters = np.array([])
-        
+        if not check_circuit_compatibility(final_circuit,self.topology):
+            print("ERROR: Final circuit is not compatible with device topology!")
         return final_circuit, final_parameters
     
     @staticmethod
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 59ad6c04e..cd83fff0f 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -226,6 +226,7 @@ def extract_subtopology(involved_qbits, qbit_map, config ):
         if edge[0] in involved_qbits and edge[1] in involved_qbits:
             mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
     return mini_topology
+    
 def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
     P_i_inv = [P_i.index(i) for i in range(len(P_i))]  # Compute inverse
     qbit_map_input = {k : node_mapping[P_i_inv[v]] for k,v in qbit_map.items()}

From dff4d21725fa9ad9ef7cd672dd91c7e10224d615 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 5 Dec 2025 16:27:19 +0100
Subject: [PATCH 051/232] Refactor cost calculation

---
 squander/synthesis/PartAM.py | 35 +++++++++--------------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index af7d2bf21..61f6c04e3 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -18,7 +18,6 @@
 )
 
 import numpy as np
-from qiskit import QuantumCircuit
 
 from typing import Callable, Dict, List, Optional, Set, Tuple, FrozenSet
 from dataclasses import dataclass
@@ -31,7 +30,6 @@
 from collections import deque, defaultdict
 import numpy as np
 
-from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 from squander.synthesis.PartAM_utils import (
     get_subtopologies_of_type,
     get_unique_subtopologies,
@@ -128,7 +126,6 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
         Cached version of get_subtopologies_of_type.
         Uses canonical form of mini_topology as cache key.
         """
-        from squander.synthesis.PartAM_utils import get_canonical_form
         
         # Create canonical form key
         target_qubits = set()
@@ -536,13 +533,6 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
         score_F += calculate_swap_cost(swaps, pi, used_qubits)
         score_F += len(partition_candidate.circuit_structure)
 
-        # Cache for transform_pi results to avoid redundant computation
-        # Key: (partition_idx, topology_idx, permutation_idx, topology_candidate_tuple, output_perm_tuple)
-        transform_cache = {}
-        
-        # Qubits used by current partition are now 'used' for lookahead
-        next_used_qubits = used_qubits.union(partition_candidate.involved_qbits)
-
         # Safety check: ensure partition_idx is valid for sDAG
         if partition_candidate.partition_idx < len(sDAG):
             for partition_idx in sDAG[partition_candidate.partition_idx]:
@@ -555,11 +545,11 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                     continue
                 for tdx, mini_topology in enumerate(partition_result.mini_topologies):
                     dist_placeholder = calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
-                    circuit_length = min([len(circ) for circ in partition_result.circuit_structures[tdx]])
+                    circuit_length = np.mean([len(circ) for circ in partition_result.circuit_structures[tdx]])
                     score = dist_placeholder + circuit_length
                     mini_scores.append(score)
                 if mini_scores:
-                    score_E += min(mini_scores)
+                    score_E += np.mean(mini_scores)
 
         for partition_idx in F:
             partition = scoring_partitions[partition_idx]
@@ -567,19 +557,12 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                 continue
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                topology_candidates = partition.topology_candidates[tdx]
-                for topology_candidate in topology_candidates:
-                    for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
-                        # Create cache key for this candidate's transform_pi result
-                        cache_key = (partition_idx, tdx, pdx, tuple(sorted(topology_candidate)), tuple(output_perm))
-                        if cache_key not in transform_cache:
-                            new_cand = PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits)
-                            swaps_next, transformed_pi = new_cand.transform_pi(output_perm,D, swap_cache)
-                            swap_cost = calculate_swap_cost(swaps_next, transformed_pi, next_used_qubits) # Use next_used_qubits
-                            transform_cache[cache_key] = swap_cost + len(new_cand.circuit_structure)
-                        mini_scores.append(transform_cache[cache_key])
+                dist_placeholder = calculate_dist_small(mini_topology,partition.qubit_map,D,output_perm)
+                circuit_length = np.mean([len(circ) for circ in partition.circuit_structures[tdx]])
+                score = dist_placeholder + circuit_length
+                mini_scores.append(score)
             if mini_scores:
-                score_F += min(mini_scores)
+                score_F += np.mean(mini_scores)
 
             # Safety check: ensure partition_idx is valid for sDAG
             if partition_idx < len(sDAG):
@@ -593,11 +576,11 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                         continue
                     for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
                         dist_placeholder = calculate_dist_small(mini_topology,partition_result_E.qubit_map,D,output_perm)
-                        circuit_length = min([len(circ) for circ in partition_result_E.circuit_structures[tdx]])
+                        circuit_length = np.mean([len(circ) for circ in partition_result_E.circuit_structures[tdx]])
                         score = dist_placeholder + circuit_length
                         mini_scores.append(score)
                     if mini_scores:
-                        score_E += min(mini_scores)
+                        score_E += np.mean(mini_scores)
         # Safety check for division by zero
         coeff_E = 0.5
         if len(E_visited_partitions) == 0:

From 5e12e83b9c8f61fa43c8b521d536adde0f2e8068 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 6 Dec 2025 22:51:57 +0100
Subject: [PATCH 052/232] Add token swapping based search

---
 squander/synthesis/PartAM_utils.py | 184 ++++++++++++++++++++++++++++-
 1 file changed, 183 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index cd83fff0f..7bb3a8239 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -5,6 +5,187 @@
 import heapq
 import math
 import logging
+import pulp
+from collections import defaultdict
+
+def solve_min_swaps(perm, edges, T=None, use_gurobi=True):
+    """
+    Compute globally optimal minimum SWAPs to route permutation 'perm'
+    to identity under connectivity 'edges'.
+
+    perm[i] = logical qubit currently at physical node i.
+    edges   = list of undirected edges (u,v).
+    T       = time horizon (number of layers); if None, use n^2.
+    """
+
+    n = len(perm)
+    nodes = list(range(n))
+    tokens = list(range(n))
+
+    # Time horizon
+    if T is None:
+        T = n * n   # safe-ish upper bound for small n
+
+    # Undirected edges => directed arcs for movement variables
+    undirected = {tuple(sorted(e)) for e in edges}
+    neighbors = {u: set() for u in nodes}
+    for u, v in undirected:
+        neighbors[u].add(v)
+        neighbors[v].add(u)
+    directed_arcs = [(u, v) for u, v in undirected for (u, v) in ((u, v), (v, u))]
+
+    # ILP model
+    prob = pulp.LpProblem("TokenSwapping", pulp.LpMinimize)
+
+    # x[t][v][q]: token q at node v at time t
+    x = {
+        (t, v, q): pulp.LpVariable(f"x_t{t}_v{v}_q{q}", cat="Binary")
+        for t in range(T + 1)
+        for v in nodes
+        for q in tokens
+    }
+
+    # m[t][u][v][q]: token q moves from u to v between t and t+1
+    m = {
+        (t, u, v, q): pulp.LpVariable(f"m_t{t}_{u}_{v}_q{q}", cat="Binary")
+        for t in range(T)
+        for (u, v) in directed_arcs
+        for q in tokens
+    }
+
+    # Initial positions: tokens are at positions given by 'perm'
+    # perm[i] = token currently at physical node i
+    for v in nodes:
+        for q in tokens:
+            prob += x[(0, v, q)] == (1 if v == q else 0), f"init_t0_v{v}_q{q}"
+
+    # Final positions: identity mapping (token q at node q)
+    for v in nodes:
+        for q in tokens:
+            prob += x[(T, v, q)] == (1 if perm[v] == q else 0), f"final_tT_v{v}_q{q}"
+
+    # Each token at exactly one node at each time
+    for t in range(T + 1):
+        for q in tokens:
+            prob += (
+                pulp.lpSum(x[(t, v, q)] for v in nodes) == 1,
+                f"one_node_t{t}_q{q}",
+            )
+
+    # Each node holds exactly one token at each time
+    for t in range(T + 1):
+        for v in nodes:
+            prob += (
+                pulp.lpSum(x[(t, v, q)] for q in tokens) == 1,
+                f"one_token_t{t}_v{v}",
+            )
+
+    # Introduce swap decision per time over undirected edges (single swap per time)
+    y = {
+        (t, u, v): pulp.LpVariable(f"y_t{t}_e{u}_{v}", cat="Binary")
+        for t in range(T)
+        for (u, v) in undirected
+    }
+
+    # At most one swap per time step
+    for t in range(T):
+        prob += (
+            pulp.lpSum(y[(t, u, v)] for (u, v) in undirected) <= 1,
+            f"one_swap_per_time_t{t}",
+        )
+
+    # Flow constraints for token movement
+    for t in range(T):
+        for u in nodes:
+            for q in tokens:
+                outbound = pulp.lpSum(
+                    m[(t, u, v, q)] for v in neighbors[u]
+                )
+                inbound = pulp.lpSum(
+                    m[(t, v, u, q)] for v in neighbors[u]
+                )
+                prob += (
+                    x[(t, u, q)] == x[(t + 1, u, q)] + outbound - inbound,
+                    f"flow_t{t}_u{u}_q{q}",
+                )
+
+    # Link moves to selected swap edge and enforce swap semantics
+    for t in range(T):
+        for (u, v) in undirected:
+            # Only allow moves along (u,v) at time t if this edge is selected
+            for q in tokens:
+                prob += m[(t, u, v, q)] <= y[(t, u, v)], f"link_m_y_t{t}_{u}_{v}_q{q}_uv"
+                prob += m[(t, v, u, q)] <= y[(t, u, v)], f"link_m_y_t{t}_{u}_{v}_q{q}_vu"
+
+            # If edge selected, exactly one token moves each direction (a swap)
+            prob += (
+                pulp.lpSum(m[(t, u, v, q)] for q in tokens) == y[(t, u, v)],
+                f"one_token_uv_t{t}_{u}_{v}",
+            )
+            prob += (
+                pulp.lpSum(m[(t, v, u, q)] for q in tokens) == y[(t, u, v)],
+                f"one_token_vu_t{t}_{u}_{v}",
+            )
+
+    # Objective: minimize number of swaps (sum of y)
+    total_swaps = pulp.lpSum(y.values())
+    total_moves = pulp.lpSum(m.values())  # optional, for reporting
+    prob += total_swaps
+
+    # Choose solver
+    if use_gurobi:
+        try:
+            solver = pulp.GUROBI(msg=1)
+        except Exception:
+            # Fallback if GUROBI not properly installed with PuLP wrapper
+            solver = pulp.PULP_CBC_CMD(msg=1)
+    else:
+        solver = pulp.PULP_CBC_CMD(msg=1)
+
+    prob.solve(solver)
+
+    status = pulp.LpStatus[prob.status]
+    if status != "Optimal":
+        raise RuntimeError(f"Solver did not find optimal solution, status = {status}")
+
+    # Extract move/swap counts
+    moves_value = int(pulp.value(total_moves))
+    swap_value = int(pulp.value(total_swaps))
+
+    # Build per-time-step SWAP schedule:
+    # For each t, look at directed moves and turn them into undirected edges.
+    swap_layers = []
+    for t in range(T):
+        layer = []
+        for (u, v) in undirected:
+            if int(pulp.value(y[(t, u, v)])) == 1:
+                layer.append((u, v))
+        # At most one edge per layer by construction
+        if layer: swap_layers.append(layer)
+
+    return {
+        "swap_count": swap_value,
+        "moves": moves_value,
+        "layers": swap_layers,
+        "status": status,
+    }
+
+def apply_swaps(perm, layers):
+    """
+    Apply a sequence of SWAP layers to a permutation.
+
+    perm: initial permutation (list)
+    layers: list of SWAP layers, each layer is a list of edges (u,v)
+
+    Returns the resulting permutation after applying all SWAPs.
+    """
+    current_perm = perm[:]
+    for layer in layers:
+        for (u, v) in layer:
+            # Swap tokens at positions u and v
+            current_perm[u], current_perm[v] = current_perm[v], current_perm[u]
+    return current_perm
+
 
 def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
     adj_list = {}
@@ -217,7 +398,7 @@ def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
     dist_placeholder = 0
     qbit_map_inv = { k:v for v,k in qbit_map.items()}
     for u,v in mini_topology:
-        dist_placeholder += dist_matrix[pi[qbit_map_inv[u]]][pi[qbit_map_inv[v]]]-3
+        dist_placeholder += (dist_matrix[pi[qbit_map_inv[u]]][pi[qbit_map_inv[v]]]-1)*3
     return dist_placeholder
 
 def extract_subtopology(involved_qbits, qbit_map, config ):
@@ -249,6 +430,7 @@ def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
     else:
         swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
     return len(swaps)
+
 class SingleQubitPartitionResult:
     
     def __init__(self,circuit_in,parameters_in):

From 3abe25bd395b6a0bfabc3c19090993d2b483f2cc Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 7 Dec 2025 01:19:23 +0100
Subject: [PATCH 053/232] Add in ILP swaps

---
 squander/synthesis/PartAM_utils.py | 108 +++++++++++++++++++++++++++--
 1 file changed, 104 insertions(+), 4 deletions(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 7bb3a8239..2f8426977 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -135,12 +135,12 @@ def solve_min_swaps(perm, edges, T=None, use_gurobi=True):
     # Choose solver
     if use_gurobi:
         try:
-            solver = pulp.GUROBI(msg=1)
+            solver = pulp.GUROBI(msg=0)
         except Exception:
             # Fallback if GUROBI not properly installed with PuLP wrapper
-            solver = pulp.PULP_CBC_CMD(msg=1)
+            solver = pulp.PULP_CBC_CMD(msg=0)
     else:
-        solver = pulp.PULP_CBC_CMD(msg=1)
+        solver = pulp.PULP_CBC_CMD(msg=0)
 
     prob.solve(solver)
 
@@ -186,6 +186,106 @@ def apply_swaps(perm, layers):
             current_perm[u], current_perm[v] = current_perm[v], current_perm[u]
     return current_perm
 
+def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix, use_gurobi=True):
+    """
+    Find SWAP sequence to route subset of virtual qubits to targets using ILP.
+    
+    Args:
+        pi_A: List [P0, P1, ...] where pi_A[q] = P (virtual q at physical P)
+        pi_B_dict: Dict {q: P} specifying only qubits that need routing
+        dist_matrix: Pre-computed distance matrix dist[i][j] between physical qubits
+    
+    Returns:
+        swaps: List of (i, j) SWAP operations on adjacent physical qubits
+        final_permutation: List showing final virtual→physical mapping
+    """
+    n = len(pi_A)
+    
+    # Build edges from distance matrix (adjacent = distance 1)
+    edges = []
+    for i in range(n):
+        for j in range(i + 1, n):
+            if dist_matrix[i][j] == 1:
+                edges.append((i, j))
+    
+    # === Step 1: Complete eta (target permutation) ===
+    # eta[q] = target physical position for virtual qubit q
+    assigned_physical = set(pi_B_dict.values())
+    unassigned_logical = [q for q in range(n) if q not in pi_B_dict]
+    available_physical = set(P for P in range(n) if P not in assigned_physical)
+    
+    eta = dict(pi_B_dict)  # Start with required assignments
+    
+    # Try to keep unassigned qubits in place if their position is available
+    still_unassigned = []
+    for q in unassigned_logical:
+        current_P = pi_A[q]
+        if current_P in available_physical:
+            eta[q] = current_P
+            available_physical.remove(current_P)
+        else:
+            still_unassigned.append(q)
+    
+    # Assign remaining qubits to remaining positions
+    remaining_physical = sorted(available_physical)
+    for q, P in zip(still_unassigned, remaining_physical):
+        eta[q] = P
+    
+    # Convert to list
+    eta_list = [eta[q] for q in range(n)]
+    
+    # === Step 2: Compute inverse permutations ===
+    # pi_A_inv[P] = q means physical P has virtual q
+    pi_A_inv = [0] * n
+    for q in range(n):
+        pi_A_inv[pi_A[q]] = q
+    
+    # eta_inv[P] = q means we want physical P to have virtual q
+    eta_inv = [0] * n
+    for q in range(n):
+        eta_inv[eta_list[q]] = q
+    
+    # === Step 3: Construct perm for solve_min_swaps ===
+    # To route from state A to state B using swaps S where S(identity) = perm:
+    # We need: A[perm[P]] = B[P], so perm[P] = A^{-1}[B[P]]
+    # Here: A = pi_A_inv, B = eta_inv, A^{-1} = pi_A
+    # So: perm[P] = pi_A[eta_inv[P]]
+    perm = [pi_A[eta_inv[P]] for P in range(n)]
+    
+    # Check if already at target (perm is identity)
+    if perm == list(range(n)):
+        return [], eta_list
+    
+    # === Step 4: Solve using ILP ===
+    result = solve_min_swaps(perm, edges, use_gurobi=use_gurobi)
+    
+    if result['status'] != 'Optimal':
+        return None, None
+    
+    # Extract swaps from layers (flatten)
+    swaps = []
+    for layer in result['layers']:
+        for swap in layer:
+            swaps.append(swap)
+    
+    # === Step 5: Compute final permutation ===
+    # Apply swaps to pi_A to get final virtual→physical mapping
+    # Maintain both directions for O(1) swap operations
+    final_perm = list(pi_A)
+    phys_to_virt = list(pi_A_inv)
+    
+    for (i, j) in swaps:
+        # Get virtual qubits at physical positions i and j
+        q_i = phys_to_virt[i]
+        q_j = phys_to_virt[j]
+        
+        # Swap their physical positions
+        final_perm[q_i] = j
+        final_perm[q_j] = i
+        phys_to_virt[i] = q_j
+        phys_to_virt[j] = q_i
+    
+    return swaps, final_perm
 
 def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
     adj_list = {}
@@ -292,7 +392,7 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
             return mapping
     return {}
 
-def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
+def find_constrained_swaps_ouch(pi_A, pi_B_dict, dist_matrix):
     """
     Find SWAP sequence to route subset of virtual qubits to targets.
     

From fb34b03582f5df843a9dd46a48dda33a147bcf25 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 7 Dec 2025 19:34:28 +0100
Subject: [PATCH 054/232] Rename functions

---
 squander/synthesis/PartAM.py       | 12 ++++++------
 squander/synthesis/PartAM_utils.py |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 61f6c04e3..bcad3b74a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -545,11 +545,11 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                     continue
                 for tdx, mini_topology in enumerate(partition_result.mini_topologies):
                     dist_placeholder = calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
-                    circuit_length = np.mean([len(circ) for circ in partition_result.circuit_structures[tdx]])
+                    circuit_length = np.min([len(circ) for circ in partition_result.circuit_structures[tdx]])
                     score = dist_placeholder + circuit_length
                     mini_scores.append(score)
                 if mini_scores:
-                    score_E += np.mean(mini_scores)
+                    score_E += np.min(mini_scores)
 
         for partition_idx in F:
             partition = scoring_partitions[partition_idx]
@@ -558,11 +558,11 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
                 dist_placeholder = calculate_dist_small(mini_topology,partition.qubit_map,D,output_perm)
-                circuit_length = np.mean([len(circ) for circ in partition.circuit_structures[tdx]])
+                circuit_length = np.min([len(circ) for circ in partition.circuit_structures[tdx]])
                 score = dist_placeholder + circuit_length
                 mini_scores.append(score)
             if mini_scores:
-                score_F += np.mean(mini_scores)
+                score_F += np.min(mini_scores)
 
             # Safety check: ensure partition_idx is valid for sDAG
             if partition_idx < len(sDAG):
@@ -576,11 +576,11 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                         continue
                     for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
                         dist_placeholder = calculate_dist_small(mini_topology,partition_result_E.qubit_map,D,output_perm)
-                        circuit_length = np.mean([len(circ) for circ in partition_result_E.circuit_structures[tdx]])
+                        circuit_length = np.min([len(circ) for circ in partition_result_E.circuit_structures[tdx]])
                         score = dist_placeholder + circuit_length
                         mini_scores.append(score)
                     if mini_scores:
-                        score_E += np.mean(mini_scores)
+                        score_E += np.min(mini_scores)
         # Safety check for division by zero
         coeff_E = 0.5
         if len(E_visited_partitions) == 0:
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 2f8426977..41bf8da54 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -135,12 +135,12 @@ def solve_min_swaps(perm, edges, T=None, use_gurobi=True):
     # Choose solver
     if use_gurobi:
         try:
-            solver = pulp.GUROBI(msg=0)
+            solver = pulp.GUROBI(msg=False, manageEnv=True, Threads=1)
         except Exception:
             # Fallback if GUROBI not properly installed with PuLP wrapper
-            solver = pulp.PULP_CBC_CMD(msg=0)
+            solver = pulp.PULP_CBC_CMD(msg=False)
     else:
-        solver = pulp.PULP_CBC_CMD(msg=0)
+        solver = pulp.PULP_CBC_CMD(msg=False)
 
     prob.solve(solver)
 

From 00441110d26642d003721dfd2631b918157aa8e3 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 7 Dec 2025 19:51:32 +0100
Subject: [PATCH 055/232] rename

---
 squander/synthesis/PartAM_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 41bf8da54..fc8dd67dc 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -392,7 +392,7 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
             return mapping
     return {}
 
-def find_constrained_swaps_ouch(pi_A, pi_B_dict, dist_matrix):
+def find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix):
     """
     Find SWAP sequence to route subset of virtual qubits to targets.
     

From 2415da9543db5189d636f56a91e7f8d0e9b168d2 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 7 Dec 2025 23:23:54 +0100
Subject: [PATCH 056/232] Add in second children to heuristic cost function

---
 squander/synthesis/PartAM.py       |  60 +++++++------
 squander/synthesis/PartAM_utils.py | 139 +++++++++++++++++------------
 2 files changed, 112 insertions(+), 87 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index bcad3b74a..df7a5a77d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -528,7 +528,9 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, used_qubits):
         score_F = 0
         score_E = 0
-        E_visited_partitions = set()  # Changed to set for O(1) membership checks
+        E_partitions = set()  # Changed to set for O(1) membership checks
+        E_partitions_1 = set()
+        E_partitions_2 = set()
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
         score_F += calculate_swap_cost(swaps, pi, used_qubits)
         score_F += len(partition_candidate.circuit_structure)
@@ -536,20 +538,10 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
         # Safety check: ensure partition_idx is valid for sDAG
         if partition_candidate.partition_idx < len(sDAG):
             for partition_idx in sDAG[partition_candidate.partition_idx]:
-                if partition_idx in E_visited_partitions:
+                if partition_idx in E_partitions:
                     continue
-                E_visited_partitions.add(partition_idx)
-                mini_scores = []
-                partition_result = scoring_partitions[partition_idx]
-                if partition_result is None:
-                    continue
-                for tdx, mini_topology in enumerate(partition_result.mini_topologies):
-                    dist_placeholder = calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
-                    circuit_length = np.min([len(circ) for circ in partition_result.circuit_structures[tdx]])
-                    score = dist_placeholder + circuit_length
-                    mini_scores.append(score)
-                if mini_scores:
-                    score_E += np.min(mini_scores)
+                E_partitions.add(partition_idx)
+
 
         for partition_idx in F:
             partition = scoring_partitions[partition_idx]
@@ -567,26 +559,36 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
             # Safety check: ensure partition_idx is valid for sDAG
             if partition_idx < len(sDAG):
                 for partition_idx_E in sDAG[partition_idx]:
-                    if partition_idx_E in E_visited_partitions:
+                    if partition_idx_E in E_partitions:
                         continue
-                    E_visited_partitions.add(partition_idx_E)
-                    mini_scores = []
-                    partition_result_E = scoring_partitions[partition_idx_E]
-                    if partition_result_E is None:
+                    E_partitions.add(partition_idx_E)
+
+        #check the secondary children            
+        for partition_idx in E_partitions: 
+            if partition_idx < len(sDAG):
+                for partition_idx_E in sDAG[partition_idx]:
+                    if partition_idx_E in E_partitions or partition_idx_e in E_partitions_1:
                         continue
-                    for tdx, mini_topology in enumerate(partition_result_E.mini_topologies):
-                        dist_placeholder = calculate_dist_small(mini_topology,partition_result_E.qubit_map,D,output_perm)
-                        circuit_length = np.min([len(circ) for circ in partition_result_E.circuit_structures[tdx]])
-                        score = dist_placeholder + circuit_length
-                        mini_scores.append(score)
-                    if mini_scores:
-                        score_E += np.min(mini_scores)
-        # Safety check for division by zero
+                    E_partitions_1.add(partition_idx_E)
+        #score all
+        for partition_idx in E_partitions.union(E_partitions_1):
+            mini_scores = []
+            partition_result = scoring_partitions[partition_idx]
+            if partition_result is None:
+                continue
+            for tdx, mini_topology in enumerate(partition_result.mini_topologies):
+                dist_placeholder = calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
+                circuit_length = np.min([len(circ) for circ in partition_result.circuit_structures[tdx]])
+                score = dist_placeholder + circuit_length
+                mini_scores.append(score)
+            if mini_scores:
+                score_E += np.min(mini_scores)
+
         coeff_E = 0.5
-        if len(E_visited_partitions) == 0:
+        if len(E_partitions.union(E_partitions_1)) == 0:
             E_score = 0.0
         else:
-            E_score = coeff_E * score_E / len(E_visited_partitions)
+            E_score = coeff_E * score_E / len(E_partitions.union(E_partitions_1))
         
         F_score = score_F / len(F)
         
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index fc8dd67dc..819d46b5d 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -407,93 +407,116 @@ def find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix):
     """
     n = len(pi_A)
     
-    # Build adjacency list from distance matrix
-    adj = [set() for _ in range(n)]
+    # Build adjacency list if not provided
+    adj_list = [set() for _ in range(n)]
     for i in range(n):
         for j in range(i+1, n):
-            if dist_matrix[i][j] == 1:  # Adjacent in topology
-                adj[i].add(j)
-                adj[j].add(i)
-    
-    # Use physical-to-virtual representation for easier SWAP handling
-    # state[P] = q means physical qubit P contains virtual qubit q
-    def to_phys_to_virt(virt_to_phys):
-        """Convert virtual→physical list to physical→virtual list"""
+            if dist_matrix[i][j] == 1:
+                adj_list[i].add(j)
+                adj_list[j].add(i)
+    
+    # Convert to physical-to-virtual for SWAP handling
+    # Also maintain virtual-to-physical for O(1) lookup
+    def init_state(virt_to_phys):
         p2v = [0] * n
+        v2p = [0] * n
         for q in range(n):
             P = virt_to_phys[q]
             p2v[P] = q
-        return p2v
-    
-    def to_virt_to_phys(phys_to_virt):
-        """Convert physical→virtual list to virtual→physical list"""
-        v2p = [0] * n
-        for P in range(n):
-            q = phys_to_virt[P]
             v2p[q] = P
-        return v2p
+        return tuple(p2v), tuple(v2p)
     
-    start_state = tuple(to_phys_to_virt(pi_A))
+    start_p2v, start_v2p = init_state(pi_A)
     
-    def is_goal(state):
-        """Check if target qubits are in correct physical positions"""
+    def is_goal(v2p):
+        """Check if target qubits are at correct positions"""
         for q, target_P in pi_B_dict.items():
-            if state[target_P] != q:  # Physical position target_P should contain virtual q
+            if v2p[q] != target_P:
                 return False
         return True
     
-    def heuristic(state):
-        """Lower bound: sum of distances for qubits needing routing"""
-        total = 0.0
+    def heuristic(v2p):
+        """
+        Improved heuristic: sum of distances without over-optimistic division.
+        Each qubit needs at least ceil(dist/1) swaps to move dist positions.
+        But swaps can help at most 2 qubits, so we use a tighter bound.
+        """
+        distances = []
         for q, target_P in pi_B_dict.items():
-            # Find where virtual qubit q currently is
-            current_P = state.index(q)
-            distance = dist_matrix[current_P][target_P]
-            if np.isinf(distance):
-                logging.warning(
-                    "Encountered unreachable qubit pair (%s, %s) in routing heuristic; returning inf cost.",
-                    current_P,
-                    target_P,
-                )
+            current_P = v2p[q]  # O(1) lookup now!
+            d = dist_matrix[current_P][target_P]
+            if np.isinf(d):
                 return math.inf
-            total += float(distance)
-        return math.floor(total / 2)  # Optimistic: each SWAP helps 2 qubits
+            distances.append(int(d))
+        
+        if not distances:
+            return 0
+        
+        # Tighter heuristic: max distance is a lower bound
+        # Also sum/2 but with ceiling, and take max of both
+        total = sum(distances)
+        max_dist = max(distances)
+        
+        # A single SWAP reduces total distance by at most 2
+        # So we need at least ceil(total/2) swaps
+        # But we also need at least max_dist swaps for the furthest qubit
+        return max(max_dist, (total + 1) // 2)
+    
+    if is_goal(start_v2p):
+        return [], list(pi_A)
     
-    # A* search
-    heap = [(heuristic(start_state), 0, start_state, [])]
+    # A* search with improved state representation
+    # State: (p2v_tuple, v2p_tuple) - we track both for efficiency
+    start_state = (start_p2v, start_v2p)
+    h0 = heuristic(start_v2p)
+    
+    # heap: (f, g, state, path)
+    heap = [(h0, 0, start_state, [])]
     visited = {start_state: 0}
     
-    while heap:
-        f, g, current, path = heapq.heappop(heap)
+    max_iterations = 100000  # Safety limit
+    iterations = 0
+    
+    while heap and iterations < max_iterations:
+        iterations += 1
+        f, g, (p2v, v2p), path = heapq.heappop(heap)
         
-        if is_goal(current):
-            # Convert final state back to virtual→physical mapping
-            final_permutation = to_virt_to_phys(current)
-            return path, final_permutation
+        if is_goal(v2p):
+            # Convert back to virtual->physical list
+            return path, list(v2p)
         
-        if visited.get(current, float('inf')) < g:
+        if visited.get((p2v, v2p), float('inf')) < g:
             continue
         
         # Try all valid SWAPs on adjacent physical qubits
-        current_list = list(current)
         for i in range(n):
-            for j in adj[i]:
-                if i < j:  # Avoid duplicate (i,j) and (j,i)
+            for j in adj_list[i]:
+                if i < j:
                     # SWAP physical qubits i and j
-                    new_state = current_list[:]
-                    new_state[i], new_state[j] = new_state[j], new_state[i]
-                    new_state_tuple = tuple(new_state)
+                    # Get virtual qubits at these positions
+                    q_i = p2v[i]
+                    q_j = p2v[j]
                     
+                    # Create new state
+                    new_p2v = list(p2v)
+                    new_v2p = list(v2p)
+                    
+                    new_p2v[i], new_p2v[j] = q_j, q_i
+                    new_v2p[q_i], new_v2p[q_j] = j, i
+                    
+                    new_state = (tuple(new_p2v), tuple(new_v2p))
                     new_g = g + 1
                     
-                    if visited.get(new_state_tuple, float('inf')) > new_g:
-                        visited[new_state_tuple] = new_g
-                        new_f = new_g + heuristic(new_state_tuple)
-                        new_path = path + [(i, j)]
-                        heapq.heappush(heap, (new_f, new_g, new_state_tuple, new_path))
+                    if visited.get(new_state, float('inf')) > new_g:
+                        visited[new_state] = new_g
+                        new_h = heuristic(tuple(new_v2p))
+                        if new_h < math.inf:
+                            new_f = new_g + new_h
+                            heapq.heappush(heap, (new_f, new_g, new_state, path + [(i, j)]))
+    
+    logging.warning(f"SWAP routing did not converge after {iterations} iterations")
+    return None, None
     
-    return None, None  # No solution found
-
 def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
     dist_placeholder = 0
     qbit_map_inv = { k:v for v,k in qbit_map.items()}

From 22ed0245a9acf413aec27dcce717b126d6e59c0b Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 7 Dec 2025 23:55:32 +0100
Subject: [PATCH 057/232] Rename again

---
 squander/synthesis/PartAM_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 819d46b5d..a62beaa13 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -186,7 +186,7 @@ def apply_swaps(perm, layers):
             current_perm[u], current_perm[v] = current_perm[v], current_perm[u]
     return current_perm
 
-def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix, use_gurobi=True):
+def find_constrained_swaps_ILP(pi_A, pi_B_dict, dist_matrix, use_gurobi=True):
     """
     Find SWAP sequence to route subset of virtual qubits to targets using ILP.
     
@@ -392,7 +392,7 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
             return mapping
     return {}
 
-def find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix):
+def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
     """
     Find SWAP sequence to route subset of virtual qubits to targets.
     
@@ -516,7 +516,7 @@ def heuristic(v2p):
     
     logging.warning(f"SWAP routing did not converge after {iterations} iterations")
     return None, None
-    
+
 def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
     dist_placeholder = 0
     qbit_map_inv = { k:v for v,k in qbit_map.items()}

From 326c98caa0266f2569100c613863f89d7b0954f9 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 8 Dec 2025 00:14:42 +0100
Subject: [PATCH 058/232] fix typo

---
 squander/synthesis/PartAM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index df7a5a77d..3e3f5f525 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -567,7 +567,7 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
         for partition_idx in E_partitions: 
             if partition_idx < len(sDAG):
                 for partition_idx_E in sDAG[partition_idx]:
-                    if partition_idx_E in E_partitions or partition_idx_e in E_partitions_1:
+                    if partition_idx_E in E_partitions or partition_idx_E in E_partitions_1:
                         continue
                     E_partitions_1.add(partition_idx_E)
         #score all

From ca4cf2df2c579b51ecd35288c49c0d712e6a652a Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Mon, 8 Dec 2025 14:26:50 +0100
Subject: [PATCH 059/232] Move back to old heuristic A_star search

---
 squander/synthesis/PartAM_utils.py | 140 ++++++++++++-----------------
 1 file changed, 59 insertions(+), 81 deletions(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index a62beaa13..a0e51c6b3 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -407,115 +407,93 @@ def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
     """
     n = len(pi_A)
     
-    # Build adjacency list if not provided
-    adj_list = [set() for _ in range(n)]
+    # Build adjacency list from distance matrix
+    adj = [set() for _ in range(n)]
     for i in range(n):
         for j in range(i+1, n):
-            if dist_matrix[i][j] == 1:
-                adj_list[i].add(j)
-                adj_list[j].add(i)
-    
-    # Convert to physical-to-virtual for SWAP handling
-    # Also maintain virtual-to-physical for O(1) lookup
-    def init_state(virt_to_phys):
+            if dist_matrix[i][j] == 1:  # Adjacent in topology
+                adj[i].add(j)
+                adj[j].add(i)
+    
+    # Use physical-to-virtual representation for easier SWAP handling
+    # state[P] = q means physical qubit P contains virtual qubit q
+    def to_phys_to_virt(virt_to_phys):
+        """Convert virtual→physical list to physical→virtual list"""
         p2v = [0] * n
-        v2p = [0] * n
         for q in range(n):
             P = virt_to_phys[q]
             p2v[P] = q
+        return p2v
+    
+    def to_virt_to_phys(phys_to_virt):
+        """Convert physical→virtual list to virtual→physical list"""
+        v2p = [0] * n
+        for P in range(n):
+            q = phys_to_virt[P]
             v2p[q] = P
-        return tuple(p2v), tuple(v2p)
+        return v2p
     
-    start_p2v, start_v2p = init_state(pi_A)
+    start_state = tuple(to_phys_to_virt(pi_A))
     
-    def is_goal(v2p):
-        """Check if target qubits are at correct positions"""
+    def is_goal(state):
+        """Check if target qubits are in correct physical positions"""
         for q, target_P in pi_B_dict.items():
-            if v2p[q] != target_P:
+            if state[target_P] != q:  # Physical position target_P should contain virtual q
                 return False
         return True
     
-    def heuristic(v2p):
-        """
-        Improved heuristic: sum of distances without over-optimistic division.
-        Each qubit needs at least ceil(dist/1) swaps to move dist positions.
-        But swaps can help at most 2 qubits, so we use a tighter bound.
-        """
-        distances = []
+    def heuristic(state):
+        """Lower bound: sum of distances for qubits needing routing"""
+        total = 0.0
         for q, target_P in pi_B_dict.items():
-            current_P = v2p[q]  # O(1) lookup now!
-            d = dist_matrix[current_P][target_P]
-            if np.isinf(d):
+            # Find where virtual qubit q currently is
+            current_P = state.index(q)
+            distance = dist_matrix[current_P][target_P]
+            if np.isinf(distance):
+                logging.warning(
+                    "Encountered unreachable qubit pair (%s, %s) in routing heuristic; returning inf cost.",
+                    current_P,
+                    target_P,
+                )
                 return math.inf
-            distances.append(int(d))
-        
-        if not distances:
-            return 0
-        
-        # Tighter heuristic: max distance is a lower bound
-        # Also sum/2 but with ceiling, and take max of both
-        total = sum(distances)
-        max_dist = max(distances)
-        
-        # A single SWAP reduces total distance by at most 2
-        # So we need at least ceil(total/2) swaps
-        # But we also need at least max_dist swaps for the furthest qubit
-        return max(max_dist, (total + 1) // 2)
-    
-    if is_goal(start_v2p):
-        return [], list(pi_A)
-    
-    # A* search with improved state representation
-    # State: (p2v_tuple, v2p_tuple) - we track both for efficiency
-    start_state = (start_p2v, start_v2p)
-    h0 = heuristic(start_v2p)
+            total += float(distance)
+        return math.floor(total / 2)  # Optimistic: each SWAP helps 2 qubits
     
-    # heap: (f, g, state, path)
-    heap = [(h0, 0, start_state, [])]
+    # A* search
+    heap = [(heuristic(start_state), 0, start_state, [])]
     visited = {start_state: 0}
     
-    max_iterations = 100000  # Safety limit
-    iterations = 0
-    
-    while heap and iterations < max_iterations:
-        iterations += 1
-        f, g, (p2v, v2p), path = heapq.heappop(heap)
+    while heap:
+        f, g, current, path = heapq.heappop(heap)
         
-        if is_goal(v2p):
-            # Convert back to virtual->physical list
-            return path, list(v2p)
+        if is_goal(current):
+            # Convert final state back to virtual→physical mapping
+            final_permutation = to_virt_to_phys(current)
+            return path, final_permutation
         
-        if visited.get((p2v, v2p), float('inf')) < g:
+        if visited.get(current, float('inf')) < g:
             continue
         
         # Try all valid SWAPs on adjacent physical qubits
+        current_list = list(current)
         for i in range(n):
-            for j in adj_list[i]:
-                if i < j:
+            for j in adj[i]:
+                if i < j:  # Avoid duplicate (i,j) and (j,i)
                     # SWAP physical qubits i and j
-                    # Get virtual qubits at these positions
-                    q_i = p2v[i]
-                    q_j = p2v[j]
-                    
-                    # Create new state
-                    new_p2v = list(p2v)
-                    new_v2p = list(v2p)
+                    new_state = current_list[:]
+                    new_state[i], new_state[j] = new_state[j], new_state[i]
+                    new_state_tuple = tuple(new_state)
                     
-                    new_p2v[i], new_p2v[j] = q_j, q_i
-                    new_v2p[q_i], new_v2p[q_j] = j, i
-                    
-                    new_state = (tuple(new_p2v), tuple(new_v2p))
                     new_g = g + 1
                     
-                    if visited.get(new_state, float('inf')) > new_g:
-                        visited[new_state] = new_g
-                        new_h = heuristic(tuple(new_v2p))
-                        if new_h < math.inf:
-                            new_f = new_g + new_h
-                            heapq.heappush(heap, (new_f, new_g, new_state, path + [(i, j)]))
+                    if visited.get(new_state_tuple, float('inf')) > new_g:
+                        visited[new_state_tuple] = new_g
+                        new_f = new_g + heuristic(new_state_tuple)
+                        new_path = path + [(i, j)]
+                        heapq.heappush(heap, (new_f, new_g, new_state_tuple, new_path))
     
-    logging.warning(f"SWAP routing did not converge after {iterations} iterations")
-    return None, None
+    return None, None  # No solution found
+
 
 def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
     dist_placeholder = 0
@@ -626,7 +604,7 @@ def get_partition_synthesis_score(self):
         score = 0
         for topology_idx in range(self.topology_count):
             cnot_count_topology = np.min(self.cnot_counts[topology_idx])#np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
-            score += cnot_count_topology/self.topology_count
+            score = min(cnot_count_topology,score)
         return score
     
     def get_topology_candidates(self, topology_idx):

From 37503d28212df347dd248ffb38dd58dd1629efb9 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Mon, 8 Dec 2025 15:27:24 +0100
Subject: [PATCH 060/232] Change scoring

---
 squander/synthesis/PartAM.py       | 1 -
 squander/synthesis/PartAM_utils.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3e3f5f525..5bdf2b800 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -380,7 +380,6 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         
         D = self.compute_distances_bfs(circ.get_Qbit_Num())
         pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
-        pi_list = pi.tolist() if hasattr(pi, 'tolist') else list(pi)
         
         F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
         scoring_partitions = self._build_scoring_partitions(optimized_partitions)
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index a0e51c6b3..55029d1e1 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -601,7 +601,7 @@ def get_original_circuit_structure(self):
         return circuit_structure
         
     def get_partition_synthesis_score(self):
-        score = 0
+        score = np.inf
         for topology_idx in range(self.topology_count):
             cnot_count_topology = np.min(self.cnot_counts[topology_idx])#np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
             score = min(cnot_count_topology,score)

From a8db19894c1ae4b03c4ecd8bd4e33b00a0a7e579 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 8 Dec 2025 16:26:00 +0100
Subject: [PATCH 061/232] Refactor and reorganize PartAM_utils and PartAM for
 improved structure and clarity. Introduce new routing algorithms and topology
 utilities, while removing unused functions and consolidating related methods.
 Enhance documentation with section headers for better navigation.

---
 squander/synthesis/PartAM.py       |  58 +++-
 squander/synthesis/PartAM_utils.py | 420 +++++++++++++++--------------
 2 files changed, 279 insertions(+), 199 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 5bdf2b800..567891d3d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -28,7 +28,6 @@
 import logging
 from tqdm import tqdm
 from collections import deque, defaultdict
-import numpy as np
 
 from squander.synthesis.PartAM_utils import (
     get_subtopologies_of_type,
@@ -45,6 +44,10 @@
 )
 
 
+# ============================================================================
+# Data Classes
+# ============================================================================
+
 @dataclass(frozen=True)
 class PartitionScoreData:
     mini_topologies: Tuple[Tuple[Tuple[int, int], ...], ...]
@@ -57,6 +60,10 @@ class PartitionScoreData:
     involved_qbits: Tuple[int, ...]
 
 
+# ============================================================================
+# Parallel Processing Setup
+# ============================================================================
+
 _WORKER_SCORING_PARTITIONS: Optional[List[Optional[PartitionScoreData]]] = None
 _WORKER_S_DAG: Optional[List[List[int]]] = None
 _WORKER_DISTANCE_MATRIX: Optional[np.ndarray] = None
@@ -95,8 +102,17 @@ def _score_candidate_worker(payload):
         used_qubits,
     )
 
+
+# ============================================================================
+# Main Class: qgd_Partition_Aware_Mapping
+# ============================================================================
+
 class qgd_Partition_Aware_Mapping:
 
+    # ------------------------------------------------------------------------
+    # Initialization & Configuration
+    # ------------------------------------------------------------------------
+
     def __init__(self, config):
         self.topology = config['topology']
         self.config = config
@@ -121,6 +137,10 @@ def __init__(self, config):
         self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
         self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
 
+    # ------------------------------------------------------------------------
+    # Caching Methods
+    # ------------------------------------------------------------------------
+
     def _get_subtopologies_of_type_cached(self, mini_topology):
         """
         Cached version of get_subtopologies_of_type.
@@ -191,6 +211,10 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
             )
         return scoring_partitions
 
+    # ------------------------------------------------------------------------
+    # Partition Decomposition Methods
+    # ------------------------------------------------------------------------
+
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
         """
@@ -271,6 +295,10 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
         parameters       = cDecompose.get_Optimized_Parameters()
         return squander_circuit, parameters
 
+    # ------------------------------------------------------------------------
+    # Circuit Synthesis
+    # ------------------------------------------------------------------------
+
     def SynthesizeWideCircuit(self, circ, orig_parameters):
         allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(circ, self.config["max_partition_size"])
         qbit_num_orig_circuit = circ.get_Qbit_Num()
@@ -364,6 +392,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         
         return optimized_partitions
 
+    # ------------------------------------------------------------------------
+    # Main Public API
+    # ------------------------------------------------------------------------
+
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         
         optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
@@ -390,6 +422,10 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         
         return final_circuit, final_parameters, pi_initial, pi
 
+    # ------------------------------------------------------------------------
+    # Heuristic Search
+    # ------------------------------------------------------------------------
+
     def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, sDAG):
         pi_initial = pi.copy()
         used_qubits = set()
@@ -495,6 +531,10 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         pbar.close()
         return partition_order, pi, pi_initial
 
+    # ------------------------------------------------------------------------
+    # Circuit Construction
+    # ------------------------------------------------------------------------
+
     def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         final_circuit = Circuit(N)
         final_parameters = []
@@ -523,6 +563,10 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
             print("ERROR: Final circuit is not compatible with device topology!")
         return final_circuit, final_parameters
     
+    # ------------------------------------------------------------------------
+    # Scoring
+    # ------------------------------------------------------------------------
+
     @staticmethod
     def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, used_qubits):
         score_F = 0
@@ -593,6 +637,10 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
         
         return E_score + F_score
 
+    # ------------------------------------------------------------------------
+    # Candidate Generation
+    # ------------------------------------------------------------------------
+
     def obtain_partition_candidates(self, F, optimized_partitions):
         partition_candidates = []
         for partition_idx in F:
@@ -607,6 +655,10 @@ def obtain_partition_candidates(self, F, optimized_partitions):
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
                         partition_candidates.append(PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits))
         return partition_candidates
+
+    # ------------------------------------------------------------------------
+    # Graph Construction
+    # ------------------------------------------------------------------------
         
     def get_initial_layer(self, IDAG, N, optimized_partitions):
         initial_layer = []
@@ -681,6 +733,10 @@ def construct_sDAG(self, optimized_partitions):
             
         return sDAG
 
+    # ------------------------------------------------------------------------
+    # Distance & Layout
+    # ------------------------------------------------------------------------
+
     def compute_distances_bfs(self, N):
         """BFS distance computation - faster than Floyd-Warshall."""
         D = np.ones((N, N)) * np.inf
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 55029d1e1..bcb830f9e 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -8,6 +8,11 @@
 import pulp
 from collections import defaultdict
 
+
+# ============================================================================
+# SWAP Routing Algorithms
+# ============================================================================
+
 def solve_min_swaps(perm, edges, T=None, use_gurobi=True):
     """
     Compute globally optimal minimum SWAPs to route permutation 'perm'
@@ -287,111 +292,6 @@ def find_constrained_swaps_ILP(pi_A, pi_B_dict, dist_matrix, use_gurobi=True):
     
     return swaps, final_perm
 
-def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
-    adj_list = {}
-    for u, v in edges:
-        if u not in adj_list:
-            adj_list[u] = set()
-        if v not in adj_list:
-            adj_list[v] = set()
-        adj_list[u].add(v)
-        adj_list[v].add(u)
-    return adj_list
-
-def _get_induced_edges(edges: List[Tuple[int, int]], qubit_subset: Set[int]) -> List[Tuple[int, int]]:
-    return [edge for edge in edges if edge[0] in qubit_subset and edge[1] in qubit_subset]
-
-def _dfs_enumerate(adj_list: dict, k: int, callback):
-    all_qubits = sorted(adj_list.keys())
-    seen = set()
-    def dfs(current_qubits: Set[int], candidates: Set[int]):
-        if len(current_qubits) == k:
-            frozen = frozenset(current_qubits)
-            if frozen not in seen:
-                seen.add(frozen)
-                callback(current_qubits)
-            return
-        if len(current_qubits) + len(candidates) < k:
-            return
-        for node in sorted(candidates):
-            new_qubits = current_qubits | {node}
-            new_candidates = {neighbor for q in new_qubits for neighbor in adj_list[q] 
-                            if neighbor not in new_qubits and neighbor > node}
-            dfs(new_qubits, new_candidates)
-    for start in all_qubits:
-        dfs({start}, {n for n in adj_list[start] if n > start})
-
-def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
-    qubits = sorted(qubit_subset)
-    n = len(qubits)
-    best_edges = None
-    for perm in permutations(range(n)):
-        mapping = {qubits[i]: perm[i] for i in range(n)}
-        relabeled = tuple(sorted([tuple(sorted([mapping[u], mapping[v]])) for u, v in induced_edges]))
-        if best_edges is None or relabeled < best_edges:
-            best_edges = relabeled
-    return frozenset(best_edges)
-
-def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
-    if k <= 0:
-        return []
-    adj_list = _build_adj_list(edges)
-    if k == 1:
-        return [[]]
-    canonical_forms = {}
-    def process(qubits):
-        induced = _get_induced_edges(edges, qubits)
-        canonical = get_canonical_form(qubits, induced)
-        if canonical not in canonical_forms:
-            canonical_forms[canonical] = induced
-    _dfs_enumerate(adj_list, k, process)
-    return list(canonical_forms.values())
-
-def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]:
-    target_qubits = set()
-    for u, v in target_topology:
-        target_qubits.add(u)
-        target_qubits.add(v)
-    k = len(target_qubits) if target_qubits else 1
-    if k <= 0:
-        return []
-    adj_list = _build_adj_list(edges)
-    if k == 1:
-        return [[] for _ in adj_list.keys()]
-    target_canonical = get_canonical_form(target_qubits, target_topology)
-    matches = []
-    def process(qubits):
-        induced = _get_induced_edges(edges, qubits)
-        canonical = get_canonical_form(qubits, induced)
-        if canonical == target_canonical:
-            matches.append(induced)
-    _dfs_enumerate(adj_list, k, process)
-    return matches
-
-def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int, int]]) -> dict:
-    qubits1 = set()
-    for u, v in topology1:
-        qubits1.add(u)
-        qubits1.add(v)
-    qubits2 = set()
-    for u, v in topology2:
-        qubits2.add(u)
-        qubits2.add(v)
-    if len(qubits1) != len(qubits2):
-        return {}
-    sorted_qubits1 = sorted(qubits1)
-    sorted_qubits2 = sorted(qubits2)
-    n = len(sorted_qubits1)
-    for perm in permutations(range(n)):
-        mapping = {sorted_qubits1[i]: sorted_qubits2[perm[i]] for i in range(n)}
-        mapped_edges = set()
-        for u, v in topology1:
-            mapped_edges.add(tuple(sorted([mapping[u], mapping[v]])))
-        original_edges = set(tuple(sorted([u, v])) for u, v in topology2)
-        if mapped_edges == original_edges:
-            return mapping
-    return {}
-
 def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
     """
     Find SWAP sequence to route subset of virtual qubits to targets.
@@ -494,21 +394,6 @@ def heuristic(state):
     
     return None, None  # No solution found
 
-
-def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
-    dist_placeholder = 0
-    qbit_map_inv = { k:v for v,k in qbit_map.items()}
-    for u,v in mini_topology:
-        dist_placeholder += (dist_matrix[pi[qbit_map_inv[u]]][pi[qbit_map_inv[v]]]-1)*3
-    return dist_placeholder
-
-def extract_subtopology(involved_qbits, qbit_map, config ):
-    mini_topology = []
-    for edge in config["topology"]:
-        if edge[0] in involved_qbits and edge[1] in involved_qbits:
-            mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
-    return mini_topology
-    
 def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
     P_i_inv = [P_i.index(i) for i in range(len(P_i))]  # Compute inverse
     qbit_map_input = {k : node_mapping[P_i_inv[v]] for k,v in qbit_map.items()}
@@ -522,7 +407,6 @@ def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
         pi_tuple = tuple(pi_list)
         qbit_map_frozen = frozenset(qbit_map_input.items())
         cache_key = (pi_tuple, qbit_map_frozen)
-        cache_key = (pi_tuple, qbit_map_frozen)
         if cache_key in swap_cache:
             swaps, pi_init = swap_cache[cache_key]
         else:
@@ -532,6 +416,217 @@ def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
         swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
     return len(swaps)
 
+
+# ============================================================================
+# Topology Utilities
+# ============================================================================
+
+def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
+    adj_list = {}
+    for u, v in edges:
+        if u not in adj_list:
+            adj_list[u] = set()
+        if v not in adj_list:
+            adj_list[v] = set()
+        adj_list[u].add(v)
+        adj_list[v].add(u)
+    return adj_list
+
+def _get_induced_edges(edges: List[Tuple[int, int]], qubit_subset: Set[int]) -> List[Tuple[int, int]]:
+    return [edge for edge in edges if edge[0] in qubit_subset and edge[1] in qubit_subset]
+
+def _dfs_enumerate(adj_list: dict, k: int, callback):
+    all_qubits = sorted(adj_list.keys())
+    seen = set()
+    def dfs(current_qubits: Set[int], candidates: Set[int]):
+        if len(current_qubits) == k:
+            frozen = frozenset(current_qubits)
+            if frozen not in seen:
+                seen.add(frozen)
+                callback(current_qubits)
+            return
+        if len(current_qubits) + len(candidates) < k:
+            return
+        for node in sorted(candidates):
+            new_qubits = current_qubits | {node}
+            new_candidates = {neighbor for q in new_qubits for neighbor in adj_list[q] 
+                            if neighbor not in new_qubits and neighbor > node}
+            dfs(new_qubits, new_candidates)
+    for start in all_qubits:
+        dfs({start}, {n for n in adj_list[start] if n > start})
+
+def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
+    qubits = sorted(qubit_subset)
+    n = len(qubits)
+    best_edges = None
+    for perm in permutations(range(n)):
+        mapping = {qubits[i]: perm[i] for i in range(n)}
+        relabeled = tuple(sorted([tuple(sorted([mapping[u], mapping[v]])) for u, v in induced_edges]))
+        if best_edges is None or relabeled < best_edges:
+            best_edges = relabeled
+    return frozenset(best_edges)
+
+def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
+    if k <= 0:
+        return []
+    adj_list = _build_adj_list(edges)
+    if k == 1:
+        return [[]]
+    canonical_forms = {}
+    def process(qubits):
+        induced = _get_induced_edges(edges, qubits)
+        canonical = get_canonical_form(qubits, induced)
+        if canonical not in canonical_forms:
+            canonical_forms[canonical] = induced
+    _dfs_enumerate(adj_list, k, process)
+    return list(canonical_forms.values())
+
+def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]:
+    target_qubits = set()
+    for u, v in target_topology:
+        target_qubits.add(u)
+        target_qubits.add(v)
+    k = len(target_qubits) if target_qubits else 1
+    if k <= 0:
+        return []
+    adj_list = _build_adj_list(edges)
+    if k == 1:
+        return [[] for _ in adj_list.keys()]
+    target_canonical = get_canonical_form(target_qubits, target_topology)
+    matches = []
+    def process(qubits):
+        induced = _get_induced_edges(edges, qubits)
+        canonical = get_canonical_form(qubits, induced)
+        if canonical == target_canonical:
+            matches.append(induced)
+    _dfs_enumerate(adj_list, k, process)
+    return matches
+
+def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int, int]]) -> dict:
+    qubits1 = set()
+    for u, v in topology1:
+        qubits1.add(u)
+        qubits1.add(v)
+    qubits2 = set()
+    for u, v in topology2:
+        qubits2.add(u)
+        qubits2.add(v)
+    if len(qubits1) != len(qubits2):
+        return {}
+    sorted_qubits1 = sorted(qubits1)
+    sorted_qubits2 = sorted(qubits2)
+    n = len(sorted_qubits1)
+    for perm in permutations(range(n)):
+        mapping = {sorted_qubits1[i]: sorted_qubits2[perm[i]] for i in range(n)}
+        mapped_edges = set()
+        for u, v in topology1:
+            mapped_edges.add(tuple(sorted([mapping[u], mapping[v]])))
+        original_edges = set(tuple(sorted([u, v])) for u, v in topology2)
+        if mapped_edges == original_edges:
+            return mapping
+    return {}
+
+def extract_subtopology(involved_qbits, qbit_map, config ):
+    mini_topology = []
+    for edge in config["topology"]:
+        if edge[0] in involved_qbits and edge[1] in involved_qbits:
+            mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
+    return mini_topology
+
+
+# ============================================================================
+# Distance & Cost Calculations
+# ============================================================================
+
+def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
+    dist_placeholder = 0
+    qbit_map_inv = { k:v for v,k in qbit_map.items()}
+    for u,v in mini_topology:
+        dist_placeholder += (dist_matrix[pi[qbit_map_inv[u]]][pi[qbit_map_inv[v]]]-1)*3
+    return dist_placeholder
+
+def calculate_swap_cost(swaps, current_pi, used_qubits):
+    """
+    Calculate swap cost. Swaps involving unused qubits are costless (0).
+    unused qubits are those not in used_qubits set.
+    """
+    cost = 0
+    temp_pi = list(current_pi)
+    # Build inverse map for O(1) lookup: physical -> logical
+    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
+
+    for p1, p2 in swaps:
+        l1 = phys_to_logical[p1]
+        l2 = phys_to_logical[p2]
+
+        is_l1_unused = (l1 not in used_qubits)
+        is_l2_unused = (l2 not in used_qubits)
+
+        if is_l1_unused and is_l2_unused:
+            step_cost = 0
+        else:
+            step_cost = 3
+
+        cost += step_cost
+
+        # Update state
+        temp_pi[l1] = p2
+        temp_pi[l2] = p1
+        phys_to_logical[p1] = l2
+        phys_to_logical[p2] = l1
+
+    return cost
+
+def filter_required_swaps(swaps, current_pi, pi_initial, used_qubits):
+    """
+    Filter swaps that are effectively 'costless' (involve unused qubits).
+    Returns filtered swaps and the updated pi_initial.
+    """
+    required_swaps = []
+    temp_pi = list(current_pi)
+    
+    # pi_initial might be numpy array, convert to list for mutation if needed, 
+    # but we'll return a new list/array to be safe.
+    updated_pi_initial = list(pi_initial)
+    
+    # Build inverse map for O(1) lookup: physical -> logical
+    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
+
+    for p1, p2 in swaps:
+        l1 = phys_to_logical[p1]
+        l2 = phys_to_logical[p2]
+
+        is_l1_unused = (l1 not in used_qubits)
+        is_l2_unused = (l2 not in used_qubits)
+
+        if not (is_l1_unused and is_l2_unused):
+            required_swaps.append((p1, p2))
+        else:
+            # If unused, we update the initial mapping to reflect this swap
+            # effectively retconning that they started in these positions.
+            # pi_initial maps logical -> physical.
+            # We swap the physical locations for these logical qubits.
+            # Note: updated_pi_initial[l1] should track where l1 'started'.
+            # If we swap l1 and l2 physically, and it's costless, 
+            # it means l1 is now 'initially' at p2, and l2 at p1.
+            # But wait, temp_pi[l1] is currently p1. After swap it is p2.
+            # So we update pi_initial to match the new temp_pi.
+            updated_pi_initial[l1] = p2
+            updated_pi_initial[l2] = p1
+
+        # Always update the tracking state
+        temp_pi[l1] = p2
+        temp_pi[l2] = p1
+        phys_to_logical[p1] = l2
+        phys_to_logical[p2] = l1
+
+    return required_swaps, updated_pi_initial
+
+
+# ============================================================================
+# Data Classes
+# ============================================================================
+
 class SingleQubitPartitionResult:
     
     def __init__(self,circuit_in,parameters_in):
@@ -540,6 +635,7 @@ def __init__(self,circuit_in,parameters_in):
     
     def get_partition_synthesis_score(self):
         return 0
+
 # Virtual qubits q, reduced virtual qubits (the remapped circuit only up to partition_size) q*
 # Physical qubits Q, reduced physical qubits Q* 
 class PartitionSynthesisResult:
@@ -714,6 +810,11 @@ def get_final_circuit(self,optimized_partitions,N):
         part_circuit = part_circuit.Remap_Qbits(self.node_mapping, N)
         return part_circuit, part_parameters
 
+
+# ============================================================================
+# Circuit Utilities
+# ============================================================================
+
 def check_circuit_compatibility(circuit: Circuit, topology):
     circuit_topology = []
     gates = circuit.get_Gates()
@@ -739,83 +840,6 @@ def check_circuit_compatibility(circuit: Circuit, topology):
             return False
     return True
 
-def calculate_swap_cost(swaps, current_pi, used_qubits):
-    """
-    Calculate swap cost. Swaps involving unused qubits are costless (0).
-    unused qubits are those not in used_qubits set.
-    """
-    cost = 0
-    temp_pi = list(current_pi)
-    # Build inverse map for O(1) lookup: physical -> logical
-    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
-
-    for p1, p2 in swaps:
-        l1 = phys_to_logical[p1]
-        l2 = phys_to_logical[p2]
-
-        is_l1_unused = (l1 not in used_qubits)
-        is_l2_unused = (l2 not in used_qubits)
-
-        if is_l1_unused and is_l2_unused:
-            step_cost = 0
-        else:
-            step_cost = 3
-
-        cost += step_cost
-
-        # Update state
-        temp_pi[l1] = p2
-        temp_pi[l2] = p1
-        phys_to_logical[p1] = l2
-        phys_to_logical[p2] = l1
-
-    return cost
-
-def filter_required_swaps(swaps, current_pi, pi_initial, used_qubits):
-    """
-    Filter swaps that are effectively 'costless' (involve unused qubits).
-    Returns filtered swaps and the updated pi_initial.
-    """
-    required_swaps = []
-    temp_pi = list(current_pi)
-    
-    # pi_initial might be numpy array, convert to list for mutation if needed, 
-    # but we'll return a new list/array to be safe.
-    updated_pi_initial = list(pi_initial)
-    
-    # Build inverse map for O(1) lookup: physical -> logical
-    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
-
-    for p1, p2 in swaps:
-        l1 = phys_to_logical[p1]
-        l2 = phys_to_logical[p2]
-
-        is_l1_unused = (l1 not in used_qubits)
-        is_l2_unused = (l2 not in used_qubits)
-
-        if not (is_l1_unused and is_l2_unused):
-            required_swaps.append((p1, p2))
-        else:
-            # If unused, we update the initial mapping to reflect this swap
-            # effectively retconning that they started in these positions.
-            # pi_initial maps logical -> physical.
-            # We swap the physical locations for these logical qubits.
-            # Note: updated_pi_initial[l1] should track where l1 'started'.
-            # If we swap l1 and l2 physically, and it's costless, 
-            # it means l1 is now 'initially' at p2, and l2 at p1.
-            # But wait, temp_pi[l1] is currently p1. After swap it is p2.
-            # So we update pi_initial to match the new temp_pi.
-            updated_pi_initial[l1] = p2
-            updated_pi_initial[l2] = p1
-
-        # Always update the tracking state
-        temp_pi[l1] = p2
-        temp_pi[l2] = p1
-        phys_to_logical[p1] = l2
-        phys_to_logical[p2] = l1
-
-    return required_swaps, updated_pi_initial
-
 def construct_swap_circuit(swap_order, N):
     swap_circ = Circuit(N)
     for swap in swap_order:

From 0eefb766696b54832ce55cfb0172faae670feed7 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 9 Dec 2025 01:17:10 +0100
Subject: [PATCH 062/232] Add in new lookahead functionality and SWAP penalty

---
 squander/synthesis/PartAM.py       | 233 +++++++++++++++++------------
 squander/synthesis/PartAM_utils.py | 108 +++----------
 2 files changed, 161 insertions(+), 180 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 567891d3d..3036be572 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -39,8 +39,6 @@
     check_circuit_compatibility,
     construct_swap_circuit,
     calculate_dist_small,
-    calculate_swap_cost,
-    filter_required_swaps
 )
 
 
@@ -82,7 +80,7 @@ def _init_scoring_worker(scoring_partitions, sdag, distance_matrix):
 def _score_candidate_worker(payload):
     """
     Worker wrapper that reconstructs scoring inputs from a lightweight payload.
-    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot, used_qubits)
+    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot)
     """
     if (
         _WORKER_SCORING_PARTITIONS is None
@@ -90,7 +88,7 @@ def _score_candidate_worker(payload):
         or _WORKER_DISTANCE_MATRIX is None
     ):
         raise RuntimeError("Scoring worker not initialized with shared data.")
-    partition_candidate, F_snapshot, pi_snapshot, used_qubits = payload
+    partition_candidate, F_snapshot, pi_snapshot, lookahead_gates = payload
     return qgd_Partition_Aware_Mapping.score_partition_candidate(
         partition_candidate,
         F_snapshot,
@@ -99,7 +97,7 @@ def _score_candidate_worker(payload):
         _WORKER_S_DAG,
         _WORKER_DISTANCE_MATRIX,
         _WORKER_SWAP_CACHE,
-        used_qubits,
+        lookahead_gates
     )
 
 
@@ -254,6 +252,7 @@ def DecomposePartition_Full(Partition_circuit: Circuit, Partition_parameters: np
         Call to decompose a partition sequentially
         """
         N = Partition_circuit.get_Qbit_Num()
+        print(N)
         if N !=1:
             permutations_all = list(permutations(range(N)))
             result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
@@ -287,7 +286,7 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
         else:
             raise Exception(f"Unsupported decomposition type: {strategy}")
         cDecompose.set_Verbose( config["verbosity"] )
-        cDecompose.set_Cost_Function_Variant( 3 )	
+        cDecompose.set_Cost_Function_Variant( 3 )    
         cDecompose.set_Optimization_Tolerance( config["tolerance"] )
         cDecompose.set_Optimizer( config["optimizer"] )
         cDecompose.Start_Decomposition()
@@ -428,7 +427,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
 
     def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, sDAG):
         pi_initial = pi.copy()
-        used_qubits = set()
+
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
@@ -459,6 +458,12 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
 
         try:
             while len(F) != 0:
+                lookahead_partitions = list(F)[:5]
+                lookahead_gates = []
+                for idx in lookahead_partitions:
+                    if idx < len(optimized_partitions):
+                        lookahead_gates.extend(optimized_partitions[idx].get_original_circuit_structure())
+                lookahead_gates = None
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
                     break
@@ -466,7 +471,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 if executor is not None:
                     pi_snapshot = tuple(int(x) for x in pi)
                     payloads = [
-                        (partition_candidate, F_snapshot, pi_snapshot, used_qubits)
+                        (partition_candidate, F_snapshot, pi_snapshot, lookahead_gates)
                         for partition_candidate in partition_candidates
                     ]
                     scores = list(executor.map(_score_candidate_worker, payloads))
@@ -480,7 +485,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             sDAG,
                             D,
                             self._swap_cache,
-                            used_qubits,
+                            lookahead_gates
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -495,11 +500,8 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 pi_prev = pi # Save previous pi state for filtering
                 swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
                 if len(swap_order)!=0:
-                    filtered_swap_order, pi_initial = filter_required_swaps(swap_order, pi_prev, pi_initial, used_qubits)
-                    partition_order.append(construct_swap_circuit(filtered_swap_order, len(pi)))
+                    partition_order.append(construct_swap_circuit(swap_order, len(pi)))
                 
-                # Add involved qubits to used set
-                used_qubits.update(min_partition_candidate.involved_qbits)
                 
                 partition_order.append(min_partition_candidate)
                 children = DAG[min_partition_candidate.partition_idx]
@@ -515,8 +517,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             qubit = child_partition.circuit.get_Qbits()[0]
                             child_partition.circuit.map_circuit({qubit: pi[qubit]})
                             partition_order.append(child_partition)
-                            # Update used qubits for single qubit partition
-                            used_qubits.add(qubit) 
                             
                             resolved_partitions[child] = True
                             resolved_count = sum(resolved_partitions)
@@ -568,14 +568,14 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, used_qubits):
+    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, lookahead_gates=None):
         score_F = 0
         score_E = 0
         E_partitions = set()  # Changed to set for O(1) membership checks
         E_partitions_1 = set()
         E_partitions_2 = set()
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
-        score_F += calculate_swap_cost(swaps, pi, used_qubits)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, lookahead_gates)
+        score_F += len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
 
         # Safety check: ensure partition_idx is valid for sDAG
@@ -592,7 +592,7 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                 continue
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                dist_placeholder = calculate_dist_small(mini_topology,partition.qubit_map,D,output_perm)
+                dist_placeholder = 3*calculate_dist_small(mini_topology,partition.qubit_map,D,output_perm)
                 circuit_length = np.min([len(circ) for circ in partition.circuit_structures[tdx]])
                 score = dist_placeholder + circuit_length
                 mini_scores.append(score)
@@ -614,28 +614,28 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                         continue
                     E_partitions_1.add(partition_idx_E)
         #score all
-        for partition_idx in E_partitions.union(E_partitions_1):
+        for partition_idx in E_partitions:
             mini_scores = []
             partition_result = scoring_partitions[partition_idx]
             if partition_result is None:
                 continue
             for tdx, mini_topology in enumerate(partition_result.mini_topologies):
-                dist_placeholder = calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
+                dist_placeholder = 3*calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
                 circuit_length = np.min([len(circ) for circ in partition_result.circuit_structures[tdx]])
                 score = dist_placeholder + circuit_length
                 mini_scores.append(score)
             if mini_scores:
                 score_E += np.min(mini_scores)
 
-        coeff_E = 0.5
-        if len(E_partitions.union(E_partitions_1)) == 0:
+        coeff_E = 0.3
+        if len(E_partitions) == 0:
             E_score = 0.0
         else:
-            E_score = coeff_E * score_E / len(E_partitions.union(E_partitions_1))
+            E_score = coeff_E * score_E / len(E_partitions)
         
-        F_score = score_F / len(F)
+        F_score = score_F
         
-        return E_score + F_score
+        return E_score + 0.7*F_score
 
     # ------------------------------------------------------------------------
     # Candidate Generation
@@ -763,89 +763,132 @@ def compute_distances_bfs(self, N):
         
         return D #multiply by 3 to make it CNOT cost instead of SWAP cost
 
-    def _compute_smart_initial_layout(self, circuit, N, D):
 
-        # Count interactions between qubits
-        interaction_count = defaultdict(int)
+    def _compute_smart_initial_layout(self, circuit, N, D):
+        """
+        Compute initial layout using interaction graph + simulated annealing.
+        Much better than the greedy approach.
+        """
+        # Build interaction graph: weight = number of CNOTs between qubits
+        interaction_graph = defaultdict(int)
         gates = circuit.get_Gates()
         
         for gate in gates:
             if gate.get_Control_Qbit() != -1:
-                q1 = gate.get_Target_Qbit()
-                q2 = gate.get_Control_Qbit()
+                q1, q2 = sorted([gate.get_Target_Qbit(), gate.get_Control_Qbit()])
                 if q1 < N and q2 < N:
-                    key = (min(q1, q2), max(q1, q2))
-                    interaction_count[key] += 1
+                    interaction_graph[(q1, q2)] += 1
         
-        if not interaction_count:
-            # No 2-qubit gates, use trivial mapping
+        # If no 2-qubit gates, return identity
+        if not interaction_graph:
             return np.arange(N)
         
-        # Find most interacting qubit pair
-        most_connected = max(interaction_count.items(), key=lambda x: x[1])
-        q1, q2 = most_connected[0]
+        # Start with greedy mapping as baseline
+        pi_greedy = self._greedy_initial_layout(interaction_graph, N, D)
+        best_pi = pi_greedy.copy()
+        best_score = self._evaluate_layout_score(best_pi, interaction_graph, D)
         
-        # Find physical qubits that are connected
-        # Start with an arbitrary connected pair
-        for edge in self.config['topology']:
-            p1, p2 = edge
-            break  # Just take first edge
+        # Simulated annealing to improve
+        current_pi = best_pi.copy()
+        current_score = best_score
         
-        # Initialize mapping
-        pi = np.arange(N)
-        
-        # Place most interacting qubits on connected physical qubits
-        pi[q1] = p1
-        pi[q2] = p2
-        
-        # Place other qubits using greedy approach
-        placed_logical = {q1, q2}
-        placed_physical = {p1, p2}
+        # Temperature schedule
+        max_iter = 100 * N
+        for iteration in range(max_iter):
+            temp = 1.0 - (iteration / max_iter)
+            
+            # Propose swap of two physical qubits
+            p1, p2 = np.random.choice(N, 2, replace=False)
+            new_pi = current_pi.copy()
+            new_pi[p1], new_pi[p2] = new_pi[p2], new_pi[p1]  # Swap assignments
+            
+            # Evaluate new layout
+            new_score = self._evaluate_layout_score(new_pi, interaction_graph, D)
+            
+            # Accept if better or with probability
+            delta = new_score - current_score
+            if delta < 0 or np.random.random() < np.exp(-delta / (temp + 1e-6)):
+                current_pi = new_pi
+                current_score = new_score
+                
+                if current_score < best_score:
+                    best_score = current_score
+                    best_pi = current_pi.copy()
         
-        # For each remaining logical qubit, find where to place it
+        return best_pi
+    
+    def _greedy_initial_layout(self, interaction_graph, N, D):
+        """Greedy baseline mapping - much simpler and reliable"""
+        pi = np.arange(N)
+        placed_logical = set()
+        placed_physical = set()
+        
+        # Sort interactions by weight (descending)
+        sorted_interactions = sorted(
+            interaction_graph.items(), 
+            key=lambda x: x[1], 
+            reverse=True
+        )
+        
+        # Place highest interaction pair first
+        if sorted_interactions:
+            (q1, q2), _ = sorted_interactions[0]
+            # Find closest physical pair
+            min_dist = float('inf')
+            best_pair = None
+            for p1 in range(N):
+                for p2 in range(p1 + 1, N):
+                    if D[p1][p2] < min_dist:
+                        min_dist = D[p1][p2]
+                        best_pair = (p1, p2)
+            
+            if best_pair:
+                p1, p2 = best_pair
+                pi[q1] = p1
+                pi[q2] = p2
+                placed_logical = {q1, q2}
+                placed_physical = {p1, p2}
+        
+        # Place remaining qubits
         remaining_logical = [q for q in range(N) if q not in placed_logical]
-        
-        # Sort by how much they interact with already placed qubits
-        def interaction_score(q):
-            score = 0
-            for placed_q in placed_logical:
-                key = (min(q, placed_q), max(q, placed_q))
-                score += interaction_count.get(key, 0)
-            return score
-        
-        remaining_logical.sort(key=interaction_score, reverse=True)
-        
-        # Place them near their interacting partners
-        for logical_q in remaining_logical:
-            # Find best physical location
-            best_physical = None
-            best_score = float('inf')
+        for q in remaining_logical:
+            best_p = None
+            best_cost = float('inf')
             
-            for physical_q in range(N):
-                if physical_q not in placed_physical:
-                    # Calculate average distance to interacting qubits
-                    total_dist = 0
-                    count = 0
-                    for other_q in placed_logical:
-                        key = (min(logical_q, other_q), max(logical_q, other_q))
-                        weight = interaction_count.get(key, 0)
-                        if weight > 0:
-                            other_physical = pi[other_q]
-                            total_dist += D[physical_q][other_physical] * weight
-                            count += weight
-                    
-                    if count > 0:
-                        avg_dist = total_dist / count
-                    else:
-                        avg_dist = 0
-                    
-                    if avg_dist < best_score:
-                        best_score = avg_dist
-                        best_physical = physical_q
+            for p in range(N):
+                if p in placed_physical:
+                    continue
+                
+                # Cost = sum of distances to already placed interacting qubits
+                cost = 0
+                for other_q in placed_logical:
+                    weight = interaction_graph.get(tuple(sorted((q, other_q))), 0)
+                    if weight > 0:
+                        other_p = pi[other_q]
+                        cost += D[p][other_p] * weight
+                
+                if cost < best_cost:
+                    best_cost = cost
+                    best_p = p
             
-            if best_physical is not None:
-                pi[logical_q] = best_physical
-                placed_logical.add(logical_q)
-                placed_physical.add(best_physical)
+            if best_p is not None:
+                pi[q] = best_p
+                placed_logical.add(q)
+                placed_physical.add(best_p)
         
         return pi
+    
+    def _evaluate_layout_score(self, pi, interaction_graph, D):
+        """
+        Evaluate layout quality: lower score is better.
+        Score = sum(distance(physical_q1, physical_q2) * interaction_weight)
+        """
+        score = 0.0
+        for (q1, q2), weight in interaction_graph.items():
+            p1, p2 = pi[q1], pi[q2]
+            distance = D[p1][p2]
+            if np.isinf(distance):
+                return float('inf')  # Invalid layout
+            score += distance * weight
+        
+        return score
\ No newline at end of file
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index bcb830f9e..d190e0a61 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -292,7 +292,7 @@ def find_constrained_swaps_ILP(pi_A, pi_B_dict, dist_matrix, use_gurobi=True):
     
     return swaps, final_perm
 
-def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
+def find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix,lookahead_gates=None):
     """
     Find SWAP sequence to route subset of virtual qubits to targets.
     
@@ -341,7 +341,7 @@ def is_goal(state):
             if state[target_P] != q:  # Physical position target_P should contain virtual q
                 return False
         return True
-    
+
     def heuristic(state):
         """Lower bound: sum of distances for qubits needing routing"""
         total = 0.0
@@ -359,8 +359,21 @@ def heuristic(state):
             total += float(distance)
         return math.floor(total / 2)  # Optimistic: each SWAP helps 2 qubits
     
-    # A* search
-    heap = [(heuristic(start_state), 0, start_state, [])]
+    def heuristic_with_lookahead(state):
+        base_cost = heuristic(state)
+        
+        if lookahead_gates:
+            # Add penalty for gates that would require swaps
+            for q1, q2 in lookahead_gates[:5]:  # Look at next 5 gates
+                p1 = state[q1]
+                p2 = state[q2]
+                if dist_matrix[p1][p2] > 1:
+                    base_cost += dist_matrix[p1][p2] - 1
+        
+        return base_cost
+    
+    # Use this heuristic in A*
+    heap = [(heuristic_with_lookahead(start_state), 0, start_state, [])]
     visited = {start_state: 0}
     
     while heap:
@@ -394,6 +407,9 @@ def heuristic(state):
     
     return None, None  # No solution found
 
+def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix,lookahead_gates=None):
+    return find_constrained_swaps_A_star(pi_A,pi_B_dict,dist_matrix,lookahead_gates)
+
 def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
     P_i_inv = [P_i.index(i) for i in range(len(P_i))]  # Compute inverse
     qbit_map_input = {k : node_mapping[P_i_inv[v]] for k,v in qbit_map.items()}
@@ -545,84 +561,6 @@ def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
         dist_placeholder += (dist_matrix[pi[qbit_map_inv[u]]][pi[qbit_map_inv[v]]]-1)*3
     return dist_placeholder
 
-def calculate_swap_cost(swaps, current_pi, used_qubits):
-    """
-    Calculate swap cost. Swaps involving unused qubits are costless (0).
-    unused qubits are those not in used_qubits set.
-    """
-    cost = 0
-    temp_pi = list(current_pi)
-    # Build inverse map for O(1) lookup: physical -> logical
-    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
-
-    for p1, p2 in swaps:
-        l1 = phys_to_logical[p1]
-        l2 = phys_to_logical[p2]
-
-        is_l1_unused = (l1 not in used_qubits)
-        is_l2_unused = (l2 not in used_qubits)
-
-        if is_l1_unused and is_l2_unused:
-            step_cost = 0
-        else:
-            step_cost = 3
-
-        cost += step_cost
-
-        # Update state
-        temp_pi[l1] = p2
-        temp_pi[l2] = p1
-        phys_to_logical[p1] = l2
-        phys_to_logical[p2] = l1
-
-    return cost
-
-def filter_required_swaps(swaps, current_pi, pi_initial, used_qubits):
-    """
-    Filter swaps that are effectively 'costless' (involve unused qubits).
-    Returns filtered swaps and the updated pi_initial.
-    """
-    required_swaps = []
-    temp_pi = list(current_pi)
-    
-    # pi_initial might be numpy array, convert to list for mutation if needed, 
-    # but we'll return a new list/array to be safe.
-    updated_pi_initial = list(pi_initial)
-    
-    # Build inverse map for O(1) lookup: physical -> logical
-    phys_to_logical = {p: l for l, p in enumerate(temp_pi)}
-
-    for p1, p2 in swaps:
-        l1 = phys_to_logical[p1]
-        l2 = phys_to_logical[p2]
-
-        is_l1_unused = (l1 not in used_qubits)
-        is_l2_unused = (l2 not in used_qubits)
-
-        if not (is_l1_unused and is_l2_unused):
-            required_swaps.append((p1, p2))
-        else:
-            # If unused, we update the initial mapping to reflect this swap
-            # effectively retconning that they started in these positions.
-            # pi_initial maps logical -> physical.
-            # We swap the physical locations for these logical qubits.
-            # Note: updated_pi_initial[l1] should track where l1 'started'.
-            # If we swap l1 and l2 physically, and it's costless, 
-            # it means l1 is now 'initially' at p2, and l2 at p1.
-            # But wait, temp_pi[l1] is currently p1. After swap it is p2.
-            # So we update pi_initial to match the new temp_pi.
-            updated_pi_initial[l1] = p2
-            updated_pi_initial[l2] = p1
-
-        # Always update the tracking state
-        temp_pi[l1] = p2
-        temp_pi[l2] = p1
-        phys_to_logical[p1] = l2
-        phys_to_logical[p2] = l1
-
-    return required_swaps, updated_pi_initial
-
-
 # ============================================================================
 # Data Classes
 # ============================================================================
@@ -765,7 +703,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi(self, pi, D, swap_cache=None):
+    def transform_pi(self, pi, D, swap_cache=None, lookahead_gates=None):
         # Fixed: Use P_i^{-1} instead of P_i for input routing
         # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
         # For Original to see logical qubit q* at partition position q*, we need:
@@ -787,10 +725,10 @@ def transform_pi(self, pi, D, swap_cache=None):
             if cache_key in swap_cache:
                 swaps, pi_init = swap_cache[cache_key]
             else:
-                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, lookahead_gates)
                 swap_cache[cache_key] = (swaps, pi_init)
         else:
-            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, lookahead_gates)
         
         pi_output = pi_init.copy()
         # Fixed: P_o should be indexed by partition virtual index q*, not physical index Q*

From 4b29853ef08fb650643c5a6c8fa4eedb5be7ccc0 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 9 Dec 2025 01:33:42 +0100
Subject: [PATCH 063/232] Update P_o

---
 squander/synthesis/PartAM_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index d190e0a61..7ca446ad8 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -738,7 +738,7 @@ def transform_pi(self, pi, D, swap_cache=None, lookahead_gates=None):
         for q_star in range(len(self.P_o)):
             if q_star in qbit_map_inverse:
                 k = qbit_map_inverse[q_star]
-                pi_output[k] = self.node_mapping[self.P_o[q_star]]
+                pi_output[k] = self.node_mapping[self.P_o.index(self.qbit_map[k])] 
         return swaps, pi_output
     
     def get_final_circuit(self,optimized_partitions,N):

From dda5546b92c2d0ad461a74ec9eb245a338041249 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 9 Dec 2025 01:49:47 +0100
Subject: [PATCH 064/232] Revert stupid change

---
 squander/synthesis/PartAM_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 7ca446ad8..d190e0a61 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -738,7 +738,7 @@ def transform_pi(self, pi, D, swap_cache=None, lookahead_gates=None):
         for q_star in range(len(self.P_o)):
             if q_star in qbit_map_inverse:
                 k = qbit_map_inverse[q_star]
-                pi_output[k] = self.node_mapping[self.P_o.index(self.qbit_map[k])] 
+                pi_output[k] = self.node_mapping[self.P_o[q_star]]
         return swaps, pi_output
     
     def get_final_circuit(self,optimized_partitions,N):

From 00826f12428f50726a31f30111160f051d32c175 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 10 Dec 2025 12:32:40 +0100
Subject: [PATCH 065/232] Add in single qubit partition handling

---
 examples/decomposition/PartAM_example.py      |  11 +-
 .../qgd_Wide_Circuit_Optimization.py          | 124 ++++++++++++------
 squander/synthesis/PartAM.py                  |  28 ++--
 squander/synthesis/PartAM_utils.py            |   1 +
 4 files changed, 115 insertions(+), 49 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index cffdda844..1e2e54e3e 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -38,16 +38,22 @@
             'progressbar': True,  # Enable diagnostic output
     }
 
-    filename = "benchmarks/qfast/4q/adder_q4.qasm"
+    filename = "benchmarks/qfast/5q/vqe.qasm"
     start_time = time.time()
 
     # load the circuit from a file
     circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
     config['topology'] = [
-    (0, 1), (0, 2), (0, 3), 
+    (0, 1), (0, 2), (0, 3), (0, 4)
     ]
     wide_circuit_optimizer = Partition_Aware_Mapping( config )
     circ, params, input_perm,output_perm = wide_circuit_optimizer.Partition_Aware_Mapping( circ_orig, parameters_orig )
+    wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
+    config['routed'] = True 
+    circo = Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params)
+    # run circuti optimization
+    circ, params = Qiskit_IO.convert_Qiskit_to_Squander(circo)
+    circ, params = wide_circuit_optimizer.OptimizeWideCircuit( circ, params, True )
     #print(Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params))
     num_qubits = circ.get_Qbit_Num() 
     matrix_size = 1 << num_qubits 
@@ -66,7 +72,6 @@
     circ_Final.add_Permutation(input_perm_list)
     circ_Final.add_Circuit(circ)
     circ_Final.add_Permutation(output_perm_T)
-    
     # Additional matrix validation in example     
     PartAM_state = initial_state.copy()
     circ_Final.apply_to(params, PartAM_state)
diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 1bbc02ff5..3fc285e40 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -34,7 +34,8 @@ def extract_subtopology(involved_qbits, qbit_map, config ):
 
 def CNOTGateCount( circ: Circuit ) -> int :
     """
-    Call to get the number of CNOT gates in the circuit
+    Call to get the number of CNOT-equivalent gates in the circuit.
+    Counts all two-qubit gates, converting them to CNOT equivalents.
 
     
     Args:
@@ -44,7 +45,7 @@ def CNOTGateCount( circ: Circuit ) -> int :
 
     Return:
 
-        Returns with the CNOT gate count
+        Returns with the CNOT-equivalent gate count (all two-qubit gates counted)
 
     
     """ 
@@ -54,7 +55,19 @@ def CNOTGateCount( circ: Circuit ) -> int :
 
     gate_counts = circ.get_Gate_Nums()
 
-    return gate_counts.get('CNOT', 0) +  3*gate_counts.get('SWAP', 0)
+    # Count all two-qubit gates
+    # CNOT gates count as 1
+    cnot_count = gate_counts.get('CNOT', 0)
+    
+    # SWAP gates count as 3 CNOTs
+    swap_count = 3 * gate_counts.get('SWAP', 0)
+    
+    # Other two-qubit gates count as 1 CNOT each
+    # CZ, CH, SYC, CRY, CRX, CRZ, CP, CROT, CR, CU are all two-qubit gates
+    two_qubit_gates = ['CZ', 'CH', 'SYC', 'CRY', 'CRX', 'CRZ', 'CP', 'CROT', 'CR', 'CU']
+    other_two_qubit_count = sum(gate_counts.get(gate_name, 0) for gate_name in two_qubit_gates)
+    
+    return cnot_count + swap_count + other_two_qubit_count
 
 
 
@@ -195,43 +208,50 @@ def DecomposePartition( Umtx: np.ndarray, config: dict, mini_topology = None ) -
 
     
         """ 
-        strategy = config["strategy"]
-        if strategy == "TreeSearch":
-            cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
-        elif strategy == "TabuSearch":
-            cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
-        elif strategy == "Adaptive":
-            cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
-        else:
-            raise Exception(f"Unsupported decomposition type: {strategy}")
-
-
-        tolerance = config["tolerance"]
-            
-            
-        cDecompose.set_Verbose( config["verbosity"] )
-        cDecompose.set_Cost_Function_Variant( 3 )	
-        cDecompose.set_Optimization_Tolerance( tolerance )
-    
+        try:
+            strategy = config["strategy"]
+            if strategy == "TreeSearch":
+                cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+            elif strategy == "TabuSearch":
+                cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
+            elif strategy == "Adaptive":
+                cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
+            else:
+                raise Exception(f"Unsupported decomposition type: {strategy}")
+
+
+            tolerance = config["tolerance"]
+                
+                
+            cDecompose.set_Verbose( config["verbosity"] )
+            cDecompose.set_Cost_Function_Variant( 3 )	
+            cDecompose.set_Optimization_Tolerance( tolerance )
+        
 
-        # adding new layer to the decomposition until threshold
-        cDecompose.set_Optimizer( "BFGS" )
+            # adding new layer to the decomposition until threshold
+            cDecompose.set_Optimizer( "BFGS" )
 
-        # starting the decomposition
-        cDecompose.Start_Decomposition()
-            
+            # starting the decomposition
+            cDecompose.Start_Decomposition()
+                
 
-        squander_circuit = cDecompose.get_Circuit()
-        parameters       = cDecompose.get_Optimized_Parameters()
+            squander_circuit = cDecompose.get_Circuit()
+            parameters       = cDecompose.get_Optimized_Parameters()
 
 
-        #print( "Decomposition error: ", cDecompose.get_Decomposition_Error() )
+            #print( "Decomposition error: ", cDecompose.get_Decomposition_Error() )
 
-        if tolerance < cDecompose.get_Decomposition_Error():
-            return None, None
+            if tolerance < cDecompose.get_Decomposition_Error():
+                return None, None
 
 
-        return squander_circuit, parameters
+            return squander_circuit, parameters
+        except Exception as e:
+            # Catch C++ exceptions and other errors during decomposition
+            if config.get("verbosity", 0) > 0:
+                print(f"Warning: Decomposition failed with error: {e}")
+            # Re-raise to be caught by caller
+            raise
 
 
 
@@ -311,10 +331,24 @@ def PartitionDecompositionProcess( subcircuit: Circuit, subcircuit_parameters: n
         remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
 
         # get the unitary representing the circuit
-        unitary = remapped_subcircuit.get_Matrix( subcircuit_parameters )
+        # Wrap in try-except to catch C++ exceptions (e.g., from non-CNOT gates)
+        try:
+            unitary = remapped_subcircuit.get_Matrix( subcircuit_parameters )
+        except Exception as e:
+            # If get_Matrix fails (e.g., due to unsupported gate types), return original circuit
+            if config.get("verbosity", 0) > 0:
+                print(f"Warning: Failed to get matrix for subcircuit: {e}. Using original circuit.")
+            return subcircuit, subcircuit_parameters
 
         # decompose a small unitary into a new circuit
-        decomposed_circuit, decomposed_parameters = qgd_Wide_Circuit_Optimization.DecomposePartition( unitary, config, mini_topology )
+        # Wrap in try-except to catch C++ exceptions during decomposition
+        try:
+            decomposed_circuit, decomposed_parameters = qgd_Wide_Circuit_Optimization.DecomposePartition( unitary, config, mini_topology )
+        except Exception as e:
+            # If decomposition fails, return original circuit
+            if config.get("verbosity", 0) > 0:
+                print(f"Warning: Decomposition failed: {e}. Using original circuit.")
+            return subcircuit, subcircuit_parameters
 
         if decomposed_circuit is None:
             return subcircuit, subcircuit_parameters #remaining code will fail, just return original circuit
@@ -430,7 +464,9 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                 callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, x[0]], [subcircuit_parameters, x[1]] ) 
 
                 # call a process to decompose a subcircuit
-                config = self.config if not global_min or len(subcircuit.get_Qbits()) < 3 else {**self.config, 'tree_level_max': max(0, subcircuit.get_Gate_Nums().get('CNOT', 0)-1) } # 'strategy': "Adaptive"}
+                # Use total two-qubit gate count for tree_level_max calculation to handle non-CNOT gates
+                two_qubit_count = CNOTGateCount(subcircuit)
+                config = self.config if not global_min or len(subcircuit.get_Qbits()) < 3 else {**self.config, 'tree_level_max': max(0, two_qubit_count-1) } # 'strategy': "Adaptive"}
                 async_results[partition_idx]  = pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config), callback=callback_fnc )
 
 
@@ -447,7 +483,12 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                     print( "reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums()) 
                 '''
 
-                if new_subcircuit.get_Gate_Nums().get('CNOT', 0) < subcircuit.get_Gate_Nums().get('CNOT', 0):
+                # Compare using total two-qubit gate count instead of just CNOT count
+                # This properly handles circuits with non-CNOT two-qubit gates
+                original_count = CNOTGateCount(subcircuit)
+                new_count = CNOTGateCount(new_subcircuit)
+                
+                if new_count < original_count:
                     optimized_subcircuits[ partition_idx ] = new_subcircuit
                     optimized_parameter_list[ partition_idx ] = new_parameters
                 else:
@@ -461,8 +502,15 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
         # construct the wide circuit from the optimized suncircuits
         if global_min:
             max_gates = max(len(c.get_Gates()) for c in optimized_subcircuits)
-            def to_cost(d): return d.get('CNOT', 0)*max_gates + sum(d[x] for x in d if x != 'CNOT')
-            weights = [to_cost(circ.get_Gate_Nums()) for circ in optimized_subcircuits[:len(allparts)]]
+            # Use CNOTGateCount to properly account for all two-qubit gates
+            def to_cost(circ): 
+                gate_nums = circ.get_Gate_Nums()
+                two_qubit_cost = CNOTGateCount(circ) * max_gates
+                # Add cost for single-qubit gates
+                single_qubit_gates = ['U3', 'U2', 'U1', 'RX', 'RY', 'RZ', 'R', 'H', 'X', 'Y', 'Z', 'SX', 'S', 'Sdg', 'T', 'Tdg']
+                single_qubit_cost = sum(gate_nums.get(gate, 0) for gate in single_qubit_gates)
+                return two_qubit_cost + single_qubit_cost
+            weights = [to_cost(circ) for circ in optimized_subcircuits[:len(allparts)]]
             L, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
             parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L], fusion_info)
             L = topo_sort_partitions(circ, self.max_partition_size, parts)
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3036be572..95f113720 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -431,7 +431,24 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
-        
+        for partition_idx in F:
+            if isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
+                F.remove(partition_idx)
+                single_qubit_part = optimized_partitions[partition_idx]
+                qubit = single_qubit_part.circuit.get_Qbits()[0]
+                single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
+                partition_order.append(single_qubit_part)
+
+                resolved_partitions[partition_idx] = True
+                children = DAG[partition_idx]
+                while len(children) !=0:
+                    child = children.pop(0)
+                    parents_resolved = True
+                    for parent in IDAG[child]:
+                        parents_resolved *= resolved_partitions[parent]
+                    if parents_resolved:
+                        F.append(child)
+
         
         # Initialize progress bar
         total_partitions = len(DAG)
@@ -458,11 +475,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
 
         try:
             while len(F) != 0:
-                lookahead_partitions = list(F)[:5]
-                lookahead_gates = []
-                for idx in lookahead_partitions:
-                    if idx < len(optimized_partitions):
-                        lookahead_gates.extend(optimized_partitions[idx].get_original_circuit_structure())
                 lookahead_gates = None
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
@@ -515,9 +527,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                         if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
                             child_partition = optimized_partitions[child]
                             qubit = child_partition.circuit.get_Qbits()[0]
-                            child_partition.circuit.map_circuit({qubit: pi[qubit]})
+                            print(int(qubit),int(pi[qubit]))
+                            child_partition.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
                             partition_order.append(child_partition)
-                            
                             resolved_partitions[child] = True
                             resolved_count = sum(resolved_partitions)
                             pbar.n = resolved_count
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index d190e0a61..e5ccaab86 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -570,6 +570,7 @@ class SingleQubitPartitionResult:
     def __init__(self,circuit_in,parameters_in):
         self.circuit = circuit_in
         self.parameters = parameters_in
+        self.involved_qbits = circuit_in.get_Qbits()
     
     def get_partition_synthesis_score(self):
         return 0

From d357d7d579018b12b36cfce4121bca7a4e40fdf9 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 11 Dec 2025 14:54:02 +0100
Subject: [PATCH 066/232] Add in SWAP weight to cost function

---
 squander/synthesis/PartAM.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 95f113720..eda65aead 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -586,8 +586,9 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
         E_partitions = set()  # Changed to set for O(1) membership checks
         E_partitions_1 = set()
         E_partitions_2 = set()
+        swap_weight = 4
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, lookahead_gates)
-        score_F += len(swaps)*3
+        score_F += swap_weight*len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
 
         # Safety check: ensure partition_idx is valid for sDAG
@@ -604,7 +605,7 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
                 continue
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                dist_placeholder = 3*calculate_dist_small(mini_topology,partition.qubit_map,D,output_perm)
+                dist_placeholder = swap_weight*3*calculate_dist_small(mini_topology,partition.qubit_map,D,output_perm)
                 circuit_length = np.min([len(circ) for circ in partition.circuit_structures[tdx]])
                 score = dist_placeholder + circuit_length
                 mini_scores.append(score)
@@ -643,11 +644,11 @@ def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, s
         if len(E_partitions) == 0:
             E_score = 0.0
         else:
-            E_score = coeff_E * score_E / len(E_partitions)
+            E_score = coeff_E * score_E 
         
-        F_score = score_F
+        F_score = 0.7*score_F
         
-        return E_score + 0.7*F_score
+        return E_score + F_score
 
     # ------------------------------------------------------------------------
     # Candidate Generation

From bf5f6a85f1d604cd0b289051939ef0b26bab1f7c Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Thu, 11 Dec 2025 14:57:18 +0100
Subject: [PATCH 067/232] Remove priting

---
 squander/synthesis/PartAM.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 95f113720..c52ed9c7d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -252,7 +252,6 @@ def DecomposePartition_Full(Partition_circuit: Circuit, Partition_parameters: np
         Call to decompose a partition sequentially
         """
         N = Partition_circuit.get_Qbit_Num()
-        print(N)
         if N !=1:
             permutations_all = list(permutations(range(N)))
             result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
@@ -903,4 +902,4 @@ def _evaluate_layout_score(self, pi, interaction_graph, D):
                 return float('inf')  # Invalid layout
             score += distance * weight
         
-        return score
\ No newline at end of file
+        return score

From 7f0f0bae76fa01b29798a220cc02d99efc39b270 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 12 Dec 2025 23:05:12 +0100
Subject: [PATCH 068/232] Remove unneeded printing

---
 squander/synthesis/PartAM.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index a20d5ba20..b35c1d83d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -526,7 +526,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                         if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
                             child_partition = optimized_partitions[child]
                             qubit = child_partition.circuit.get_Qbits()[0]
-                            print(int(qubit),int(pi[qubit]))
                             child_partition.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
                             partition_order.append(child_partition)
                             resolved_partitions[child] = True

From cb457a10cfd148052d42d05bd087b1e46743f517 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 13 Jan 2026 12:22:47 +0100
Subject: [PATCH 069/232] Rework wide circuit example

---
 .../wide_circuit_optimization.py              | 45 ++++++++++++-------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/examples/decomposition/wide_circuit_optimization.py b/examples/decomposition/wide_circuit_optimization.py
index 89ca6d6f6..edccd2aab 100644
--- a/examples/decomposition/wide_circuit_optimization.py
+++ b/examples/decomposition/wide_circuit_optimization.py
@@ -27,9 +27,21 @@
 import time
 from squander import Circuit
 import numpy as np
-if __name__ == '__main__':
+from qiskit import transpile
+def generate_star_topology(num_qubits):
+    return [(0, i) for i in range(1, num_qubits)]
+def extract_two_qubit_gate_count(gate_nums_dict):
 
+    # List of two-qubit gate names
+    two_qubit_gates = ['CNOT', 'CZ', 'CU', 'CH', 'SYC', 'CRY', 'CRZ', 'CRX', 'CP', 'SWAP', 'CSWAP']
+    
+    total_two_qubit = 0
+    for gate_name in two_qubit_gates:
+        total_two_qubit += gate_nums_dict.get(gate_name, 0)
+    return total_two_qubit
+if __name__ == '__main__':
 
+    use_qiskit_sabre = False
     config = {  
             'strategy': "TreeSearch", 
             'test_subcircuits': True,
@@ -37,29 +49,32 @@
             'max_partition_size': 3,
     }
 
-    filename = "examples/partitioning/qasm_samples/heisenberg-16-20.qasm"
+    filename = "benchmarks/qfast/5q/vqe.qasm"
     start_time = time.time()
 
     # load the circuit from a file
-    circ, parameters = utils.qasm_to_squander_circuit(filename)
-
+    circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
+    N = circ_orig.get_Qbit_Num()
     # instantiate the object for optimizing wide circuits
     wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
 
     # run circuti optimization
-    circ_flat, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters, True )
+    circ_flat, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ_orig, parameters_orig, True )
 
-    config['topology'] = [
-    (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7),
-    (8, 9), (8, 10), (8, 11), (8, 12), (8, 13), (8, 14), (8, 15),
-    (0, 8),
-    ]
-    wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
+    config['topology'] = generate_star_topology(N)
     circo = Qiskit_IO.get_Qiskit_Circuit(circ_flat.get_Flat_Circuit(),parameters)
-    # run circuti optimization
-    circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circo)
-    wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters, True )
-
+    if use_qiskit_sabre:
+        coupling_map = [[i,j] for i,j in config['topology']]
+        circuit_qiskit_sabre = transpile(circo, coupling_map=coupling_map)
+        circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_sabre)
+        config['routed']= True
+        wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
+    else:
+        wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
+        # run circuti optimization
+        circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circo)
+    circ, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters, True )
+    print(f"Two qubit gate count: {extract_two_qubit_gate_count(circ.get_Gate_Nums())}")
     print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
 
 

From 96fa37ca2018c78a6db0ea378d03887941cbfc73 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 08:56:08 +0100
Subject: [PATCH 070/232] add generate DAG levels function

---
 squander/synthesis/PartAM.py | 60 ++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index b35c1d83d..d7d3f7f01 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -903,3 +903,63 @@ def _evaluate_layout_score(self, pi, interaction_graph, D):
             score += distance * weight
         
         return score
+    
+    def generate_DAG_levels(self, circuit):
+        """
+        Generate DAG levels - groups gates by their topological level.
+        
+        Args:
+            circuit: The quantum circuit to analyze
+            
+        Returns:
+            List of lists, where each inner list contains gate indices at the same DAG level.
+            Level 0 contains gates with no parents, level 1 contains gates whose parents
+            are all at level 0, etc.
+        """
+        gates = circuit.get_Gates()
+        num_gates = len(gates)
+        
+        # Build parent count for each gate
+        parent_counts = [0] * num_gates
+        children_map = [[] for _ in range(num_gates)]
+        
+        for gate_idx in range(num_gates):
+            gate = gates[gate_idx]
+            parents = circuit.get_Parents(gate)
+            parent_counts[gate_idx] = len(parents)
+            
+            # Build children map
+            children = circuit.get_Children(gate)
+            for child_idx in children:
+                children_map[gate_idx].append(child_idx)
+        
+        # Initialize level 0 with gates that have no parents
+        levels = []
+        current_level = []
+        processed = [False] * num_gates
+        
+        # Find gates with no parents (level 0)
+        for gate_idx in range(num_gates):
+            if parent_counts[gate_idx] == 0:
+                current_level.append(gate_idx)
+                processed[gate_idx] = True
+        
+        # Process levels using BFS
+        while current_level:
+            levels.append(current_level)
+            next_level = []
+            
+            # Process all gates in current level
+            for gate_idx in current_level:
+                # Decrement parent counts for children
+                for child_idx in children_map[gate_idx]:
+                    parent_counts[child_idx] -= 1
+                    # If all parents are processed, add to next level
+                    if parent_counts[child_idx] == 0 and not processed[child_idx]:
+                        next_level.append(child_idx)
+                        processed[child_idx] = True
+            
+            current_level = next_level
+        
+        return levels
+    
\ No newline at end of file

From 485ca21c96d91d37a796dac1dd09521be2671c28 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 10:09:16 +0100
Subject: [PATCH 071/232] Remove bugs, clean up codebase and add new circuit
 function

---
 squander/synthesis/PartAM.py                  |  17 +-
 squander/synthesis/PartAM_utils.py            | 363 +++---------------
 .../test_group_into_two_qubit_blocks.py       | 217 +++++++++++
 3 files changed, 286 insertions(+), 311 deletions(-)
 create mode 100644 tests/decomposition/test_group_into_two_qubit_blocks.py

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ab9b5c5d4..01f36764c 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -88,7 +88,7 @@ def _score_candidate_worker(payload):
         or _WORKER_DISTANCE_MATRIX is None
     ):
         raise RuntimeError("Scoring worker not initialized with shared data.")
-    partition_candidate, F_snapshot, pi_snapshot, lookahead_gates = payload
+    partition_candidate, F_snapshot, pi_snapshot = payload
     return qgd_Partition_Aware_Mapping.score_partition_candidate(
         partition_candidate,
         F_snapshot,
@@ -97,7 +97,6 @@ def _score_candidate_worker(payload):
         _WORKER_S_DAG,
         _WORKER_DISTANCE_MATRIX,
         _WORKER_SWAP_CACHE,
-        lookahead_gates
     )
 
 
@@ -430,7 +429,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
-        for partition_idx in F:
+        for partition_idx in list(F):
             if isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
                 F.remove(partition_idx)
                 single_qubit_part = optimized_partitions[partition_idx]
@@ -439,7 +438,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 partition_order.append(single_qubit_part)
 
                 resolved_partitions[partition_idx] = True
-                children = DAG[partition_idx]
+                children = list(DAG[partition_idx])
                 while len(children) !=0:
                     child = children.pop(0)
                     parents_resolved = True
@@ -474,7 +473,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
 
         try:
             while len(F) != 0:
-                lookahead_gates = None
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
                     break
@@ -482,7 +480,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 if executor is not None:
                     pi_snapshot = tuple(int(x) for x in pi)
                     payloads = [
-                        (partition_candidate, F_snapshot, pi_snapshot, lookahead_gates)
+                        (partition_candidate, F_snapshot, pi_snapshot)
                         for partition_candidate in partition_candidates
                     ]
                     scores = list(executor.map(_score_candidate_worker, payloads))
@@ -496,7 +494,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             sDAG,
                             D,
                             self._swap_cache,
-                            lookahead_gates
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -515,7 +512,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 
                 
                 partition_order.append(min_partition_candidate)
-                children = DAG[min_partition_candidate.partition_idx]
+                children = list(DAG[min_partition_candidate.partition_idx])
                 step += 1
                 while len(children) != 0:
                     child = children.pop(0)
@@ -578,14 +575,14 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache, lookahead_gates=None):
+    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache):
         score_F = 0
         score_E = 0
         E_partitions = set()  # Changed to set for O(1) membership checks
         E_partitions_1 = set()
         E_partitions_2 = set()
         swap_weight = 4
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, lookahead_gates)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
         score_F += swap_weight*len(swaps)*3
         score_F += len(partition_candidate.circuit_structure)
 
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index e5ccaab86..0813cdef8 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -12,287 +12,7 @@
 # ============================================================================
 # SWAP Routing Algorithms
 # ============================================================================
-
-def solve_min_swaps(perm, edges, T=None, use_gurobi=True):
-    """
-    Compute globally optimal minimum SWAPs to route permutation 'perm'
-    to identity under connectivity 'edges'.
-
-    perm[i] = logical qubit currently at physical node i.
-    edges   = list of undirected edges (u,v).
-    T       = time horizon (number of layers); if None, use n^2.
-    """
-
-    n = len(perm)
-    nodes = list(range(n))
-    tokens = list(range(n))
-
-    # Time horizon
-    if T is None:
-        T = n * n   # safe-ish upper bound for small n
-
-    # Undirected edges => directed arcs for movement variables
-    undirected = {tuple(sorted(e)) for e in edges}
-    neighbors = {u: set() for u in nodes}
-    for u, v in undirected:
-        neighbors[u].add(v)
-        neighbors[v].add(u)
-    directed_arcs = [(u, v) for u, v in undirected for (u, v) in ((u, v), (v, u))]
-
-    # ILP model
-    prob = pulp.LpProblem("TokenSwapping", pulp.LpMinimize)
-
-    # x[t][v][q]: token q at node v at time t
-    x = {
-        (t, v, q): pulp.LpVariable(f"x_t{t}_v{v}_q{q}", cat="Binary")
-        for t in range(T + 1)
-        for v in nodes
-        for q in tokens
-    }
-
-    # m[t][u][v][q]: token q moves from u to v between t and t+1
-    m = {
-        (t, u, v, q): pulp.LpVariable(f"m_t{t}_{u}_{v}_q{q}", cat="Binary")
-        for t in range(T)
-        for (u, v) in directed_arcs
-        for q in tokens
-    }
-
-    # Initial positions: tokens are at positions given by 'perm'
-    # perm[i] = token currently at physical node i
-    for v in nodes:
-        for q in tokens:
-            prob += x[(0, v, q)] == (1 if v == q else 0), f"init_t0_v{v}_q{q}"
-
-    # Final positions: identity mapping (token q at node q)
-    for v in nodes:
-        for q in tokens:
-            prob += x[(T, v, q)] == (1 if perm[v] == q else 0), f"final_tT_v{v}_q{q}"
-
-    # Each token at exactly one node at each time
-    for t in range(T + 1):
-        for q in tokens:
-            prob += (
-                pulp.lpSum(x[(t, v, q)] for v in nodes) == 1,
-                f"one_node_t{t}_q{q}",
-            )
-
-    # Each node holds exactly one token at each time
-    for t in range(T + 1):
-        for v in nodes:
-            prob += (
-                pulp.lpSum(x[(t, v, q)] for q in tokens) == 1,
-                f"one_token_t{t}_v{v}",
-            )
-
-    # Introduce swap decision per time over undirected edges (single swap per time)
-    y = {
-        (t, u, v): pulp.LpVariable(f"y_t{t}_e{u}_{v}", cat="Binary")
-        for t in range(T)
-        for (u, v) in undirected
-    }
-
-    # At most one swap per time step
-    for t in range(T):
-        prob += (
-            pulp.lpSum(y[(t, u, v)] for (u, v) in undirected) <= 1,
-            f"one_swap_per_time_t{t}",
-        )
-
-    # Flow constraints for token movement
-    for t in range(T):
-        for u in nodes:
-            for q in tokens:
-                outbound = pulp.lpSum(
-                    m[(t, u, v, q)] for v in neighbors[u]
-                )
-                inbound = pulp.lpSum(
-                    m[(t, v, u, q)] for v in neighbors[u]
-                )
-                prob += (
-                    x[(t, u, q)] == x[(t + 1, u, q)] + outbound - inbound,
-                    f"flow_t{t}_u{u}_q{q}",
-                )
-
-    # Link moves to selected swap edge and enforce swap semantics
-    for t in range(T):
-        for (u, v) in undirected:
-            # Only allow moves along (u,v) at time t if this edge is selected
-            for q in tokens:
-                prob += m[(t, u, v, q)] <= y[(t, u, v)], f"link_m_y_t{t}_{u}_{v}_q{q}_uv"
-                prob += m[(t, v, u, q)] <= y[(t, u, v)], f"link_m_y_t{t}_{u}_{v}_q{q}_vu"
-
-            # If edge selected, exactly one token moves each direction (a swap)
-            prob += (
-                pulp.lpSum(m[(t, u, v, q)] for q in tokens) == y[(t, u, v)],
-                f"one_token_uv_t{t}_{u}_{v}",
-            )
-            prob += (
-                pulp.lpSum(m[(t, v, u, q)] for q in tokens) == y[(t, u, v)],
-                f"one_token_vu_t{t}_{u}_{v}",
-            )
-
-    # Objective: minimize number of swaps (sum of y)
-    total_swaps = pulp.lpSum(y.values())
-    total_moves = pulp.lpSum(m.values())  # optional, for reporting
-    prob += total_swaps
-
-    # Choose solver
-    if use_gurobi:
-        try:
-            solver = pulp.GUROBI(msg=False, manageEnv=True, Threads=1)
-        except Exception:
-            # Fallback if GUROBI not properly installed with PuLP wrapper
-            solver = pulp.PULP_CBC_CMD(msg=False)
-    else:
-        solver = pulp.PULP_CBC_CMD(msg=False)
-
-    prob.solve(solver)
-
-    status = pulp.LpStatus[prob.status]
-    if status != "Optimal":
-        raise RuntimeError(f"Solver did not find optimal solution, status = {status}")
-
-    # Extract move/swap counts
-    moves_value = int(pulp.value(total_moves))
-    swap_value = int(pulp.value(total_swaps))
-
-    # Build per-time-step SWAP schedule:
-    # For each t, look at directed moves and turn them into undirected edges.
-    swap_layers = []
-    for t in range(T):
-        layer = []
-        for (u, v) in undirected:
-            if int(pulp.value(y[(t, u, v)])) == 1:
-                layer.append((u, v))
-        # At most one edge per layer by construction
-        if layer: swap_layers.append(layer)
-
-    return {
-        "swap_count": swap_value,
-        "moves": moves_value,
-        "layers": swap_layers,
-        "status": status,
-    }
-
-def apply_swaps(perm, layers):
-    """
-    Apply a sequence of SWAP layers to a permutation.
-
-    perm: initial permutation (list)
-    layers: list of SWAP layers, each layer is a list of edges (u,v)
-
-    Returns the resulting permutation after applying all SWAPs.
-    """
-    current_perm = perm[:]
-    for layer in layers:
-        for (u, v) in layer:
-            # Swap tokens at positions u and v
-            current_perm[u], current_perm[v] = current_perm[v], current_perm[u]
-    return current_perm
-
-def find_constrained_swaps_ILP(pi_A, pi_B_dict, dist_matrix, use_gurobi=True):
-    """
-    Find SWAP sequence to route subset of virtual qubits to targets using ILP.
-    
-    Args:
-        pi_A: List [P0, P1, ...] where pi_A[q] = P (virtual q at physical P)
-        pi_B_dict: Dict {q: P} specifying only qubits that need routing
-        dist_matrix: Pre-computed distance matrix dist[i][j] between physical qubits
-    
-    Returns:
-        swaps: List of (i, j) SWAP operations on adjacent physical qubits
-        final_permutation: List showing final virtual→physical mapping
-    """
-    n = len(pi_A)
-    
-    # Build edges from distance matrix (adjacent = distance 1)
-    edges = []
-    for i in range(n):
-        for j in range(i + 1, n):
-            if dist_matrix[i][j] == 1:
-                edges.append((i, j))
-    
-    # === Step 1: Complete eta (target permutation) ===
-    # eta[q] = target physical position for virtual qubit q
-    assigned_physical = set(pi_B_dict.values())
-    unassigned_logical = [q for q in range(n) if q not in pi_B_dict]
-    available_physical = set(P for P in range(n) if P not in assigned_physical)
-    
-    eta = dict(pi_B_dict)  # Start with required assignments
-    
-    # Try to keep unassigned qubits in place if their position is available
-    still_unassigned = []
-    for q in unassigned_logical:
-        current_P = pi_A[q]
-        if current_P in available_physical:
-            eta[q] = current_P
-            available_physical.remove(current_P)
-        else:
-            still_unassigned.append(q)
-    
-    # Assign remaining qubits to remaining positions
-    remaining_physical = sorted(available_physical)
-    for q, P in zip(still_unassigned, remaining_physical):
-        eta[q] = P
-    
-    # Convert to list
-    eta_list = [eta[q] for q in range(n)]
-    
-    # === Step 2: Compute inverse permutations ===
-    # pi_A_inv[P] = q means physical P has virtual q
-    pi_A_inv = [0] * n
-    for q in range(n):
-        pi_A_inv[pi_A[q]] = q
-    
-    # eta_inv[P] = q means we want physical P to have virtual q
-    eta_inv = [0] * n
-    for q in range(n):
-        eta_inv[eta_list[q]] = q
-    
-    # === Step 3: Construct perm for solve_min_swaps ===
-    # To route from state A to state B using swaps S where S(identity) = perm:
-    # We need: A[perm[P]] = B[P], so perm[P] = A^{-1}[B[P]]
-    # Here: A = pi_A_inv, B = eta_inv, A^{-1} = pi_A
-    # So: perm[P] = pi_A[eta_inv[P]]
-    perm = [pi_A[eta_inv[P]] for P in range(n)]
-    
-    # Check if already at target (perm is identity)
-    if perm == list(range(n)):
-        return [], eta_list
-    
-    # === Step 4: Solve using ILP ===
-    result = solve_min_swaps(perm, edges, use_gurobi=use_gurobi)
-    
-    if result['status'] != 'Optimal':
-        return None, None
-    
-    # Extract swaps from layers (flatten)
-    swaps = []
-    for layer in result['layers']:
-        for swap in layer:
-            swaps.append(swap)
-    
-    # === Step 5: Compute final permutation ===
-    # Apply swaps to pi_A to get final virtual→physical mapping
-    # Maintain both directions for O(1) swap operations
-    final_perm = list(pi_A)
-    phys_to_virt = list(pi_A_inv)
-    
-    for (i, j) in swaps:
-        # Get virtual qubits at physical positions i and j
-        q_i = phys_to_virt[i]
-        q_j = phys_to_virt[j]
-        
-        # Swap their physical positions
-        final_perm[q_i] = j
-        final_perm[q_j] = i
-        phys_to_virt[i] = q_j
-        phys_to_virt[j] = q_i
-    
-    return swaps, final_perm
-
-def find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix,lookahead_gates=None):
+def find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix):
     """
     Find SWAP sequence to route subset of virtual qubits to targets.
     
@@ -359,21 +79,7 @@ def heuristic(state):
             total += float(distance)
         return math.floor(total / 2)  # Optimistic: each SWAP helps 2 qubits
     
-    def heuristic_with_lookahead(state):
-        base_cost = heuristic(state)
-        
-        if lookahead_gates:
-            # Add penalty for gates that would require swaps
-            for q1, q2 in lookahead_gates[:5]:  # Look at next 5 gates
-                p1 = state[q1]
-                p2 = state[q2]
-                if dist_matrix[p1][p2] > 1:
-                    base_cost += dist_matrix[p1][p2] - 1
-        
-        return base_cost
-    
-    # Use this heuristic in A*
-    heap = [(heuristic_with_lookahead(start_state), 0, start_state, [])]
+    heap = [(heuristic(start_state), 0, start_state, [])]
     visited = {start_state: 0}
     
     while heap:
@@ -407,8 +113,8 @@ def heuristic_with_lookahead(state):
     
     return None, None  # No solution found
 
-def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix,lookahead_gates=None):
-    return find_constrained_swaps_A_star(pi_A,pi_B_dict,dist_matrix,lookahead_gates)
+def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
+    return find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix)
 
 def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
     P_i_inv = [P_i.index(i) for i in range(len(P_i))]  # Compute inverse
@@ -704,7 +410,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi(self, pi, D, swap_cache=None, lookahead_gates=None):
+    def transform_pi(self, pi, D, swap_cache=None):
         # Fixed: Use P_i^{-1} instead of P_i for input routing
         # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
         # For Original to see logical qubit q* at partition position q*, we need:
@@ -726,10 +432,10 @@ def transform_pi(self, pi, D, swap_cache=None, lookahead_gates=None):
             if cache_key in swap_cache:
                 swaps, pi_init = swap_cache[cache_key]
             else:
-                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, lookahead_gates)
+                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
                 swap_cache[cache_key] = (swaps, pi_init)
         else:
-            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, lookahead_gates)
+            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
         
         pi_output = pi_init.copy()
         # Fixed: P_o should be indexed by partition virtual index q*, not physical index Q*
@@ -786,3 +492,58 @@ def construct_swap_circuit(swap_order, N):
         swap_circ.add_CNOT(swap[1],swap[0])
         swap_circ.add_CNOT(swap[0],swap[1])
     return swap_circ
+
+def group_into_two_qubit_blocks(circuit: Circuit) -> Circuit:
+    """
+    Takes a flat circuit and returns an equivalent circuit whose top-level
+    elements are all 2-qubit Circuit blocks, each containing exactly one
+    2-qubit gate.
+
+    Single-qubit gates are buffered and flushed into the next 2-qubit block
+    on that qubit. Trailing single-qubit gates (after the last 2-qubit gate
+    on a qubit) are appended to the last block that involved that qubit.
+
+    Assumes the circuit contains only 1- and 2-qubit gates.
+
+    Args:
+        circuit: Flat input circuit with individual gates
+
+    Returns:
+        Circuit: Equivalent circuit whose top-level elements are all 2-qubit blocks
+    """
+    N = circuit.get_Qbit_Num()
+
+    pending = defaultdict(list)  # pending[q] = single-qubit gates waiting for next block on q
+    blocks = []                  # accumulated Circuit block objects
+    last_block_for_qubit = {}    # last_block_for_qubit[q] = index into blocks
+
+    for gate in circuit.get_Gates():
+        qubits = gate.get_Involved_Qbits()
+        if len(qubits) == 1:
+            pending[qubits[0]].append(gate)
+        else:  # 2-qubit gate
+            q0, q1 = qubits[0], qubits[1]
+            block = Circuit(N)
+            for g in pending[q0]:
+                block.add_Gate(g)
+            for g in pending[q1]:
+                block.add_Gate(g)
+            pending[q0].clear()
+            pending[q1].clear()
+            block.add_Gate(gate)
+            idx = len(blocks)
+            blocks.append(block)
+            last_block_for_qubit[q0] = idx
+            last_block_for_qubit[q1] = idx
+
+    # Append trailing single-qubit gates to the last block that touched that qubit
+    for q, gates_list in pending.items():
+        if gates_list and q in last_block_for_qubit:
+            block = blocks[last_block_for_qubit[q]]
+            for g in gates_list:
+                block.add_Gate(g)
+
+    result = Circuit(N)
+    for block in blocks:
+        result.add_Circuit(block)
+    return result
diff --git a/tests/decomposition/test_group_into_two_qubit_blocks.py b/tests/decomposition/test_group_into_two_qubit_blocks.py
new file mode 100644
index 000000000..ec0b45ed0
--- /dev/null
+++ b/tests/decomposition/test_group_into_two_qubit_blocks.py
@@ -0,0 +1,217 @@
+import pytest
+import numpy as np
+from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+from squander.synthesis.PartAM_utils import group_into_two_qubit_blocks
+
+
+def _count_gates_by_qubit_num(circuit, qubit_num):
+    """Count gates with exactly `qubit_num` involved qubits, recursing into blocks."""
+    count = 0
+    for gate in circuit.get_Gates():
+        if isinstance(gate, Circuit):
+            count += _count_gates_by_qubit_num(gate, qubit_num)
+        else:
+            if len(gate.get_Involved_Qbits()) == qubit_num:
+                count += 1
+    return count
+
+
+def _get_params(circuit, seed=42):
+    np.random.seed(seed)
+    return np.random.uniform(0, 2 * np.pi, circuit.get_Parameter_Num())
+
+
+# ============================================================================
+# Structure tests
+# ============================================================================
+
+def test_all_top_level_elements_are_circuits():
+    c = Circuit(3)
+    c.add_H(0)
+    c.add_CNOT(0, 1)
+    c.add_RZ(1)
+    c.add_CNOT(1, 2)
+
+    result = group_into_two_qubit_blocks(c)
+    for gate in result.get_Gates():
+        assert isinstance(gate, Circuit)
+
+
+def test_each_block_has_exactly_one_two_qubit_gate():
+    c = Circuit(3)
+    c.add_H(0)
+    c.add_CNOT(0, 1)
+    c.add_RZ(1)
+    c.add_CNOT(1, 2)
+    c.add_H(2)
+
+    result = group_into_two_qubit_blocks(c)
+    for block in result.get_Gates():
+        two_qubit_count = sum(
+            1 for g in block.get_Gates()
+            if len(g.get_Involved_Qbits()) == 2
+        )
+        assert two_qubit_count == 1
+
+
+def test_block_count_equals_two_qubit_gate_count():
+    c = Circuit(4)
+    c.add_CNOT(0, 1)
+    c.add_CNOT(1, 2)
+    c.add_CNOT(2, 3)
+
+    result = group_into_two_qubit_blocks(c)
+    assert len(result.get_Gates()) == 3
+
+
+def test_only_2qubit_gates_each_block_has_one_gate():
+    """With no single-qubit gates, each block contains exactly the 2-qubit gate."""
+    c = Circuit(3)
+    c.add_CNOT(0, 1)
+    c.add_CNOT(1, 2)
+
+    result = group_into_two_qubit_blocks(c)
+    for block in result.get_Gates():
+        assert len(block.get_Gates()) == 1
+
+
+# ============================================================================
+# Gate count preservation tests
+# ============================================================================
+
+def test_total_single_qubit_gate_count_preserved():
+    c = Circuit(3)
+    c.add_H(0)
+    c.add_RZ(1)
+    c.add_CNOT(0, 1)
+    c.add_RZ(0)
+    c.add_CNOT(1, 2)
+    c.add_H(2)
+
+    result = group_into_two_qubit_blocks(c)
+    assert _count_gates_by_qubit_num(result, 1) == _count_gates_by_qubit_num(c, 1)
+
+
+def test_total_two_qubit_gate_count_preserved():
+    c = Circuit(4)
+    c.add_H(0)
+    c.add_CNOT(0, 1)
+    c.add_H(2)
+    c.add_CNOT(1, 2)
+    c.add_CNOT(2, 3)
+
+    result = group_into_two_qubit_blocks(c)
+    assert _count_gates_by_qubit_num(result, 2) == _count_gates_by_qubit_num(c, 2)
+
+
+# ============================================================================
+# Block membership tests
+# ============================================================================
+
+def test_leading_single_qubit_gates_in_first_block():
+    """Single-qubit gates before the first 2-qubit gate go into the first block."""
+    c = Circuit(2)
+    c.add_H(0)
+    c.add_H(1)
+    c.add_CNOT(0, 1)
+
+    result = group_into_two_qubit_blocks(c)
+    blocks = result.get_Gates()
+    assert len(blocks) == 1
+    assert len(blocks[0].get_Gates()) == 3  # H(0) + H(1) + CNOT
+
+
+def test_trailing_single_qubit_gates_in_last_block():
+    """Single-qubit gates after the last 2-qubit gate on a qubit go into that last block."""
+    c = Circuit(2)
+    c.add_CNOT(0, 1)
+    c.add_H(0)
+    c.add_RZ(1)
+
+    result = group_into_two_qubit_blocks(c)
+    blocks = result.get_Gates()
+    assert len(blocks) == 1
+    assert len(blocks[0].get_Gates()) == 3  # CNOT + H(0) + RZ(1)
+
+
+def test_interleaved_single_qubit_gates_split_correctly():
+    """Single-qubit gates between two 2-qubit gates go to the next block."""
+    c = Circuit(3)
+    c.add_CNOT(0, 1)   # block 0
+    c.add_H(0)          # -> block 1 (next 2-qubit gate involving q0)
+    c.add_RZ(1)         # -> block 1 (next 2-qubit gate involving q1)
+    c.add_CNOT(0, 1)   # block 1
+
+    result = group_into_two_qubit_blocks(c)
+    blocks = result.get_Gates()
+    assert len(blocks) == 2
+    assert len(blocks[0].get_Gates()) == 1  # only CNOT
+    assert len(blocks[1].get_Gates()) == 3  # H + RZ + CNOT
+
+
+# ============================================================================
+# Correctness (unitary equivalence) tests
+# ============================================================================
+
+def test_unitary_equivalence_cnot_chain():
+    c = Circuit(3)
+    c.add_CNOT(0, 1)
+    c.add_CNOT(1, 2)
+    c.add_CNOT(0, 2)
+
+    result = group_into_two_qubit_blocks(c)
+    params = _get_params(c)
+    assert np.allclose(c.get_Matrix(params), result.get_Matrix(params), atol=1e-10)
+
+
+def test_unitary_equivalence_with_single_qubit_gates():
+    c = Circuit(3)
+    c.add_H(0)
+    c.add_RZ(1)
+    c.add_CNOT(0, 1)
+    c.add_H(1)
+    c.add_CNOT(1, 2)
+    c.add_H(2)
+
+    result = group_into_two_qubit_blocks(c)
+    params = _get_params(c)
+    assert np.allclose(c.get_Matrix(params), result.get_Matrix(params), atol=1e-10)
+
+
+@pytest.mark.parametrize("N", [2, 3, 4])
+def test_unitary_equivalence_parametric(N):
+    c = Circuit(N)
+    c.add_RZ(0)
+    c.add_CNOT(0, 1)
+    c.add_RY(1)
+    if N > 2:
+        c.add_RZ(2)
+        c.add_CNOT(1, 2)
+    if N > 3:
+        c.add_RZ(3)
+        c.add_CNOT(2, 3)
+
+    result = group_into_two_qubit_blocks(c)
+    params = _get_params(c)
+    assert np.allclose(c.get_Matrix(params), result.get_Matrix(params), atol=1e-10)
+
+
+# ============================================================================
+# Edge cases
+# ============================================================================
+
+def test_empty_circuit_returns_empty():
+    c = Circuit(3)
+    result = group_into_two_qubit_blocks(c)
+    assert len(result.get_Gates()) == 0
+
+
+def test_single_two_qubit_gate_no_singles():
+    c = Circuit(2)
+    c.add_CNOT(0, 1)
+
+    result = group_into_two_qubit_blocks(c)
+    blocks = result.get_Gates()
+    assert len(blocks) == 1
+    assert isinstance(blocks[0], Circuit)
+    assert len(blocks[0].get_Gates()) == 1

From 8c22d4a1639ca91a00fb3d7a6b93a161bb0bc34f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 10:17:58 +0100
Subject: [PATCH 072/232] Add copy method to qgd_Circuit

---
 squander/gates/qgd_Circuit.py          |  8 ++++
 squander/gates/qgd_Circuit_Wrapper.cpp | 59 ++++++++++++++++++++++++++
 2 files changed, 67 insertions(+)

diff --git a/squander/gates/qgd_Circuit.py b/squander/gates/qgd_Circuit.py
index 967ddb20d..980ad3a39 100644
--- a/squander/gates/qgd_Circuit.py
+++ b/squander/gates/qgd_Circuit.py
@@ -78,6 +78,14 @@ def __init__(self, qbit_num):
         # call the constructor of the wrapper class
         super().__init__(qbit_num)
 
+    def copy(self):
+        """
+        Create a deep copy of the circuit.
+        @return A new qgd_Circuit instance with all gates copied.
+        """
+        # Call the C wrapper function that uses the clone() method
+        return super().copy()
+
     def add_U1(self, target_qbit):
         """Add a U1 gate to the front of the gate structure.
 
diff --git a/squander/gates/qgd_Circuit_Wrapper.cpp b/squander/gates/qgd_Circuit_Wrapper.cpp
index f931620e4..4719af7b6 100644
--- a/squander/gates/qgd_Circuit_Wrapper.cpp
+++ b/squander/gates/qgd_Circuit_Wrapper.cpp
@@ -1928,6 +1928,62 @@ qgd_Circuit_Wrapper_get_Flat_Circuit( qgd_Circuit_Wrapper *self ) {
 
 
 
+/**
+@brief Wrapper function to create a deep copy of the circuit.
+@param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper.
+@return Returns a new qgd_Circuit Python object that is a deep copy.
+*/
+static PyObject *
+qgd_Circuit_Wrapper_copy( qgd_Circuit_Wrapper *self ) {
+
+    Gates_block* copied_circuit = NULL;
+
+    try {
+        copied_circuit = self->circuit->clone();
+    }
+    catch (std::string err) {
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        std::cout << err << std::endl;
+        return NULL;
+    }
+    catch(...) {
+        std::string err( "Invalid pointer to circuit class");
+        PyErr_SetString(PyExc_Exception, err.c_str());
+        return NULL;
+    }
+
+    int qbit_num = copied_circuit->get_qbit_num();
+
+    // import gate operation modules
+    PyObject* qgd_circuit  = PyImport_ImportModule("squander.gates.qgd_Circuit");
+
+    if ( qgd_circuit == NULL ) {
+        PyErr_SetString(PyExc_Exception, "Module import error: squander.gates.qgd_Circuit" );
+        delete copied_circuit;
+        return NULL;
+    }
+
+    PyObject* qgd_circuit_Dict  = PyModule_GetDict( qgd_circuit );
+
+    // PyDict_GetItemString creates a borrowed reference to the item in the dict. Reference counting is not increased on this element, dont need to decrease the reference counting at the end
+    PyObject* py_circuit_class = PyDict_GetItemString( qgd_circuit_Dict, "qgd_Circuit");
+
+    PyObject* circuit_input = Py_BuildValue("(O)", Py_BuildValue("i", qbit_num) );
+    PyObject* py_circuit    = PyObject_CallObject(py_circuit_class, circuit_input);
+
+    // replace dummy data with real gate data
+    qgd_Circuit_Wrapper* py_circuit_C = reinterpret_cast<qgd_Circuit_Wrapper*>( py_circuit );
+
+    delete( py_circuit_C->circuit );
+    py_circuit_C->circuit = copied_circuit;
+
+    Py_DECREF( qgd_circuit );
+    Py_DECREF( circuit_input );
+
+    return py_circuit;
+}
+
+
 /**
 @brief Method to extract the stored quantum circuit in a human-readable data serialized and pickle-able format
 @param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper
@@ -2343,6 +2399,9 @@ static PyMethodDef qgd_Circuit_Wrapper_Methods[] = {
     {"get_Children", (PyCFunction) qgd_Circuit_Wrapper_get_children, METH_VARARGS,
      "Method to get the list of child gate indices. Then the children gates can be obtained from the list of gates involved in the circuit."
     },
+    {"copy", (PyCFunction) qgd_Circuit_Wrapper_copy, METH_NOARGS,
+     "Method to create a deep copy of the circuit."
+    },
     {"__getstate__", (PyCFunction) qgd_Circuit_Wrapper_getstate, METH_NOARGS,
      "Method to extract the stored quantum circuit in a human-readable data serialized and pickle-able format."
     },

From cff6eb570611bd01ba12b4ee65b6f2573dddeb1d Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 11:32:18 +0100
Subject: [PATCH 073/232] Add windowed synthesis

---
 examples/decomposition/PartAM_example.py | 134 ++++--
 squander/synthesis/PartAM.py             | 493 +++++++++++++++++------
 2 files changed, 470 insertions(+), 157 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 1e2e54e3e..82cba7c1e 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -27,61 +27,109 @@
 import time
 from squander import Circuit
 import numpy as np
-if __name__ == '__main__':
-
+def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
+    """Validate decomposition by applying both circuits to a random state."""
+    num_qubits = circ.get_Qbit_Num()
+    matrix_size = 1 << num_qubits
+    initial_state_real = np.random.uniform(-1.0, 1.0, (matrix_size,))
+    initial_state_imag = np.random.uniform(-1.0, 1.0, (matrix_size,))
+    initial_state = initial_state_real + initial_state_imag * 1j
+    initial_state = initial_state / np.linalg.norm(initial_state)
 
-    config = {
-            'strategy': "TreeSearch",
-            'test_subcircuits': True,
-            'test_final_circuit': True,
-            'max_partition_size': 3,
-            'progressbar': True,  # Enable diagnostic output
-    }
-
-    filename = "benchmarks/qfast/5q/vqe.qasm"
-    start_time = time.time()
-
-    # load the circuit from a file
-    circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
-    config['topology'] = [
-    (0, 1), (0, 2), (0, 3), (0, 4)
-    ]
-    wide_circuit_optimizer = Partition_Aware_Mapping( config )
-    circ, params, input_perm,output_perm = wide_circuit_optimizer.Partition_Aware_Mapping( circ_orig, parameters_orig )
-    wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config )
-    config['routed'] = True 
-    circo = Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params)
-    # run circuti optimization
-    circ, params = Qiskit_IO.convert_Qiskit_to_Squander(circo)
-    circ, params = wide_circuit_optimizer.OptimizeWideCircuit( circ, params, True )
-    #print(Qiskit_IO.get_Qiskit_Circuit(circ.get_Flat_Circuit(),params))
-    num_qubits = circ.get_Qbit_Num() 
-    matrix_size = 1 << num_qubits 
-    initial_state_real = np.random.uniform(-1.0,1.0, (matrix_size,) )
-    initial_state_imag = np.random.uniform(-1.0,1.0, (matrix_size,) )
-    initial_state = initial_state_real + initial_state_imag*1j
-    initial_state = initial_state/np.linalg.norm(initial_state)
     original_state = initial_state.copy()
-    circ_orig.apply_to(parameters_orig,original_state)
-    circ_Final = Circuit(circ.get_Qbit_Num() )
-    output_perm_T = [0]* circ.get_Qbit_Num() 
+    circ_orig.apply_to(parameters_orig, original_state)
+
+    circ_Final = Circuit(num_qubits)
+    output_perm_T = [0] * num_qubits
     for i, j in enumerate(output_perm):
-        output_perm_T[j] = i        
-    # Convert numpy arrays/ints to plain Python lists for add_Permutation
+        output_perm_T[j] = i
     input_perm_list = [int(x) for x in input_perm]
     circ_Final.add_Permutation(input_perm_list)
     circ_Final.add_Circuit(circ)
     circ_Final.add_Permutation(output_perm_T)
-    # Additional matrix validation in example     
+
     PartAM_state = initial_state.copy()
     circ_Final.apply_to(params, PartAM_state)
     state_error = 1 - abs(np.vdot(PartAM_state, original_state))
+    return state_error, circ_Final
+
+
+if __name__ == '__main__':
+
+    filename = "benchmarks/qfast/5q/vqe.qasm"
+    circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
+    topology = [(0, 1), (0, 2), (0, 3), (0, 4)]
+
+    # ================================================================
+    # Full-circuit mode (default, window_size=0)
+    # ================================================================
+    print(f"\n{'='*70}")
+    print("Full-circuit mode (window_size=0)")
+    print(f"{'='*70}")
+
+    config_full = {
+        'strategy': "TreeSearch",
+        'test_subcircuits': True,
+        'test_final_circuit': True,
+        'max_partition_size': 3,
+        'progressbar': True,
+        'topology': topology,
+    }
+
+    start_time = time.time()
+    pam_full = Partition_Aware_Mapping(config_full)
+    circ_full, params_full, input_perm_full, output_perm_full = \
+        pam_full.Partition_Aware_Mapping(circ_orig, parameters_orig)
+    elapsed_full = time.time() - start_time
+
+    error_full, circ_final_full = validate_result(
+        circ_orig, parameters_orig,
+        circ_full, params_full, input_perm_full, output_perm_full
+    )
+    print(f"Decomposition error: {error_full:.10f}")
+    print(f"Gate counts: {circ_final_full.get_Gate_Nums()}")
+    print(f"Time: {elapsed_full:.2f}s")
+
+    # ================================================================
+    # Windowed mode (window_size=3)
+    # ================================================================
+    print(f"\n{'='*70}")
+    print("Windowed mode (window_size=3)")
+    print(f"{'='*70}")
+
+    config_windowed = {
+        'strategy': "TreeSearch",
+        'test_subcircuits': True,
+        'test_final_circuit': True,
+        'max_partition_size': 3,
+        'progressbar': True,
+        'topology': topology,
+        'window_size': 3,
+    }
+
+    start_time = time.time()
+    pam_windowed = Partition_Aware_Mapping(config_windowed)
+    circ_win, params_win, input_perm_win, output_perm_win = \
+        pam_windowed.Partition_Aware_Mapping(circ_orig, parameters_orig)
+    elapsed_win = time.time() - start_time
+
+    error_win, circ_final_win = validate_result(
+        circ_orig, parameters_orig,
+        circ_win, params_win, input_perm_win, output_perm_win
+    )
+    print(f"Decomposition error: {error_win:.10f}")
+    print(f"Gate counts: {circ_final_win.get_Gate_Nums()}")
+    print(f"Time: {elapsed_win:.2f}s")
+
+    # ================================================================
+    # Summary
+    # ================================================================
     print(f"\n{'='*70}")
-    print(f"State Vector Validation")
+    print("Summary")
     print(f"{'='*70}")
-    print(f"Decomposition error on random state: {state_error:.10f}")
-    print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time))
+    print(f"{'Mode':<20} {'Error':<20} {'Time':<10}")
+    print(f"{'Full circuit':<20} {error_full:<20.10f} {elapsed_full:<10.2f}s")
+    print(f"{'Windowed (K=3)':<20} {error_win:<20.10f} {elapsed_win:<10.2f}s")
     print(f"{'='*70}\n")
-    print(circ_Final.get_Gate_Nums())
 
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 01f36764c..cf875ed0a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -39,6 +39,7 @@
     check_circuit_compatibility,
     construct_swap_circuit,
     calculate_dist_small,
+    group_into_two_qubit_blocks,
 )
 
 
@@ -63,18 +64,18 @@ class PartitionScoreData:
 # ============================================================================
 
 _WORKER_SCORING_PARTITIONS: Optional[List[Optional[PartitionScoreData]]] = None
-_WORKER_S_DAG: Optional[List[List[int]]] = None
 _WORKER_DISTANCE_MATRIX: Optional[np.ndarray] = None
 _WORKER_SWAP_CACHE: Optional[Dict] = None
+_WORKER_VIRTUAL_E: Optional[List[Tuple[int, int]]] = None
 
 
-def _init_scoring_worker(scoring_partitions, sdag, distance_matrix):
+def _init_scoring_worker(scoring_partitions, distance_matrix, virtual_E=None):
     """Initializer for process-based scoring workers."""
-    global _WORKER_SCORING_PARTITIONS, _WORKER_S_DAG, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE
+    global _WORKER_SCORING_PARTITIONS, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE, _WORKER_VIRTUAL_E
     _WORKER_SCORING_PARTITIONS = scoring_partitions
-    _WORKER_S_DAG = sdag
     _WORKER_DISTANCE_MATRIX = distance_matrix
     _WORKER_SWAP_CACHE = {}
+    _WORKER_VIRTUAL_E = virtual_E
 
 
 def _score_candidate_worker(payload):
@@ -84,7 +85,6 @@ def _score_candidate_worker(payload):
     """
     if (
         _WORKER_SCORING_PARTITIONS is None
-        or _WORKER_S_DAG is None
         or _WORKER_DISTANCE_MATRIX is None
     ):
         raise RuntimeError("Scoring worker not initialized with shared data.")
@@ -94,9 +94,9 @@ def _score_candidate_worker(payload):
         F_snapshot,
         pi_snapshot,
         _WORKER_SCORING_PARTITIONS,
-        _WORKER_S_DAG,
         _WORKER_DISTANCE_MATRIX,
         _WORKER_SWAP_CACHE,
+        _WORKER_VIRTUAL_E,
     )
 
 
@@ -125,6 +125,7 @@ def __init__(self, config):
         self.config.setdefault('partition_strategy','ilp')
         self.config.setdefault('optimizer', 'BFGS')
         self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
+        self.config.setdefault('window_size', 0)  # 0 = full circuit (backward compat)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -134,6 +135,53 @@ def __init__(self, config):
         self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
         self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
 
+    # ------------------------------------------------------------------------
+    # Scoring Methods
+    # ------------------------------------------------------------------------
+
+    def compute_routing_aware_weight(self, result, pi_init, D, E):
+        """
+        Compute a routing-aware ILP weight for a partition synthesis result.
+
+        Combines three components:
+        1. Base synthesis cost (min CNOT count across topologies)
+        2. Routing cost from pi_init (estimated SWAP overhead)
+        3. Virtual outgoing gate penalty (future routing constraints)
+
+        Args:
+            result: PartitionSynthesisResult or SingleQubitPartitionResult
+            pi_init: Current qubit layout (logical -> physical mapping)
+            D: Distance matrix between physical qubits
+            E: List of (q_a, q_b) tuples for virtual outgoing 2-qubit gates
+
+        Returns:
+            float: Combined weight (lower is better)
+        """
+        if isinstance(result, SingleQubitPartitionResult):
+            return 0
+
+        # 1. Base synthesis cost
+        base_cost = result.get_partition_synthesis_score()
+
+        # 2. Routing cost: best (minimum) across topologies
+        routing_cost = np.inf
+        for mini_topology in result.mini_topologies:
+            dist = calculate_dist_small(mini_topology, result.qubit_map, D, pi_init)
+            routing_cost = min(routing_cost, dist)
+        if np.isinf(routing_cost):
+            routing_cost = 0
+
+        # 3. Virtual outgoing gate penalty
+        involved = set(result.involved_qbits)
+        e_penalty = 0.0
+        for (q_a, q_b) in E:
+            if q_a in involved or q_b in involved:
+                dist = D[pi_init[q_a]][pi_init[q_b]]
+                if not np.isinf(dist):
+                    e_penalty += max(0, (dist - 1)) * 3
+
+        return base_cost + routing_cost + 0.3 * e_penalty
+
     # ------------------------------------------------------------------------
     # Caching Methods
     # ------------------------------------------------------------------------
@@ -208,6 +256,56 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
             )
         return scoring_partitions
 
+    @staticmethod
+    def _group_circuit_with_params(circuit, parameters):
+        """
+        Group a flat circuit into 2-qubit blocks and reorder parameters to match.
+        Replicates the gate ordering logic of group_into_two_qubit_blocks to build
+        a parameter array consistent with the grouped circuit's parameter space.
+        """
+        gates = circuit.get_Gates()
+
+        # Track gate indices following group_into_two_qubit_blocks logic
+        pending = defaultdict(list)
+        block_gate_orders = []
+        last_block_for_qubit = {}
+
+        for gate_idx, gate in enumerate(gates):
+            qubits = gate.get_Involved_Qbits()
+            if len(qubits) == 1:
+                pending[qubits[0]].append(gate_idx)
+            else:
+                q0, q1 = qubits[0], qubits[1]
+                block_order = list(pending[q0]) + list(pending[q1]) + [gate_idx]
+                pending[q0].clear()
+                pending[q1].clear()
+                block_idx = len(block_gate_orders)
+                block_gate_orders.append(block_order)
+                last_block_for_qubit[q0] = block_idx
+                last_block_for_qubit[q1] = block_idx
+
+        # Trailing single-qubit gates
+        for q, gate_indices in pending.items():
+            if gate_indices and q in last_block_for_qubit:
+                block_gate_orders[last_block_for_qubit[q]].extend(gate_indices)
+
+        # Build parameter reordering from original gate indices
+        param_indices = []
+        for block_order in block_gate_orders:
+            for g_idx in block_order:
+                gate = gates[g_idx]
+                start = gate.get_Parameter_Start_Index()
+                num = gate.get_Parameter_Num()
+                param_indices.extend(range(start, start + num))
+
+        grouped_circ = group_into_two_qubit_blocks(circuit)
+        if param_indices:
+            grouped_params = parameters[np.array(param_indices)]
+        else:
+            grouped_params = np.array([])
+
+        return grouped_circ, grouped_params
+
     # ------------------------------------------------------------------------
     # Partition Decomposition Methods
     # ------------------------------------------------------------------------
@@ -296,10 +394,100 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
     # Circuit Synthesis
     # ------------------------------------------------------------------------
 
-    def SynthesizeWideCircuit(self, circ, orig_parameters):
-        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(circ, self.config["max_partition_size"])
-        qbit_num_orig_circuit = circ.get_Qbit_Num()
-        gate_dict = {i: gate for i, gate in enumerate(circ.get_Gates())}
+    def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG_start=0, DAG_end=0):
+        """
+        Partition and synthesize a circuit, optionally restricted to a window
+        of K DAG levels.
+
+        Args:
+            circ: The full quantum circuit
+            orig_parameters: Parameters for circ
+            pi_init: Current qubit permutation (logical->physical). When provided,
+                     enables routing-aware ILP scoring.
+            E: Virtual outgoing gates as List[(q_a, q_b)]. Computed automatically
+               when None and a window is active.
+            DAG_start: First DAG level to process (inclusive)
+            DAG_end: Last DAG level to process (exclusive). 0 means all levels.
+
+        Returns:
+            optimized_partitions: List of PartitionSynthesisResult / SingleQubitPartitionResult
+        """
+        # ---- Phase 0: Window extraction ----
+        all_gates = circ.get_Gates()
+        qbit_num = circ.get_Qbit_Num()
+        levels = self.generate_DAG_levels(circ)
+        total_levels = len(levels)
+
+        # Backward compatibility: DAG_start=0, DAG_end=0 means all levels
+        if DAG_start == 0 and DAG_end == 0:
+            effective_end = total_levels
+        else:
+            effective_end = min(DAG_end, total_levels)
+        effective_start = DAG_start
+
+        # Guard: empty window
+        if effective_start >= total_levels or effective_start >= effective_end:
+            self._last_synthesis_metadata = {
+                'E': E if E is not None else [],
+                'levels_processed': (effective_start, effective_end),
+                'total_levels': total_levels,
+            }
+            return []
+
+        # Collect window gate indices in topological order (level by level)
+        window_topo_order = []
+        for level_idx in range(effective_start, effective_end):
+            window_topo_order.extend(levels[level_idx])
+        window_gate_set = set(window_topo_order)
+
+        # Determine if we're processing the full circuit
+        full_circuit_mode = (len(window_gate_set) == len(all_gates))
+
+        if full_circuit_mode:
+            working_circ = circ
+            working_parameters = orig_parameters
+        else:
+            # Build sub-circuit from window gates
+            working_circ = Circuit(qbit_num)
+            working_params_list = []
+            for orig_idx in window_topo_order:
+                gate = all_gates[orig_idx]
+                working_circ.add_Gate(gate)
+                start = gate.get_Parameter_Start_Index()
+                working_params_list.append(
+                    orig_parameters[start:start + gate.get_Parameter_Num()]
+                )
+            if working_params_list:
+                working_parameters = np.concatenate(working_params_list, axis=0)
+            else:
+                working_parameters = np.array([])
+
+        # ---- Phase 0b: Identify virtual outgoing gates (E) ----
+        if E is None and not full_circuit_mode and effective_end < total_levels:
+            E = []
+            for orig_idx in window_topo_order:
+                gate = all_gates[orig_idx]
+                children = circ.get_Children(gate)
+                for child_idx in children:
+                    if child_idx not in window_gate_set:
+                        child_gate = all_gates[child_idx]
+                        child_qubits = child_gate.get_Involved_Qbits()
+                        if len(child_qubits) == 2:
+                            E.append((child_qubits[0], child_qubits[1]))
+            E = list(set(E))
+        elif E is None:
+            E = []
+
+        # ---- Phase 0c: Compute distance matrix if routing-aware ----
+        if pi_init is not None:
+            D = self.compute_distances_bfs(qbit_num)
+        else:
+            D = None
+
+        # ---- Phase 1: Partition enumeration ----
+        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
+        qbit_num_orig_circuit = working_circ.get_Qbit_Num()
+        gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
@@ -313,7 +501,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             for gate_idx in _get_topo_order({x: go[x] & gates for x in gates}, {x: rgo[x] & gates for x in gates}):
                 c.add_Gate( gate_dict[gate_idx] )
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
-                params.append(orig_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+                params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
             partitioned_circuit.add_Circuit(c)
         # Only add single-qubit chains as separate partitions if minimum_partition_size allows it
         for chain in single_qubit_chains:
@@ -321,15 +509,12 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             for gate_idx in chain:
                 c.add_Gate( gate_dict[gate_idx] )
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
-                params.append(orig_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+                params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
             partitioned_circuit.add_Circuit(c)
         parameters = np.concatenate(params, axis=0)
 
-        qbit_num_orig_circuit = circ.get_Qbit_Num()
-
-
+        # ---- Phase 2: Stage 1 synthesis (Sequential) ----
         subcircuits = partitioned_circuit.get_Gates()
-
         optimized_results = [None] * len(subcircuits)
 
         with Pool(processes=mp.cpu_count()) as pool:
@@ -338,32 +523,38 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 start_idx = subcircuit.get_Parameter_Start_Index()
                 end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
                 subcircuit_parameters = parameters[ start_idx:end_idx ]
-                qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
                 involved_qbits = subcircuit.get_Qbits()
 
-                qbit_num = len( involved_qbits )
-                mini_topologies = get_unique_subtopologies(self.topology, qbit_num)
+                qbit_num_sub = len( involved_qbits )
+                mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
                 qbit_map = {}
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
-                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
+                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
                 optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
-        
-        weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
+
+        # ---- Phase 3: ILP partition selection with routing-aware weights ----
+        if pi_init is not None and D is not None:
+            weights = [
+                self.compute_routing_aware_weight(result, pi_init, D, E)
+                for result in optimized_results[:len(allparts)]
+            ]
+        else:
+            weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
+
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
-        L = topo_sort_partitions(circ, self.config["max_partition_size"], parts)
+        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], parts)
         from squander.partitioning.kahn import kahn_partition_preparts
         from squander.partitioning.tools import translate_param_order
-        partitioned_circuit, param_order, _ = kahn_partition_preparts(circ, self.config["max_partition_size"], [parts[i] for i in L])
-        parameters = translate_param_order(orig_parameters, param_order)
+        partitioned_circuit, param_order, _ = kahn_partition_preparts(working_circ, self.config["max_partition_size"], [parts[i] for i in L])
+        parameters = translate_param_order(working_parameters, param_order)
 
+        # ---- Phase 4: Stage 2 synthesis (Full) ----
         subcircuits = partitioned_circuit.get_Gates()
-
-        # the list of optimized subcircuits
         optimized_partitions = [None] * len(subcircuits)
 
         with Pool(processes=mp.cpu_count()) as pool:
@@ -372,21 +563,26 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 start_idx = subcircuit.get_Parameter_Start_Index()
                 end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
                 subcircuit_parameters = parameters[ start_idx:end_idx ]
-                qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
                 involved_qbits = subcircuit.get_Qbits()
 
-                qbit_num = len( involved_qbits )
-                mini_topologies = get_unique_subtopologies(self.topology, qbit_num)
+                qbit_num_sub = len( involved_qbits )
+                mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
                 qbit_map = {}
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
-                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num )
+                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
                 optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Full, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
-                
-        
+
+        # ---- Phase 5: Store metadata and return ----
+        self._last_synthesis_metadata = {
+            'E': E,
+            'levels_processed': (effective_start, effective_end),
+            'total_levels': total_levels,
+        }
+
         return optimized_partitions
 
     # ------------------------------------------------------------------------
@@ -394,36 +590,110 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
     # ------------------------------------------------------------------------
 
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
-        
-        optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
-        
-        # Initialize topology candidates in PartitionSynthesisResult objects
-        for partition in optimized_partitions:
-            if isinstance(partition, PartitionSynthesisResult):
-                partition._topology = self.topology
-                partition._topology_cache = self._topology_cache
+        N = circ.get_Qbit_Num()
+
+        # Pre-process: group into 2-qubit blocks (skip if no 2-qubit gates)
+        has_2q_gates = any(len(g.get_Involved_Qbits()) >= 2 for g in circ.get_Gates())
+        if has_2q_gates:
+            grouped_circ, grouped_params = self._group_circuit_with_params(circ, orig_parameters)
+        else:
+            grouped_circ, grouped_params = circ, orig_parameters
+
+        window_size = self.config.get('window_size', 0)
+        levels = self.generate_DAG_levels(grouped_circ)
+        total_levels = len(levels)
+
+        # ---- Full-circuit path (backward compat) ----
+        if window_size <= 0 or window_size >= total_levels:
+            optimized_partitions = self.SynthesizeWideCircuit(grouped_circ, grouped_params)
+
+            for partition in optimized_partitions:
+                if isinstance(partition, PartitionSynthesisResult):
+                    partition._topology = self.topology
+                    partition._topology_cache = self._topology_cache
+
+            DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
+
+            D = self.compute_distances_bfs(N)
+            pi = self._compute_smart_initial_layout(circ, N, D)
+
+            F = self.get_initial_layer(IDAG, N, optimized_partitions)
+            scoring_partitions = self._build_scoring_partitions(optimized_partitions)
+
+            partition_order, pi, pi_initial = self.Heuristic_Search(F, pi.copy(), DAG, IDAG, optimized_partitions, scoring_partitions, D)
+
+            final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
+
+            return final_circuit, final_parameters, pi_initial, pi
+
+        # ---- Windowed mode ----
+        D = self.compute_distances_bfs(N)
+        pi = self._compute_smart_initial_layout(circ, N, D)
+        pi_initial = pi.copy()
+
+        all_window_circuits = []
+        all_window_params = []
+
+        for window_start in range(0, total_levels, window_size):
+            window_end = min(window_start + window_size, total_levels)
+
+            # a. Synthesize this window
+            window_partitions = self.SynthesizeWideCircuit(
+                grouped_circ, grouped_params,
+                pi_init=pi, DAG_start=window_start, DAG_end=window_end
+            )
+
+            # Skip empty windows
+            if not window_partitions:
+                continue
+
+            # Retrieve virtual outgoing gates computed by SynthesizeWideCircuit
+            virtual_E = self._last_synthesis_metadata.get('E', []) or []
+
+            # b. Set topology info on partition results
+            for partition in window_partitions:
+                if isinstance(partition, PartitionSynthesisResult):
+                    partition._topology = self.topology
+                    partition._topology_cache = self._topology_cache
+
+            # c. Build per-window structures
+            DAG, IDAG = self.construct_DAG_and_IDAG(window_partitions)
+            F = self.get_initial_layer(IDAG, N, window_partitions)
+            scoring_partitions = self._build_scoring_partitions(window_partitions)
+
+            # d. Heuristic search for this window (pi carries forward)
+            partition_order, pi, _ = self.Heuristic_Search(
+                F, pi.copy(), DAG, IDAG,
+                window_partitions, scoring_partitions, D,
+                virtual_E=virtual_E if virtual_E else None
+            )
+
+            # e. Construct window circuit
+            window_circuit, window_params = self.Construct_circuit_from_HS(
+                partition_order, window_partitions, N
+            )
+
+            # f. Append results
+            all_window_circuits.append(window_circuit)
+            all_window_params.append(window_params)
+
+        # Concatenate all window circuits and parameters
+        final_circuit = Circuit(N)
+        for wc in all_window_circuits:
+            final_circuit.add_Circuit(wc)
+
+        if all_window_params:
+            final_parameters = np.concatenate(all_window_params, axis=0)
+        else:
+            final_parameters = np.array([])
 
-        
-        DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
-        sDAG = self.construct_sDAG(optimized_partitions)
-        
-        D = self.compute_distances_bfs(circ.get_Qbit_Num())
-        pi = self._compute_smart_initial_layout(circ, circ.get_Qbit_Num(), D)
-        
-        F = self.get_initial_layer(IDAG, circ.get_Qbit_Num(),optimized_partitions)
-        scoring_partitions = self._build_scoring_partitions(optimized_partitions)
-        
-        partition_order, pi, pi_initial = self.Heuristic_Search(F,pi.copy(),DAG,IDAG, optimized_partitions,scoring_partitions,D, sDAG)
-        
-        final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order,optimized_partitions, circ.get_Qbit_Num())
-        
         return final_circuit, final_parameters, pi_initial, pi
 
     # ------------------------------------------------------------------------
     # Heuristic Search
     # ------------------------------------------------------------------------
 
-    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, sDAG):
+    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, virtual_E=None):
         pi_initial = pi.copy()
 
         resolved_partitions = [False] * len(DAG)
@@ -447,13 +717,13 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                     if parents_resolved:
                         F.append(child)
 
-        
+
         # Initialize progress bar
         total_partitions = len(DAG)
-        pbar = tqdm(total=total_partitions, desc="Heuristic Search", 
-                   bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved', 
+        pbar = tqdm(total=total_partitions, desc="Heuristic Search",
+                   bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved',
                    disable=self.config.get('progressbar', 0) == False)
-        
+
         configured_workers = self.config.get('hs_score_workers', os.cpu_count() or 1)
         score_workers = max(1, configured_workers if configured_workers else 1)
         executor: Optional[ProcessPoolExecutor] = None
@@ -462,7 +732,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 executor = ProcessPoolExecutor(
                     max_workers=score_workers,
                     initializer=_init_scoring_worker,
-                    initargs=(scoring_partitions, sDAG, D),
+                    initargs=(scoring_partitions, D, virtual_E),
                 )
             except Exception as exc:
                 logging.warning(
@@ -491,9 +761,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             F_snapshot,
                             pi,
                             scoring_partitions,
-                            sDAG,
                             D,
                             self._swap_cache,
+                            virtual_E,
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -575,74 +845,36 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def score_partition_candidate(partition_candidate, F,  pi, scoring_partitions, sDAG, D, swap_cache):
+    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache, virtual_E=None):
         score_F = 0
-        score_E = 0
-        E_partitions = set()  # Changed to set for O(1) membership checks
-        E_partitions_1 = set()
-        E_partitions_2 = set()
         swap_weight = 4
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
-        score_F += swap_weight*len(swaps)*3
+        score_F += swap_weight * len(swaps) * 3
         score_F += len(partition_candidate.circuit_structure)
 
-        # Safety check: ensure partition_idx is valid for sDAG
-        if partition_candidate.partition_idx < len(sDAG):
-            for partition_idx in sDAG[partition_candidate.partition_idx]:
-                if partition_idx in E_partitions:
-                    continue
-                E_partitions.add(partition_idx)
-
-
         for partition_idx in F:
             partition = scoring_partitions[partition_idx]
             if partition is None or partition_idx == partition_candidate.partition_idx:
                 continue
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                dist_placeholder = swap_weight*3*calculate_dist_small(mini_topology,partition.qubit_map,D,output_perm)
+                dist_placeholder = swap_weight * 3 * calculate_dist_small(mini_topology, partition.qubit_map, D, output_perm)
                 circuit_length = np.min([len(circ) for circ in partition.circuit_structures[tdx]])
-                score = dist_placeholder + circuit_length
-                mini_scores.append(score)
+                mini_scores.append(dist_placeholder + circuit_length)
             if mini_scores:
                 score_F += np.min(mini_scores)
 
-            # Safety check: ensure partition_idx is valid for sDAG
-            if partition_idx < len(sDAG):
-                for partition_idx_E in sDAG[partition_idx]:
-                    if partition_idx_E in E_partitions:
-                        continue
-                    E_partitions.add(partition_idx_E)
+        # Virtual outgoing gate penalty: cross-window look-ahead
+        virtual_e_score = 0.0
+        if virtual_E:
+            for (q_a, q_b) in virtual_E:
+                dist = D[int(output_perm[q_a])][int(output_perm[q_b])]
+                if not np.isinf(dist):
+                    virtual_e_score += max(0, (dist - 1)) * 3
 
-        #check the secondary children            
-        for partition_idx in E_partitions: 
-            if partition_idx < len(sDAG):
-                for partition_idx_E in sDAG[partition_idx]:
-                    if partition_idx_E in E_partitions or partition_idx_E in E_partitions_1:
-                        continue
-                    E_partitions_1.add(partition_idx_E)
-        #score all
-        for partition_idx in E_partitions:
-            mini_scores = []
-            partition_result = scoring_partitions[partition_idx]
-            if partition_result is None:
-                continue
-            for tdx, mini_topology in enumerate(partition_result.mini_topologies):
-                dist_placeholder = 3*calculate_dist_small(mini_topology,partition_result.qubit_map,D,output_perm)
-                circuit_length = np.min([len(circ) for circ in partition_result.circuit_structures[tdx]])
-                score = dist_placeholder + circuit_length
-                mini_scores.append(score)
-            if mini_scores:
-                score_E += np.min(mini_scores)
+        E_score = 0.3 * virtual_e_score if virtual_e_score > 0.0 else 0.0
+        F_score = 0.7 * score_F
 
-        coeff_E = 0.3
-        if len(E_partitions) == 0:
-            E_score = 0.0
-        else:
-            E_score = coeff_E * score_E 
-        
-        F_score = 0.7*score_F
-        
         return E_score + F_score
 
     # ------------------------------------------------------------------------
@@ -959,4 +1191,37 @@ def generate_DAG_levels(self, circuit):
             current_level = next_level
         
         return levels
-    
\ No newline at end of file
+
+    def get_gate_DAG_level(self, circuit, gate_idx):
+        """
+        Find the DAG level a specific gate belongs to.
+
+        Args:
+            circuit: The quantum circuit to analyze
+            gate_idx: Index of the gate within the circuit's gate list
+
+        Returns:
+            int: The DAG level the gate belongs to (0-indexed), or -1 if not found.
+        """
+        levels = self.generate_DAG_levels(circuit)
+        for level_idx, level_gates in enumerate(levels):
+            if gate_idx in level_gates:
+                return level_idx
+        return -1
+
+    def get_gate_DAG_level_map(self, circuit):
+        """
+        Build a mapping from gate index to its DAG level.
+
+        Args:
+            circuit: The quantum circuit to analyze
+
+        Returns:
+            dict: Mapping {gate_idx: level} for every gate in the circuit.
+        """
+        levels = self.generate_DAG_levels(circuit)
+        gate_to_level = {}
+        for level_idx, level_gates in enumerate(levels):
+            for gate_idx in level_gates:
+                gate_to_level[gate_idx] = level_idx
+        return gate_to_level

From fc249adc0863e9f7e29feb983cd62a7e6b75558e Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 11:42:25 +0100
Subject: [PATCH 074/232] Fix get all partitions

---
 squander/synthesis/PartAM.py       | 20 ++++++++++++++------
 squander/synthesis/PartAM_utils.py | 10 +++++++++-
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index cf875ed0a..b441d2325 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -286,8 +286,13 @@ def _group_circuit_with_params(circuit, parameters):
 
         # Trailing single-qubit gates
         for q, gate_indices in pending.items():
-            if gate_indices and q in last_block_for_qubit:
+            if not gate_indices:
+                continue
+            if q in last_block_for_qubit:
                 block_gate_orders[last_block_for_qubit[q]].extend(gate_indices)
+            else:
+                # Qubit only has single-qubit gates — standalone block
+                block_gate_orders.append(list(gate_indices))
 
         # Build parameter reordering from original gate indices
         param_indices = []
@@ -485,9 +490,12 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
             D = None
 
         # ---- Phase 1: Partition enumeration ----
-        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
-        qbit_num_orig_circuit = working_circ.get_Qbit_Num()
-        gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
+        # Flatten the circuit so get_all_partitions sees individual gates
+        # (not Circuit blocks from group_into_two_qubit_blocks)
+        flat_circ = working_circ.get_Flat_Circuit()
+        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(flat_circ, self.config["max_partition_size"])
+        qbit_num_orig_circuit = flat_circ.get_Qbit_Num()
+        gate_dict = {i: gate for i, gate in enumerate(flat_circ.get_Gates())}
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
@@ -547,10 +555,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
 
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
-        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], parts)
+        L = topo_sort_partitions(flat_circ, self.config["max_partition_size"], parts)
         from squander.partitioning.kahn import kahn_partition_preparts
         from squander.partitioning.tools import translate_param_order
-        partitioned_circuit, param_order, _ = kahn_partition_preparts(working_circ, self.config["max_partition_size"], [parts[i] for i in L])
+        partitioned_circuit, param_order, _ = kahn_partition_preparts(flat_circ, self.config["max_partition_size"], [parts[i] for i in L])
         parameters = translate_param_order(working_parameters, param_order)
 
         # ---- Phase 4: Stage 2 synthesis (Full) ----
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 0813cdef8..d523aefb9 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -538,10 +538,18 @@ def group_into_two_qubit_blocks(circuit: Circuit) -> Circuit:
 
     # Append trailing single-qubit gates to the last block that touched that qubit
     for q, gates_list in pending.items():
-        if gates_list and q in last_block_for_qubit:
+        if not gates_list:
+            continue
+        if q in last_block_for_qubit:
             block = blocks[last_block_for_qubit[q]]
             for g in gates_list:
                 block.add_Gate(g)
+        else:
+            # Qubit only has single-qubit gates — create a standalone block
+            block = Circuit(N)
+            for g in gates_list:
+                block.add_Gate(g)
+            blocks.append(block)
 
     result = Circuit(N)
     for block in blocks:

From 6fec10e7469fb4243a2d4a5f6f2cd7519e1ac468 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 11:52:01 +0100
Subject: [PATCH 075/232] Fix again

---
 squander/synthesis/PartAM.py | 137 ++++++++++++++++++-----------------
 1 file changed, 71 insertions(+), 66 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index b441d2325..3a5009310 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -257,11 +257,14 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
         return scoring_partitions
 
     @staticmethod
-    def _group_circuit_with_params(circuit, parameters):
+    def _group_circuit_for_levels(circuit):
         """
-        Group a flat circuit into 2-qubit blocks and reorder parameters to match.
-        Replicates the gate ordering logic of group_into_two_qubit_blocks to build
-        a parameter array consistent with the grouped circuit's parameter space.
+        Group a flat circuit into 2-qubit blocks for coarser DAG level generation.
+
+        Returns:
+            grouped_circ: Circuit with 2-qubit blocks as top-level elements
+            block_gate_orders: List of lists mapping each block index to the
+                               original flat gate indices it contains
         """
         gates = circuit.get_Gates()
 
@@ -294,22 +297,9 @@ def _group_circuit_with_params(circuit, parameters):
                 # Qubit only has single-qubit gates — standalone block
                 block_gate_orders.append(list(gate_indices))
 
-        # Build parameter reordering from original gate indices
-        param_indices = []
-        for block_order in block_gate_orders:
-            for g_idx in block_order:
-                gate = gates[g_idx]
-                start = gate.get_Parameter_Start_Index()
-                num = gate.get_Parameter_Num()
-                param_indices.extend(range(start, start + num))
-
         grouped_circ = group_into_two_qubit_blocks(circuit)
-        if param_indices:
-            grouped_params = parameters[np.array(param_indices)]
-        else:
-            grouped_params = np.array([])
 
-        return grouped_circ, grouped_params
+        return grouped_circ, block_gate_orders
 
     # ------------------------------------------------------------------------
     # Partition Decomposition Methods
@@ -399,13 +389,13 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
     # Circuit Synthesis
     # ------------------------------------------------------------------------
 
-    def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG_start=0, DAG_end=0):
+    def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
+                              DAG_start=0, DAG_end=0, window_gate_indices=None):
         """
-        Partition and synthesize a circuit, optionally restricted to a window
-        of K DAG levels.
+        Partition and synthesize a circuit, optionally restricted to a window.
 
         Args:
-            circ: The full quantum circuit
+            circ: The full quantum circuit (must be flat — no subcircuit blocks)
             orig_parameters: Parameters for circ
             pi_init: Current qubit permutation (logical->physical). When provided,
                      enables routing-aware ILP scoring.
@@ -413,6 +403,8 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
                when None and a window is active.
             DAG_start: First DAG level to process (inclusive)
             DAG_end: Last DAG level to process (exclusive). 0 means all levels.
+            window_gate_indices: Optional list of gate indices (into circ) to
+                process. When provided, overrides DAG_start/DAG_end.
 
         Returns:
             optimized_partitions: List of PartitionSynthesisResult / SingleQubitPartitionResult
@@ -420,33 +412,38 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
         # ---- Phase 0: Window extraction ----
         all_gates = circ.get_Gates()
         qbit_num = circ.get_Qbit_Num()
-        levels = self.generate_DAG_levels(circ)
-        total_levels = len(levels)
 
-        # Backward compatibility: DAG_start=0, DAG_end=0 means all levels
-        if DAG_start == 0 and DAG_end == 0:
-            effective_end = total_levels
+        if window_gate_indices is not None:
+            # Window specified by explicit gate indices
+            window_topo_order = list(window_gate_indices)
+            window_gate_set = set(window_topo_order)
+            full_circuit_mode = (len(window_gate_set) == len(all_gates))
+            has_gates_beyond_window = len(window_gate_set) < len(all_gates)
         else:
-            effective_end = min(DAG_end, total_levels)
-        effective_start = DAG_start
-
-        # Guard: empty window
-        if effective_start >= total_levels or effective_start >= effective_end:
-            self._last_synthesis_metadata = {
-                'E': E if E is not None else [],
-                'levels_processed': (effective_start, effective_end),
-                'total_levels': total_levels,
-            }
-            return []
-
-        # Collect window gate indices in topological order (level by level)
-        window_topo_order = []
-        for level_idx in range(effective_start, effective_end):
-            window_topo_order.extend(levels[level_idx])
-        window_gate_set = set(window_topo_order)
+            # Window specified by DAG level range
+            levels = self.generate_DAG_levels(circ)
+            total_levels = len(levels)
 
-        # Determine if we're processing the full circuit
-        full_circuit_mode = (len(window_gate_set) == len(all_gates))
+            if DAG_start == 0 and DAG_end == 0:
+                effective_end = total_levels
+            else:
+                effective_end = min(DAG_end, total_levels)
+            effective_start = DAG_start
+
+            if effective_start >= total_levels or effective_start >= effective_end:
+                self._last_synthesis_metadata = {
+                    'E': E if E is not None else [],
+                    'window_gates': 0,
+                    'total_gates': len(all_gates),
+                }
+                return []
+
+            window_topo_order = []
+            for level_idx in range(effective_start, effective_end):
+                window_topo_order.extend(levels[level_idx])
+            window_gate_set = set(window_topo_order)
+            full_circuit_mode = (len(window_gate_set) == len(all_gates))
+            has_gates_beyond_window = effective_end < total_levels
 
         if full_circuit_mode:
             working_circ = circ
@@ -468,7 +465,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
                 working_parameters = np.array([])
 
         # ---- Phase 0b: Identify virtual outgoing gates (E) ----
-        if E is None and not full_circuit_mode and effective_end < total_levels:
+        if E is None and not full_circuit_mode and has_gates_beyond_window:
             E = []
             for orig_idx in window_topo_order:
                 gate = all_gates[orig_idx]
@@ -490,12 +487,9 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
             D = None
 
         # ---- Phase 1: Partition enumeration ----
-        # Flatten the circuit so get_all_partitions sees individual gates
-        # (not Circuit blocks from group_into_two_qubit_blocks)
-        flat_circ = working_circ.get_Flat_Circuit()
-        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(flat_circ, self.config["max_partition_size"])
-        qbit_num_orig_circuit = flat_circ.get_Qbit_Num()
-        gate_dict = {i: gate for i, gate in enumerate(flat_circ.get_Gates())}
+        allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
+        qbit_num_orig_circuit = working_circ.get_Qbit_Num()
+        gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
@@ -555,10 +549,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
 
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
-        L = topo_sort_partitions(flat_circ, self.config["max_partition_size"], parts)
+        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], parts)
         from squander.partitioning.kahn import kahn_partition_preparts
         from squander.partitioning.tools import translate_param_order
-        partitioned_circuit, param_order, _ = kahn_partition_preparts(flat_circ, self.config["max_partition_size"], [parts[i] for i in L])
+        partitioned_circuit, param_order, _ = kahn_partition_preparts(working_circ, self.config["max_partition_size"], [parts[i] for i in L])
         parameters = translate_param_order(working_parameters, param_order)
 
         # ---- Phase 4: Stage 2 synthesis (Full) ----
@@ -587,8 +581,8 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
         # ---- Phase 5: Store metadata and return ----
         self._last_synthesis_metadata = {
             'E': E,
-            'levels_processed': (effective_start, effective_end),
-            'total_levels': total_levels,
+            'window_gates': len(window_gate_set),
+            'total_gates': len(all_gates),
         }
 
         return optimized_partitions
@@ -600,20 +594,22 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None, DAG
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         N = circ.get_Qbit_Num()
 
-        # Pre-process: group into 2-qubit blocks (skip if no 2-qubit gates)
+        # Pre-process: group circuit for coarser DAG levels (window boundaries)
         has_2q_gates = any(len(g.get_Involved_Qbits()) >= 2 for g in circ.get_Gates())
         if has_2q_gates:
-            grouped_circ, grouped_params = self._group_circuit_with_params(circ, orig_parameters)
+            grouped_circ, block_gate_orders = self._group_circuit_for_levels(circ)
         else:
-            grouped_circ, grouped_params = circ, orig_parameters
+            grouped_circ = circ
+            block_gate_orders = None
 
         window_size = self.config.get('window_size', 0)
-        levels = self.generate_DAG_levels(grouped_circ)
-        total_levels = len(levels)
+        grouped_levels = self.generate_DAG_levels(grouped_circ)
+        total_levels = len(grouped_levels)
 
         # ---- Full-circuit path (backward compat) ----
         if window_size <= 0 or window_size >= total_levels:
-            optimized_partitions = self.SynthesizeWideCircuit(grouped_circ, grouped_params)
+            # Pass the original flat circuit — no grouping needed for full circuit
+            optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
 
             for partition in optimized_partitions:
                 if isinstance(partition, PartitionSynthesisResult):
@@ -645,10 +641,19 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         for window_start in range(0, total_levels, window_size):
             window_end = min(window_start + window_size, total_levels)
 
-            # a. Synthesize this window
+            # Expand grouped block indices to flat gate indices
+            window_gate_indices = []
+            for level_idx in range(window_start, window_end):
+                for block_idx in grouped_levels[level_idx]:
+                    if block_gate_orders is not None:
+                        window_gate_indices.extend(block_gate_orders[block_idx])
+                    else:
+                        window_gate_indices.append(block_idx)
+
+            # a. Synthesize this window (pass original flat circuit)
             window_partitions = self.SynthesizeWideCircuit(
-                grouped_circ, grouped_params,
-                pi_init=pi, DAG_start=window_start, DAG_end=window_end
+                circ, orig_parameters,
+                pi_init=pi, window_gate_indices=window_gate_indices
             )
 
             # Skip empty windows

From 61f9181982464d416e601cb39faca4fa39bebd71 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 12:58:14 +0100
Subject: [PATCH 076/232] Update window size remove OSR

---
 examples/decomposition/PartAM_example.py | 2 +-
 squander/synthesis/PartAM.py             | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 82cba7c1e..315bb7164 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -104,7 +104,7 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
         'max_partition_size': 3,
         'progressbar': True,
         'topology': topology,
-        'window_size': 3,
+        'window_size': 5,
     }
 
     start_time = time.time()
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3a5009310..3c2347a6d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -126,6 +126,7 @@ def __init__(self, config):
         self.config.setdefault('optimizer', 'BFGS')
         self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
         self.config.setdefault('window_size', 0)  # 0 = full circuit (backward compat)
+        self.config.setdefault('use_osr',0)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:

From 0e93f38bb90c4f71ad6a3cea001c1744b51e235b Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 13:22:11 +0100
Subject: [PATCH 077/232] Add DAG position based partition weight

---
 squander/synthesis/PartAM.py | 69 ++++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 27 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3c2347a6d..9292bea20 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -140,20 +140,22 @@ def __init__(self, config):
     # Scoring Methods
     # ------------------------------------------------------------------------
 
-    def compute_routing_aware_weight(self, result, pi_init, D, E):
+    def compute_routing_aware_weight(self, result, pi_init, D, E, dag_position=0.5):
         """
         Compute a routing-aware ILP weight for a partition synthesis result.
 
-        Combines three components:
-        1. Base synthesis cost (min CNOT count across topologies)
-        2. Routing cost from pi_init (estimated SWAP overhead)
-        3. Virtual outgoing gate penalty (future routing constraints)
+        Evaluates each (topology, permutation) combination together and uses
+        DAG-position-dependent weighting:
+        - Early partitions (dag_position~0): routing cost weighted higher
+        - Late partitions (dag_position~1): E penalty weighted higher
 
         Args:
             result: PartitionSynthesisResult or SingleQubitPartitionResult
             pi_init: Current qubit layout (logical -> physical mapping)
             D: Distance matrix between physical qubits
             E: List of (q_a, q_b) tuples for virtual outgoing 2-qubit gates
+            dag_position: Float in [0, 1] indicating partition's relative
+                depth in the DAG (0 = start, 1 = end)
 
         Returns:
             float: Combined weight (lower is better)
@@ -161,27 +163,33 @@ def compute_routing_aware_weight(self, result, pi_init, D, E):
         if isinstance(result, SingleQubitPartitionResult):
             return 0
 
-        # 1. Base synthesis cost
-        base_cost = result.get_partition_synthesis_score()
+        # Position-dependent weighting
+        routing_weight = 1.0 - dag_position
+        e_weight = dag_position
 
-        # 2. Routing cost: best (minimum) across topologies
-        routing_cost = np.inf
-        for mini_topology in result.mini_topologies:
-            dist = calculate_dist_small(mini_topology, result.qubit_map, D, pi_init)
-            routing_cost = min(routing_cost, dist)
-        if np.isinf(routing_cost):
-            routing_cost = 0
-
-        # 3. Virtual outgoing gate penalty
+        # E penalty
         involved = set(result.involved_qbits)
         e_penalty = 0.0
-        for (q_a, q_b) in E:
-            if q_a in involved or q_b in involved:
-                dist = D[pi_init[q_a]][pi_init[q_b]]
-                if not np.isinf(dist):
-                    e_penalty += max(0, (dist - 1)) * 3
-
-        return base_cost + routing_cost + 0.3 * e_penalty
+        if E:
+            for (q_a, q_b) in E:
+                if q_a in involved or q_b in involved:
+                    dist = D[pi_init[q_a]][pi_init[q_b]]
+                    if not np.isinf(dist):
+                        e_penalty += max(0, (dist - 1)) * 3
+
+        # Evaluate each (topology, permutation) combination together
+        best_score = np.inf
+        for tdx, mini_topology in enumerate(result.mini_topologies):
+            routing_cost = calculate_dist_small(mini_topology, result.qubit_map, D, pi_init)
+            for pdx in range(len(result.cnot_counts[tdx])):
+                cnot_count = result.cnot_counts[tdx][pdx]
+                score = cnot_count + routing_weight * routing_cost
+                best_score = min(best_score, score)
+
+        if np.isinf(best_score):
+            best_score = 0
+
+        return best_score + e_weight * e_penalty
 
     # ------------------------------------------------------------------------
     # Caching Methods
@@ -541,10 +549,17 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
 
         # ---- Phase 3: ILP partition selection with routing-aware weights ----
         if pi_init is not None and D is not None:
-            weights = [
-                self.compute_routing_aware_weight(result, pi_init, D, E)
-                for result in optimized_results[:len(allparts)]
-            ]
+            gate_to_level = self.get_gate_DAG_level_map(working_circ)
+            max_level = max(gate_to_level.values()) if gate_to_level else 0
+
+            weights = []
+            for idx, result in enumerate(optimized_results[:len(allparts)]):
+                partition_gates = allparts[idx]
+                part_depth = max(gate_to_level.get(g, 0) for g in partition_gates)
+                dag_position = part_depth / max_level if max_level > 0 else 0.5
+                weights.append(
+                    self.compute_routing_aware_weight(result, pi_init, D, E, dag_position)
+                )
         else:
             weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
 

From 9db7f56e4d8a99bd64c098fe6ba37296752cc43f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 15:22:27 +0100
Subject: [PATCH 078/232] Add proper free routing

---
 squander/synthesis/PartAM.py | 202 +++++++++++++++++++++++++----------
 1 file changed, 146 insertions(+), 56 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 9292bea20..efd0630b8 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -81,14 +81,15 @@ def _init_scoring_worker(scoring_partitions, distance_matrix, virtual_E=None):
 def _score_candidate_worker(payload):
     """
     Worker wrapper that reconstructs scoring inputs from a lightweight payload.
-    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot)
+    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot[, free_routing])
     """
     if (
         _WORKER_SCORING_PARTITIONS is None
         or _WORKER_DISTANCE_MATRIX is None
     ):
         raise RuntimeError("Scoring worker not initialized with shared data.")
-    partition_candidate, F_snapshot, pi_snapshot = payload
+    partition_candidate, F_snapshot, pi_snapshot = payload[:3]
+    free_routing = payload[3] if len(payload) > 3 else False
     return qgd_Partition_Aware_Mapping.score_partition_candidate(
         partition_candidate,
         F_snapshot,
@@ -97,6 +98,7 @@ def _score_candidate_worker(payload):
         _WORKER_DISTANCE_MATRIX,
         _WORKER_SWAP_CACHE,
         _WORKER_VIRTUAL_E,
+        free_routing=free_routing,
     )
 
 
@@ -140,22 +142,24 @@ def __init__(self, config):
     # Scoring Methods
     # ------------------------------------------------------------------------
 
-    def compute_routing_aware_weight(self, result, pi_init, D, E, dag_position=0.5):
+    def compute_routing_aware_weight(self, result, pi_init, D, E, dag_start=0.0, dag_end=1.0):
         """
         Compute a routing-aware ILP weight for a partition synthesis result.
 
         Evaluates each (topology, permutation) combination together and uses
         DAG-position-dependent weighting:
-        - Early partitions (dag_position~0): routing cost weighted higher
-        - Late partitions (dag_position~1): E penalty weighted higher
+        - Early partitions (dag_start~0): routing cost weighted higher
+        - Late partitions (dag_end~1): E penalty weighted higher
+        - Partitions spanning the full window get both weights high
 
         Args:
             result: PartitionSynthesisResult or SingleQubitPartitionResult
-            pi_init: Current qubit layout (logical -> physical mapping)
+            pi_init: Current qubit layout (logical -> physical mapping).
+                     None when routing is free (first window).
             D: Distance matrix between physical qubits
             E: List of (q_a, q_b) tuples for virtual outgoing 2-qubit gates
-            dag_position: Float in [0, 1] indicating partition's relative
-                depth in the DAG (0 = start, 1 = end)
+            dag_start: Float in [0, 1] — earliest DAG level of the partition
+            dag_end: Float in [0, 1] — latest DAG level of the partition
 
         Returns:
             float: Combined weight (lower is better)
@@ -164,8 +168,8 @@ def compute_routing_aware_weight(self, result, pi_init, D, E, dag_position=0.5):
             return 0
 
         # Position-dependent weighting
-        routing_weight = 1.0 - dag_position
-        e_weight = dag_position
+        routing_weight = 1.0 - dag_start
+        e_weight = dag_end
 
         # E penalty
         involved = set(result.involved_qbits)
@@ -173,14 +177,21 @@ def compute_routing_aware_weight(self, result, pi_init, D, E, dag_position=0.5):
         if E:
             for (q_a, q_b) in E:
                 if q_a in involved or q_b in involved:
-                    dist = D[pi_init[q_a]][pi_init[q_b]]
-                    if not np.isinf(dist):
-                        e_penalty += max(0, (dist - 1)) * 3
+                    if pi_init is not None:
+                        dist = D[pi_init[q_a]][pi_init[q_b]]
+                        if not np.isinf(dist):
+                            e_penalty += max(0, (dist - 1)) * 3
+                    else:
+                        # No layout yet — fixed penalty per touching E edge
+                        e_penalty += 3.0
 
         # Evaluate each (topology, permutation) combination together
         best_score = np.inf
         for tdx, mini_topology in enumerate(result.mini_topologies):
-            routing_cost = calculate_dist_small(mini_topology, result.qubit_map, D, pi_init)
+            if pi_init is not None:
+                routing_cost = calculate_dist_small(mini_topology, result.qubit_map, D, pi_init)
+            else:
+                routing_cost = 0
             for pdx in range(len(result.cnot_counts[tdx])):
                 cnot_count = result.cnot_counts[tdx][pdx]
                 score = cnot_count + routing_weight * routing_cost
@@ -217,6 +228,45 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
         
         return self._topology_cache[canonical_key]
 
+    @staticmethod
+    def _compute_ideal_pi_for_candidate(candidate, N):
+        """
+        Compute the ideal pi_initial such that the given candidate needs zero
+        SWAPs for input routing, plus the resulting pi_output after the partition.
+
+        Returns:
+            pi_initial: np.ndarray — layout where partition qubits are already
+                        at their required physical positions.
+            pi_output:  np.ndarray — layout after the partition circuit (P_o applied).
+        """
+        P_i_inv = [candidate.P_i.index(i) for i in range(len(candidate.P_i))]
+
+        # Required physical position for each partition qubit
+        required = {}
+        for k, v in candidate.qbit_map.items():
+            required[k] = candidate.node_mapping[P_i_inv[v]]
+
+        pi_initial = np.zeros(N, dtype=int)
+        used_physical = set(required.values())
+
+        for k, p in required.items():
+            pi_initial[k] = p
+
+        remaining_physical = sorted(p for p in range(N) if p not in used_physical)
+        remaining_logical = sorted(q for q in range(N) if q not in required)
+        for q, p in zip(remaining_logical, remaining_physical):
+            pi_initial[q] = p
+
+        # Apply P_o to get output permutation (mirrors transform_pi logic)
+        pi_output = np.array(pi_initial, dtype=int)
+        qbit_map_inverse = {v: k for k, v in candidate.qbit_map.items()}
+        for q_star in range(len(candidate.P_o)):
+            if q_star in qbit_map_inverse:
+                k = qbit_map_inverse[q_star]
+                pi_output[k] = candidate.node_mapping[candidate.P_o[q_star]]
+
+        return pi_initial, pi_output
+
     def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[PartitionScoreData]]:
         """
         Create lightweight, picklable views of partitions that contain only the
@@ -489,11 +539,8 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
         elif E is None:
             E = []
 
-        # ---- Phase 0c: Compute distance matrix if routing-aware ----
-        if pi_init is not None:
-            D = self.compute_distances_bfs(qbit_num)
-        else:
-            D = None
+        # ---- Phase 0c: Compute distance matrix ----
+        D = self.compute_distances_bfs(qbit_num)
 
         # ---- Phase 1: Partition enumeration ----
         allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
@@ -548,20 +595,19 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
 
         # ---- Phase 3: ILP partition selection with routing-aware weights ----
-        if pi_init is not None and D is not None:
-            gate_to_level = self.get_gate_DAG_level_map(working_circ)
-            max_level = max(gate_to_level.values()) if gate_to_level else 0
-
-            weights = []
-            for idx, result in enumerate(optimized_results[:len(allparts)]):
-                partition_gates = allparts[idx]
-                part_depth = max(gate_to_level.get(g, 0) for g in partition_gates)
-                dag_position = part_depth / max_level if max_level > 0 else 0.5
-                weights.append(
-                    self.compute_routing_aware_weight(result, pi_init, D, E, dag_position)
-                )
-        else:
-            weights = [result.get_partition_synthesis_score() for result in optimized_results[:len(allparts)]]
+        gate_to_level = self.get_gate_DAG_level_map(working_circ)
+        max_level = max(gate_to_level.values()) if gate_to_level else 0
+
+        weights = []
+        for idx, result in enumerate(optimized_results[:len(allparts)]):
+            partition_gates = allparts[idx]
+            part_start = min(gate_to_level.get(g, 0) for g in partition_gates)
+            part_end = max(gate_to_level.get(g, 0) for g in partition_gates)
+            dag_start = part_start / max_level if max_level > 0 else 0.0
+            dag_end = part_end / max_level if max_level > 0 else 1.0
+            weights.append(
+                self.compute_routing_aware_weight(result, pi_init, D, E, dag_start, dag_end)
+            )
 
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
@@ -635,12 +681,12 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
             DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
 
             D = self.compute_distances_bfs(N)
-            pi = self._compute_smart_initial_layout(circ, N, D)
+            pi = np.arange(N)  # Dummy — free_initial_routing will derive pi_initial
 
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
             scoring_partitions = self._build_scoring_partitions(optimized_partitions)
 
-            partition_order, pi, pi_initial = self.Heuristic_Search(F, pi.copy(), DAG, IDAG, optimized_partitions, scoring_partitions, D)
+            partition_order, pi, pi_initial = self.Heuristic_Search(F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, free_initial_routing=True)
 
             final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
 
@@ -648,14 +694,15 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
 
         # ---- Windowed mode ----
         D = self.compute_distances_bfs(N)
-        pi = self._compute_smart_initial_layout(circ, N, D)
-        pi_initial = pi.copy()
+        pi = np.arange(N)  # Dummy for first window — free_initial_routing derives pi_initial
+        pi_initial = None
 
         all_window_circuits = []
         all_window_params = []
 
         for window_start in range(0, total_levels, window_size):
             window_end = min(window_start + window_size, total_levels)
+            is_first_window = (window_start == 0)
 
             # Expand grouped block indices to flat gate indices
             window_gate_indices = []
@@ -669,7 +716,8 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
             # a. Synthesize this window (pass original flat circuit)
             window_partitions = self.SynthesizeWideCircuit(
                 circ, orig_parameters,
-                pi_init=pi, window_gate_indices=window_gate_indices
+                pi_init=pi if not is_first_window else None,
+                window_gate_indices=window_gate_indices
             )
 
             # Skip empty windows
@@ -691,12 +739,16 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
             scoring_partitions = self._build_scoring_partitions(window_partitions)
 
             # d. Heuristic search for this window (pi carries forward)
-            partition_order, pi, _ = self.Heuristic_Search(
+            partition_order, pi, window_pi_initial = self.Heuristic_Search(
                 F, pi.copy(), DAG, IDAG,
                 window_partitions, scoring_partitions, D,
-                virtual_E=virtual_E if virtual_E else None
+                virtual_E=virtual_E if virtual_E else None,
+                free_initial_routing=is_first_window,
             )
 
+            if is_first_window:
+                pi_initial = window_pi_initial
+
             # e. Construct window circuit
             window_circuit, window_params = self.Construct_circuit_from_HS(
                 partition_order, window_partitions, N
@@ -716,25 +768,36 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         else:
             final_parameters = np.array([])
 
+        if pi_initial is None:
+            pi_initial = np.arange(N)
+
         return final_circuit, final_parameters, pi_initial, pi
 
     # ------------------------------------------------------------------------
     # Heuristic Search
     # ------------------------------------------------------------------------
 
-    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, virtual_E=None):
+    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, virtual_E=None, free_initial_routing=False):
         pi_initial = pi.copy()
 
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
+        first_routing_done = not free_initial_routing
+        buffered_single_qubit = []
+
         for partition_idx in list(F):
             if isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
                 F.remove(partition_idx)
                 single_qubit_part = optimized_partitions[partition_idx]
-                qubit = single_qubit_part.circuit.get_Qbits()[0]
-                single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
-                partition_order.append(single_qubit_part)
+
+                if free_initial_routing and not first_routing_done:
+                    # Buffer — will remap after pi_initial is determined
+                    buffered_single_qubit.append(single_qubit_part)
+                else:
+                    qubit = single_qubit_part.circuit.get_Qbits()[0]
+                    single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
+                    partition_order.append(single_qubit_part)
 
                 resolved_partitions[partition_idx] = True
                 children = list(DAG[partition_idx])
@@ -776,10 +839,11 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 if len(partition_candidates) == 0:
                     break
                 F_snapshot = tuple(F)
+                use_free_routing = not first_routing_done
                 if executor is not None:
                     pi_snapshot = tuple(int(x) for x in pi)
                     payloads = [
-                        (partition_candidate, F_snapshot, pi_snapshot)
+                        (partition_candidate, F_snapshot, pi_snapshot, use_free_routing)
                         for partition_candidate in partition_candidates
                     ]
                     scores = list(executor.map(_score_candidate_worker, payloads))
@@ -793,23 +857,37 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             D,
                             self._swap_cache,
                             virtual_E,
+                            free_routing=use_free_routing,
                         )
                         for partition_candidate in partition_candidates
                     ]
                 min_idx = np.argmin(scores)
                 min_partition_candidate = partition_candidates[min_idx]
-                
+
                 F.remove(min_partition_candidate.partition_idx)
                 resolved_partitions[min_partition_candidate.partition_idx] = True
                 resolved_count = sum(resolved_partitions)
                 pbar.n = resolved_count
                 pbar.refresh()
-                pi_prev = pi # Save previous pi state for filtering
-                swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
-                if len(swap_order)!=0:
-                    partition_order.append(construct_swap_circuit(swap_order, len(pi)))
-                
-                
+
+                if not first_routing_done:
+                    # Derive pi_initial from chosen candidate — no SWAPs needed
+                    pi_initial, pi = self._compute_ideal_pi_for_candidate(
+                        min_partition_candidate, len(pi)
+                    )
+                    first_routing_done = True
+
+                    # Remap and insert buffered single-qubit partitions
+                    for sq_part in buffered_single_qubit:
+                        qubit = sq_part.circuit.get_Qbits()[0]
+                        sq_part.circuit.Remap_Qbits({int(qubit): int(pi_initial[qubit])}, max(D.shape))
+                        partition_order.append(sq_part)
+                    buffered_single_qubit = []
+                else:
+                    swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
+                    if len(swap_order)!=0:
+                        partition_order.append(construct_swap_circuit(swap_order, len(pi)))
+
                 partition_order.append(min_partition_candidate)
                 children = list(DAG[min_partition_candidate.partition_idx])
                 step += 1
@@ -834,6 +912,14 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         finally:
             if executor is not None:
                 executor.shutdown()
+
+        # If no multi-qubit partition was resolved, flush buffered single-qubit parts
+        if buffered_single_qubit:
+            for sq_part in buffered_single_qubit:
+                qubit = sq_part.circuit.get_Qbits()[0]
+                sq_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])}, max(D.shape))
+                partition_order.append(sq_part)
+
         pbar.close()
         return partition_order, pi, pi_initial
 
@@ -874,11 +960,12 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache, virtual_E=None):
+    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache, virtual_E=None, free_routing=False):
         score_F = 0
         swap_weight = 4
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
-        score_F += swap_weight * len(swaps) * 3
+        if not free_routing:
+            score_F += swap_weight * len(swaps) * 3
         score_F += len(partition_candidate.circuit_structure)
 
         for partition_idx in F:
@@ -887,13 +974,16 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 continue
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                dist_placeholder = swap_weight * 3 * calculate_dist_small(mini_topology, partition.qubit_map, D, output_perm)
+                if not free_routing:
+                    dist_placeholder = swap_weight * 3 * calculate_dist_small(mini_topology, partition.qubit_map, D, output_perm)
+                else:
+                    dist_placeholder = 0
                 circuit_length = np.min([len(circ) for circ in partition.circuit_structures[tdx]])
                 mini_scores.append(dist_placeholder + circuit_length)
             if mini_scores:
                 score_F += np.min(mini_scores)
 
-        # Virtual outgoing gate penalty: cross-window look-ahead
+        # Virtual outgoing gate penalty: cross-window look-ahead (always active)
         virtual_e_score = 0.0
         if virtual_E:
             for (q_a, q_b) in virtual_E:

From 5427fbb1f3f8068f3b124e81e87e28f810c2c5f6 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 15:33:51 +0100
Subject: [PATCH 079/232] Add Basin-hopping and OSR to PartAM

---
 squander/synthesis/PartAM.py | 52 ++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 17 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index efd0630b8..6e44380f7 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -125,10 +125,16 @@ def __init__(self, config):
         self.config.setdefault('topology', None)
         self.config.setdefault('routed', False)
         self.config.setdefault('partition_strategy','ilp')
-        self.config.setdefault('optimizer', 'BFGS')
+        self.config.setdefault('optimizer', 'BFGS2')
+        self.config.setdefault('use_basin_hopping', 1)
+        self.config.setdefault('bh_T', 1.0)
+        self.config.setdefault('bh_stepsize', 0.5)
+        self.config.setdefault('bh_interval', 50)
+        self.config.setdefault('bh_target_accept_rate', 0.5)
+        self.config.setdefault('bh_stepwise_factor', 0.9)
         self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
         self.config.setdefault('window_size', 0)  # 0 = full circuit (backward compat)
-        self.config.setdefault('use_osr',0)
+        self.config.setdefault('use_osr',1)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -422,24 +428,36 @@ def DecomposePartition_Full(Partition_circuit: Circuit, Partition_parameters: np
         return result
 
     @staticmethod
-    def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None) -> Circuit:
+    def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None, max_retries: int = 5) -> Circuit:
         """
-        Call to decompose a partition
+        Call to decompose a partition. Retries up to max_retries times if the
+        decomposition error exceeds the configured tolerance.
         """
+        tolerance = config["tolerance"]
         strategy = config["strategy"]
-        if strategy == "TreeSearch":
-            cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
-        elif strategy == "TabuSearch":
-            cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
-        elif strategy == "Adaptive":
-            cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
-        else:
-            raise Exception(f"Unsupported decomposition type: {strategy}")
-        cDecompose.set_Verbose( config["verbosity"] )
-        cDecompose.set_Cost_Function_Variant( 3 )    
-        cDecompose.set_Optimization_Tolerance( config["tolerance"] )
-        cDecompose.set_Optimizer( config["optimizer"] )
-        cDecompose.Start_Decomposition()
+
+        for attempt in range(max_retries):
+            if strategy == "TreeSearch":
+                cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+            elif strategy == "TabuSearch":
+                cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
+            elif strategy == "Adaptive":
+                cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
+            else:
+                raise Exception(f"Unsupported decomposition type: {strategy}")
+            cDecompose.set_Verbose( config["verbosity"] )
+            cDecompose.set_Cost_Function_Variant( 3 )
+            cDecompose.set_Optimization_Tolerance( tolerance )
+            cDecompose.set_Optimizer( config["optimizer"] )
+            cDecompose.Start_Decomposition()
+
+            err = cDecompose.get_Decomposition_Error()
+            if err <= tolerance:
+                break
+
+            if attempt >= max_retries - 1:
+                break
+
         squander_circuit = cDecompose.get_Circuit()
         parameters       = cDecompose.get_Optimized_Parameters()
         return squander_circuit, parameters

From 7bb8b9e566789addc9594132e7dc6411b9508b09 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 27 Feb 2026 21:38:00 +0100
Subject: [PATCH 080/232] fix bug

---
 squander/synthesis/PartAM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6e44380f7..f3f0cf1bb 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -966,7 +966,7 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
                 partition_count += 1
         
         if final_parameters:
-            final_parameters = np.concatenate(final_parameters,axis=0)
+            final_parameters = np.concatenate([np.atleast_1d(p).ravel() for p in final_parameters], axis=0)
         else:
             final_parameters = np.array([])
         if not check_circuit_compatibility(final_circuit,self.topology):

From cffc8c6e75db92b900fd498b2d387cdd1aa2616a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 1 Mar 2026 00:20:43 +0100
Subject: [PATCH 081/232] add interpartition scores

---
 examples/decomposition/PartAM_example.py |  11 +-
 squander/partitioning/ilp.py             |  35 ++-
 squander/synthesis/PartAM.py             | 272 +++++++++++++++-----
 squander/synthesis/PartAM_utils.py       | 307 ++++++++++++++++++-----
 4 files changed, 480 insertions(+), 145 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 315bb7164..42d9affb5 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -56,9 +56,9 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
 
 if __name__ == '__main__':
 
-    filename = "benchmarks/qfast/5q/vqe.qasm"
+    filename = "bv_n14.qasm"
     circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
-    topology = [(0, 1), (0, 2), (0, 3), (0, 4)]
+    topology = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12),(12, 13)]
 
     # ================================================================
     # Full-circuit mode (default, window_size=0)
@@ -71,7 +71,7 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
         'strategy': "TreeSearch",
         'test_subcircuits': True,
         'test_final_circuit': True,
-        'max_partition_size': 3,
+        'max_partition_size': 4,
         'progressbar': True,
         'topology': topology,
     }
@@ -101,10 +101,10 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
         'strategy': "TreeSearch",
         'test_subcircuits': True,
         'test_final_circuit': True,
-        'max_partition_size': 3,
+        'max_partition_size': 4,
         'progressbar': True,
         'topology': topology,
-        'window_size': 5,
+        'window_size': 7,
     }
 
     start_time = time.time()
@@ -133,3 +133,4 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
     print(f"{'='*70}\n")
 
 
+
diff --git a/squander/partitioning/ilp.py b/squander/partitioning/ilp.py
index 422090c9e..22898ce1f 100644
--- a/squander/partitioning/ilp.py
+++ b/squander/partitioning/ilp.py
@@ -538,7 +538,7 @@ def sol_to_badsccs(g, allparts, L):
     _, scc = scc_tarjan_iterative(G_part)
     return {frozenset(v) for v in scc if len(v) > 1}
 
-def ilp_global_optimal(allparts, g, weighted_info=None, gurobi_direct=False, use_order=False, weights=None):
+def ilp_global_optimal(allparts, g, weighted_info=None, gurobi_direct=False, use_order=False, weights=None, transition_costs=None):
     """
     Select an optimal set of non-overlapping parts via ILP/MIP with cycle cuts.
 
@@ -578,8 +578,19 @@ def fortet_inequalities(x, y, z): #-z-x<=0 -z+x+y<=1 z-x<=0 z+x-y<=1
                 m.setParam(GRB.Param.LazyConstraints, 1)
                 x = m.addVars(range(N), lb=[0]*N, ub=[1]*N, vtype=[GRB.BINARY]*N, name=["x_" + str(i) for i in range(N)])
                 for i in g: m.addConstr(gp.quicksum(x[j] for j in gate_to_parts[i]) == 1)
-                if weights is not None: m.setObjective(gp.quicksum((weights[i]*N+1) * x[i] for i in range(N)), GRB.MINIMIZE)
-                elif weighted_info is None: m.setObjective(gp.quicksum(x[i] for i in range(N)), GRB.MINIMIZE)
+                transition_obj_gurobi = 0
+                if transition_costs:
+                    y_vars_g = {}
+                    for (i, j), cost in transition_costs.items():
+                        y_var = m.addVar(vtype=GRB.BINARY, name=f"y_{i}_{j}")
+                        m.update()
+                        m.addConstr(y_var >= x[i] + x[j] - 1)
+                        m.addConstr(y_var <= x[i])
+                        m.addConstr(y_var <= x[j])
+                        y_vars_g[(i, j)] = (y_var, cost)
+                    transition_obj_gurobi = gp.quicksum(cost * yv for yv, cost in y_vars_g.values())
+                if weights is not None: m.setObjective(gp.quicksum((weights[i]*N+1) * x[i] for i in range(N)) + transition_obj_gurobi, GRB.MINIMIZE)
+                elif weighted_info is None: m.setObjective(gp.quicksum(x[i] for i in range(N)) + transition_obj_gurobi, GRB.MINIMIZE)
                 else:
                     Npre, Npost, Nprepost = len(single_qubit_chains_pre), len(single_qubit_chains_post), len(single_qubit_chains_prepost)
                     pre = m.addVars(list(single_qubit_chains_pre), lb=[0]*Npre, ub=[1]*Npre, vtype=[GRB.BINARY]*Npre, name=["pre_" + str(i) for i in single_qubit_chains_pre])
@@ -658,7 +669,7 @@ def fortet_inequalities(x, y, z): #-z-x<=0 -z+x+y<=1 z-x<=0 z+x-y<=1
                     for s in post:                        
                         if not single_qubit_chains_post[s][0] in noprepost:
                             S.append((1-post[s])*(2**max_qubits_per_partition * (2 * (4 + 2) + 2)))
-                    m.setObjective(gp.quicksum(S)*N+gp.quicksum(x[i] for i in range(N)), GRB.MINIMIZE)
+                    m.setObjective(gp.quicksum(S)*N+gp.quicksum(x[i] for i in range(N)) + transition_obj_gurobi, GRB.MINIMIZE)
                 def cb(m, where):
                     if where == GRB.Callback.MIPSOL:
                         x_val = m.cbGetSolution([x[i] for i in range(N)])
@@ -683,8 +694,18 @@ def cb(m, where):
     #print(all_cycles_from_dag_edges(succ))
     #for u, v in two_cycles_from_dag_edges(g, gate_to_parts, allparts):
     #    prob += x[u] + x[v] <= 1 #constraint that no two cycles are included
-    if weights is not None: prob.setObjective(pulp.lpSum((weights[i]*N+1) * x[i] for i in range(N)))
-    elif weighted_info is None: prob.setObjective(pulp.lpSum(x[i] for i in range(N)))
+    transition_obj = 0
+    if transition_costs:
+        y_vars = {}
+        for (i, j), cost in transition_costs.items():
+            y_var = pulp.LpVariable(f"y_{i}_{j}", cat="Binary")
+            prob += y_var >= x[i] + x[j] - 1
+            prob += y_var <= x[i]
+            prob += y_var <= x[j]
+            y_vars[(i, j)] = y_var
+        transition_obj = pulp.lpSum(cost * y_vars[(i, j)] for (i, j), cost in transition_costs.items())
+    if weights is not None: prob.setObjective(pulp.lpSum((weights[i]*N+1) * x[i] for i in range(N)) + transition_obj)
+    elif weighted_info is None: prob.setObjective(pulp.lpSum(x[i] for i in range(N)) + transition_obj)
     else:
         Npre, Npost, Nprepost = len(single_qubit_chains_pre), len(single_qubit_chains_post), len(single_qubit_chains_prepost)
         pre = pulp.LpVariable.dicts("pre", list(single_qubit_chains_pre), cat="Binary")
@@ -760,7 +781,7 @@ def cb(m, where):
         for s in post:                        
             if not single_qubit_chains_post[s][0] in noprepost:
                 S.append((1-post[s])*(2**max_qubits_per_partition * (2 * (4 + 2) + 2)))
-        prob.setObjective(pulp.lpSum(S)*N+pulp.lpSum(x[i] for i in range(N)))
+        prob.setObjective(pulp.lpSum(S)*N+pulp.lpSum(x[i] for i in range(N)) + transition_obj)
     while True:
         from gurobipy import GRB
         import gurobipy as gp
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index f3f0cf1bb..86d47ef82 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -33,6 +33,7 @@
     get_subtopologies_of_type,
     get_unique_subtopologies,
     get_canonical_form,
+    get_node_mapping,
     SingleQubitPartitionResult,
     PartitionSynthesisResult,
     PartitionCandidate,
@@ -134,7 +135,7 @@ def __init__(self, config):
         self.config.setdefault('bh_stepwise_factor', 0.9)
         self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
         self.config.setdefault('window_size', 0)  # 0 = full circuit (backward compat)
-        self.config.setdefault('use_osr',1)
+        self.config.setdefault('use_osr',0)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -152,11 +153,15 @@ def compute_routing_aware_weight(self, result, pi_init, D, E, dag_start=0.0, dag
         """
         Compute a routing-aware ILP weight for a partition synthesis result.
 
-        Evaluates each (topology, permutation) combination together and uses
+        For each (topology, P_I, P_O, node_mapping) combination:
+        - Routing cost is computed using the specific P_I and node_mapping, so the
+          cost reflects where each partition qubit actually needs to go.
+        - E penalty is computed on the output layout after applying P_O, so it
+          correctly penalises layouts that leave future gates far from each other.
+
         DAG-position-dependent weighting:
         - Early partitions (dag_start~0): routing cost weighted higher
         - Late partitions (dag_end~1): E penalty weighted higher
-        - Partitions spanning the full window get both weights high
 
         Args:
             result: PartitionSynthesisResult or SingleQubitPartitionResult
@@ -173,40 +178,161 @@ def compute_routing_aware_weight(self, result, pi_init, D, E, dag_start=0.0, dag
         if isinstance(result, SingleQubitPartitionResult):
             return 0
 
-        # Position-dependent weighting
         routing_weight = 1.0 - dag_start
         e_weight = dag_end
 
-        # E penalty
-        involved = set(result.involved_qbits)
-        e_penalty = 0.0
-        if E:
-            for (q_a, q_b) in E:
-                if q_a in involved or q_b in involved:
-                    if pi_init is not None:
-                        dist = D[pi_init[q_a]][pi_init[q_b]]
-                        if not np.isinf(dist):
-                            e_penalty += max(0, (dist - 1)) * 3
-                    else:
-                        # No layout yet — fixed penalty per touching E edge
-                        e_penalty += 3.0
+        N = len(D)
+        k = result.N
+        qbit_map_inv = {v: q for q, v in result.qubit_map.items()}  # q* → circuit qubit q
 
-        # Evaluate each (topology, permutation) combination together
         best_score = np.inf
+
         for tdx, mini_topology in enumerate(result.mini_topologies):
-            if pi_init is not None:
-                routing_cost = calculate_dist_small(mini_topology, result.qubit_map, D, pi_init)
-            else:
-                routing_cost = 0
-            for pdx in range(len(result.cnot_counts[tdx])):
+            topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
+            if not topology_candidates:
+                continue
+
+            # Precompute node_mappings (Q* → Q) once per topology candidate — independent of pdx
+            node_mappings = [get_node_mapping(mini_topology, tc) for tc in topology_candidates]
+            node_mappings = [nm for nm in node_mappings if nm]
+            if not node_mappings:
+                continue
+
+            for pdx, (P_i, P_o) in enumerate(result.permutations_pairs[tdx]):
                 cnot_count = result.cnot_counts[tdx][pdx]
-                score = cnot_count + routing_weight * routing_cost
-                best_score = min(best_score, score)
+                P_i_list = list(P_i)
+                P_o_list = list(P_o)
+                P_i_inv = [P_i_list.index(i) for i in range(k)]
+
+                for node_mapping in node_mappings:
+                    # --- Routing cost: bring each partition qubit from pi_init to its
+                    # target physical position determined by P_I and node_mapping ---
+                    routing_cost = 0
+                    if pi_init is not None:
+                        for q_star, q in qbit_map_inv.items():
+                            target_Q = node_mapping[P_i_inv[q_star]]
+                            dist = D[int(pi_init[q])][target_Q]
+                            if not np.isinf(dist):
+                                routing_cost += max(0, dist - 1) * 3
+
+                    # --- Output layout: start from pi_init then apply P_O ---
+                    if pi_init is not None:
+                        pi_out = [int(x) for x in pi_init]
+                    else:
+                        pi_out = list(range(N))
+                    for q_star in range(len(P_o_list)):
+                        if q_star in qbit_map_inv:
+                            q = qbit_map_inv[q_star]
+                            pi_out[q] = node_mapping[P_o_list[q_star]]
+
+                    # --- E penalty computed on the output layout after P_O ---
+                    e_penalty = 0.0
+                    if E:
+                        involved = set(result.involved_qbits)
+                        for (q_a, q_b) in E:
+                            if q_a in involved or q_b in involved:
+                                dist = D[pi_out[q_a]][pi_out[q_b]]
+                                if not np.isinf(dist):
+                                    e_penalty += max(0, (dist - 1)) * 3
+                                else:
+                                    e_penalty += 3.0
+
+                    score = cnot_count + routing_weight * routing_cost + e_weight * e_penalty
+                    best_score = min(best_score, score)
 
         if np.isinf(best_score):
             best_score = 0
 
-        return best_score + e_weight * e_penalty
+        return best_score
+
+    def compute_transition_cost(self, result_pred, result_succ, pi_init, D):
+        """
+        Compute the minimum transition cost between two partitions over all
+        (topology, P_o, node_mapping) configs of pred and (topology, P_i, node_mapping)
+        configs of succ.
+
+        The cost measures how far each qubit involved in succ needs to travel
+        from its position after pred's output to where succ's input requires it.
+
+        Args:
+            result_pred: PartitionSynthesisResult for the predecessor partition
+            result_succ: PartitionSynthesisResult for the successor partition
+            pi_init: Current qubit layout (logical -> physical), or None
+            D: Distance matrix between physical qubits
+
+        Returns:
+            float: Minimum transition cost (lower is better)
+        """
+        if isinstance(result_pred, SingleQubitPartitionResult) or isinstance(result_succ, SingleQubitPartitionResult):
+            return 0
+
+        N = len(D)
+        involved_pred = set(result_pred.involved_qbits)
+        involved_succ = set(result_succ.involved_qbits)
+        qmap_pred_inv = {v: k for k, v in result_pred.qubit_map.items()}  # q* -> q
+        qmap_succ_inv = {v: k for k, v in result_succ.qubit_map.items()}  # q* -> q
+        k_succ = result_succ.N
+
+        # Precompute all output positions for pred: list of dicts {q: physical_pos}
+        pred_outputs = []
+        for tdx, mini_topo in enumerate(result_pred.mini_topologies):
+            topo_candidates = self._get_subtopologies_of_type_cached(mini_topo)
+            if not topo_candidates:
+                continue
+            node_mappings = [get_node_mapping(mini_topo, tc) for tc in topo_candidates]
+            node_mappings = [nm for nm in node_mappings if nm]
+            if not node_mappings:
+                continue
+            for pdx, (_, P_o) in enumerate(result_pred.permutations_pairs[tdx]):
+                P_o_list = list(P_o)
+                for nm in node_mappings:
+                    out_pos = {}
+                    for q_star, q in qmap_pred_inv.items():
+                        out_pos[q] = nm[P_o_list[q_star]]
+                    pred_outputs.append(out_pos)
+
+        # Precompute all input target positions for succ: list of dicts {q: physical_pos}
+        succ_inputs = []
+        for tdx, mini_topo in enumerate(result_succ.mini_topologies):
+            topo_candidates = self._get_subtopologies_of_type_cached(mini_topo)
+            if not topo_candidates:
+                continue
+            node_mappings = [get_node_mapping(mini_topo, tc) for tc in topo_candidates]
+            node_mappings = [nm for nm in node_mappings if nm]
+            if not node_mappings:
+                continue
+            for pdx, (P_i, _) in enumerate(result_succ.permutations_pairs[tdx]):
+                P_i_list = list(P_i)
+                P_i_inv = [P_i_list.index(i) for i in range(k_succ)]
+                for nm in node_mappings:
+                    in_pos = {}
+                    for q_star, q in qmap_succ_inv.items():
+                        in_pos[q] = nm[P_i_inv[q_star]]
+                    succ_inputs.append(in_pos)
+
+        if not pred_outputs or not succ_inputs:
+            return 0
+
+        # Find minimum transition cost over all (pred_output, succ_input) pairs
+        best_cost = np.inf
+        for out_pos in pred_outputs:
+            for in_pos in succ_inputs:
+                cost = 0
+                for q, target in in_pos.items():
+                    if q in out_pos:
+                        current = out_pos[q]
+                    elif pi_init is not None:
+                        current = int(pi_init[q])
+                    else:
+                        current = q
+                    dist = D[current][target]
+                    if not np.isinf(dist):
+                        cost += max(0, dist - 1) * 3
+                    if cost >= best_cost:
+                        break
+                best_cost = min(best_cost, cost)
+
+        return best_cost if not np.isinf(best_cost) else 0
 
     # ------------------------------------------------------------------------
     # Caching Methods
@@ -627,7 +753,43 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
                 self.compute_routing_aware_weight(result, pi_init, D, E, dag_start, dag_end)
             )
 
-        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+        # ---- Phase 3b: Compute inter-partition transition costs ----
+        transition_weight = self.config.setdefault('transition_weight', 1.0)
+        transition_costs = {}
+        if transition_weight > 0:
+            gate_to_part = {}
+            for idx, part in enumerate(allparts):
+                for gate in part:
+                    gate_to_part.setdefault(gate, []).append(idx)
+
+            # Build directed DAG-neighbor partition pairs (pred -> succ)
+            directed_neighbors = set()
+            for gate_u, successors in g.items():
+                for part_u in gate_to_part.get(gate_u, []):
+                    for gate_v in successors:
+                        for part_v in gate_to_part.get(gate_v, []):
+                            if part_u != part_v:
+                                directed_neighbors.add((part_u, part_v))
+
+            # Compute transition cost for each directed pair, keyed by (min, max)
+            seen_pairs = set()
+            for pred_idx, succ_idx in directed_neighbors:
+                pair_key = (min(pred_idx, succ_idx), max(pred_idx, succ_idx))
+                if pair_key in seen_pairs:
+                    continue
+                seen_pairs.add(pair_key)
+                if allparts[pred_idx] & allparts[succ_idx]:
+                    continue
+                result_pred = optimized_results[pred_idx]
+                result_succ = optimized_results[succ_idx]
+                # Compute both directions and take the minimum
+                cost_fwd = self.compute_transition_cost(result_pred, result_succ, pi_init, D)
+                cost_rev = self.compute_transition_cost(result_succ, result_pred, pi_init, D)
+                cost = min(cost_fwd, cost_rev)
+                if cost > 0:
+                    transition_costs[pair_key] = cost * transition_weight
+
+        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights, transition_costs=transition_costs)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
         L = topo_sort_partitions(working_circ, self.config["max_partition_size"], parts)
         from squander.partitioning.kahn import kahn_partition_preparts
@@ -834,39 +996,13 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                    bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved',
                    disable=self.config.get('progressbar', 0) == False)
 
-        configured_workers = self.config.get('hs_score_workers', os.cpu_count() or 1)
-        score_workers = max(1, configured_workers if configured_workers else 1)
-        executor: Optional[ProcessPoolExecutor] = None
-        if score_workers > 1:
-            try:
-                executor = ProcessPoolExecutor(
-                    max_workers=score_workers,
-                    initializer=_init_scoring_worker,
-                    initargs=(scoring_partitions, D, virtual_E),
-                )
-            except Exception as exc:
-                logging.warning(
-                    "Falling back to sequential heuristic scoring: %s",
-                    exc,
-                )
-                executor = None
-
-        try:
-            while len(F) != 0:
+        while len(F) != 0:
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
                     break
                 F_snapshot = tuple(F)
                 use_free_routing = not first_routing_done
-                if executor is not None:
-                    pi_snapshot = tuple(int(x) for x in pi)
-                    payloads = [
-                        (partition_candidate, F_snapshot, pi_snapshot, use_free_routing)
-                        for partition_candidate in partition_candidates
-                    ]
-                    scores = list(executor.map(_score_candidate_worker, payloads))
-                else:
-                    scores = [
+                scores = [
                         self.score_partition_candidate(
                             partition_candidate,
                             F_snapshot,
@@ -927,9 +1063,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             children.extend(DAG[child])
                         else:
                             F.append(child)
-        finally:
-            if executor is not None:
-                executor.shutdown()
 
         # If no multi-qubit partition was resolved, flush buffered single-qubit parts
         if buffered_single_qubit:
@@ -980,24 +1113,29 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache, virtual_E=None, free_routing=False):
         score_F = 0
-        swap_weight = 4
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
+        swap_weight = 1
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, free_routing=free_routing)
         if not free_routing:
             score_F += swap_weight * len(swaps) * 3
-        score_F += len(partition_candidate.circuit_structure)
+        score_F += 0.1*len(partition_candidate.circuit_structure)
 
         for partition_idx in F:
             partition = scoring_partitions[partition_idx]
             if partition is None or partition_idx == partition_candidate.partition_idx:
                 continue
+            qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}  # q* → circuit qubit
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                if not free_routing:
-                    dist_placeholder = swap_weight * 3 * calculate_dist_small(mini_topology, partition.qubit_map, D, output_perm)
-                else:
-                    dist_placeholder = 0
-                circuit_length = np.min([len(circ) for circ in partition.circuit_structures[tdx]])
-                mini_scores.append(dist_placeholder + circuit_length)
+                for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
+                    cnot_count = len(partition.circuit_structures[tdx][pdx])
+                    if mini_topology:
+                        routing_cost = swap_weight * 3 * sum(
+                            max(0, D[int(output_perm[qbit_map_inv[P_i[u]]])][int(output_perm[qbit_map_inv[P_i[v]]])] - 1)
+                            for u, v in mini_topology
+                        )
+                    else:
+                        routing_cost = 0
+                    mini_scores.append(routing_cost + cnot_count)
             if mini_scores:
                 score_F += np.min(mini_scores)
 
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index d523aefb9..54ff2053a 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -1,6 +1,6 @@
 import numpy as np
 from typing import List, Tuple, Set, FrozenSet
-from itertools import permutations
+from itertools import permutations, combinations
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
 import heapq
 import math
@@ -114,7 +114,121 @@ def heuristic(state):
     return None, None  # No solution found
 
 def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
-    return find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix)
+    """
+    Route partition qubits to their target physical positions using A* over
+    the k-dimensional state space of partition qubit positions only.
+
+    For k partition qubits on an n-node topology the state space has at most
+    n^k entries (n*(n-1)*...*(n-k+1) distinct states).  For the typical case
+    of k=2 or k=3 and n≤20 this is tiny (≤2744 states) so the search
+    completes in microseconds while still finding an optimal SWAP sequence.
+
+    The original full-state A* had O(n!) state space which was exponentially
+    slow.  The naive greedy replacement oscillated when two adjacent partition
+    qubits needed to move in the same direction.  This implementation avoids
+    both problems.
+
+    Args:
+        pi_A        : List[int], pi_A[q] = current physical position of virtual qubit q.
+        pi_B_dict   : Dict {q: target_physical} for the qubits that need routing.
+        dist_matrix : n×n distance/cost matrix; dist[i][j]==1 means i and j are adjacent.
+
+    Returns:
+        swaps            : List of (P1, P2) adjacent-qubit SWAP operations (optimal).
+        final_permutation: Updated virtual→physical mapping after all SWAPs.
+    """
+    n = len(pi_A)
+
+    # Build adjacency list from dist_matrix
+    adj = [[] for _ in range(n)]
+    for i in range(n):
+        for j in range(i + 1, n):
+            if dist_matrix[i][j] == 1:
+                adj[i].append(j)
+                adj[j].append(i)
+
+    partition_qubits = sorted(pi_B_dict.keys())
+    k = len(partition_qubits)
+
+    initial_positions = tuple(int(pi_A[q]) for q in partition_qubits)
+    target_positions  = tuple(int(pi_B_dict[q]) for q in partition_qubits)
+
+    if initial_positions == target_positions:
+        return [], list(pi_A)
+
+    def heuristic(positions):
+        # Admissible lower bound: sum of individual distances / 2
+        return sum(dist_matrix[positions[i]][target_positions[i]] for i in range(k)) / 2
+
+    # A* over k-dimensional state space.
+    # Each state is a tuple of physical positions, one per partition qubit.
+    # Paths are reconstructed via a parent-pointer dict to avoid copying lists
+    # on every heap push (which would be O(depth²) total).
+    counter = 0  # tiebreak counter so tuples never compare paths
+    parent = {}  # state → (parent_state, swap) for path reconstruction
+    parent[initial_positions] = None
+
+    heap = []
+    heapq.heappush(heap, (heuristic(initial_positions), 0, counter, initial_positions))
+    visited = {initial_positions: 0}
+
+    while heap:
+        f, g, _, positions = heapq.heappop(heap)
+
+        if positions == target_positions:
+            # Reconstruct swap path via parent pointers
+            path = []
+            state = positions
+            while parent[state] is not None:
+                prev_state, swap = parent[state]
+                path.append(swap)
+                state = prev_state
+            path.reverse()
+
+            # Replay swaps on the full mapping to get final virt→phys
+            final_v2p = list(pi_A)
+            final_p2v = [0] * n
+            for q_idx in range(n):
+                final_p2v[int(final_v2p[q_idx])] = q_idx
+            for P1, P2 in path:
+                q1, q2 = final_p2v[P1], final_p2v[P2]
+                final_p2v[P1], final_p2v[P2] = q2, q1
+                final_v2p[q1], final_v2p[q2] = P2, P1
+            return path, final_v2p
+
+        if visited.get(positions, float('inf')) < g:
+            continue
+
+        # Quick lookup: physical position → index within partition_qubits list
+        pos_to_k_idx = {p: i for i, p in enumerate(positions)}
+
+        # Expand: try every SWAP that moves at least one partition qubit
+        for i, p in enumerate(positions):
+            for nb in adj[p]:
+                new_positions = list(positions)
+                new_positions[i] = nb
+                # If the neighbor also holds a partition qubit, swap it too
+                if nb in pos_to_k_idx:
+                    j = pos_to_k_idx[nb]
+                    new_positions[j] = p
+                new_positions = tuple(new_positions)
+
+                new_g = g + 1
+                if visited.get(new_positions, float('inf')) <= new_g:
+                    continue
+
+                visited[new_positions] = new_g
+                swap_key = (min(p, nb), max(p, nb))
+                parent[new_positions] = (positions, swap_key)
+                counter += 1
+                heapq.heappush(heap, (new_g + heuristic(new_positions), new_g,
+                                      counter, new_positions))
+
+    logging.warning(
+        "find_constrained_swaps_partial: failed to route %s → %s",
+        initial_positions, target_positions,
+    )
+    return [], list(pi_A)
 
 def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
     P_i_inv = [P_i.index(i) for i in range(len(P_i))]  # Compute inverse
@@ -143,39 +257,27 @@ def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
 # Topology Utilities
 # ============================================================================
 
-def _build_adj_list(edges: List[Tuple[int, int]]) -> dict:
-    adj_list = {}
-    for u, v in edges:
-        if u not in adj_list:
-            adj_list[u] = set()
-        if v not in adj_list:
-            adj_list[v] = set()
-        adj_list[u].add(v)
-        adj_list[v].add(u)
-    return adj_list
-
 def _get_induced_edges(edges: List[Tuple[int, int]], qubit_subset: Set[int]) -> List[Tuple[int, int]]:
     return [edge for edge in edges if edge[0] in qubit_subset and edge[1] in qubit_subset]
 
-def _dfs_enumerate(adj_list: dict, k: int, callback):
-    all_qubits = sorted(adj_list.keys())
-    seen = set()
-    def dfs(current_qubits: Set[int], candidates: Set[int]):
-        if len(current_qubits) == k:
-            frozen = frozenset(current_qubits)
-            if frozen not in seen:
-                seen.add(frozen)
-                callback(current_qubits)
-            return
-        if len(current_qubits) + len(candidates) < k:
-            return
-        for node in sorted(candidates):
-            new_qubits = current_qubits | {node}
-            new_candidates = {neighbor for q in new_qubits for neighbor in adj_list[q] 
-                            if neighbor not in new_qubits and neighbor > node}
-            dfs(new_qubits, new_candidates)
-    for start in all_qubits:
-        dfs({start}, {n for n in adj_list[start] if n > start})
+def _is_connected(nodes: Set[int], edges: List[Tuple[int, int]]) -> bool:
+    if len(nodes) <= 1:
+        return True
+    adj = defaultdict(set)
+    for u, v in edges:
+        if u in nodes and v in nodes:
+            adj[u].add(v)
+            adj[v].add(u)
+    start = next(iter(nodes))
+    visited = {start}
+    stack = [start]
+    while stack:
+        node = stack.pop()
+        for neighbor in adj[node]:
+            if neighbor not in visited:
+                visited.add(neighbor)
+                stack.append(neighbor)
+    return visited == nodes
 
 def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
     qubits = sorted(qubit_subset)
@@ -189,21 +291,36 @@ def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, in
     return frozenset(best_edges)
 
 def get_unique_subtopologies(edges: List[Tuple[int, int]], k: int) -> List[List[Tuple[int, int]]]:
+    """Return one representative locally-labeled (0..k-1) edge list per unique k-node
+    connected subgraph isomorphism class found in the graph defined by *edges*."""
     if k <= 0:
         return []
-    adj_list = _build_adj_list(edges)
     if k == 1:
         return [[]]
+    nodes = set()
+    for u, v in edges:
+        nodes.add(u)
+        nodes.add(v)
+    nodes = sorted(nodes)
+    if len(nodes) < k:
+        return []
     canonical_forms = {}
-    def process(qubits):
-        induced = _get_induced_edges(edges, qubits)
-        canonical = get_canonical_form(qubits, induced)
+    for subset in combinations(nodes, k):
+        subset_set = set(subset)
+        induced = _get_induced_edges(edges, subset_set)
+        if not _is_connected(subset_set, induced):
+            continue
+        canonical = get_canonical_form(subset_set, induced)
         if canonical not in canonical_forms:
-            canonical_forms[canonical] = induced
-    _dfs_enumerate(adj_list, k, process)
+            # Store locally-labeled edges (0..k-1) so the decomposer always
+            # receives a valid k-qubit topology regardless of global qubit indices.
+            canonical_forms[canonical] = sorted(canonical)
     return list(canonical_forms.values())
 
 def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]:
+    """Return all connected k-node subgraphs of *edges* that are isomorphic to
+    *target_topology*, each expressed with the original global qubit labels
+    (needed for physical routing decisions)."""
     target_qubits = set()
     for u, v in target_topology:
         target_qubits.add(u)
@@ -211,17 +328,25 @@ def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: Lis
     k = len(target_qubits) if target_qubits else 1
     if k <= 0:
         return []
-    adj_list = _build_adj_list(edges)
+    nodes = set()
+    for u, v in edges:
+        nodes.add(u)
+        nodes.add(v)
     if k == 1:
-        return [[] for _ in adj_list.keys()]
+        return [[] for _ in nodes]
+    nodes = sorted(nodes)
+    if len(nodes) < k:
+        return []
     target_canonical = get_canonical_form(target_qubits, target_topology)
     matches = []
-    def process(qubits):
-        induced = _get_induced_edges(edges, qubits)
-        canonical = get_canonical_form(qubits, induced)
+    for subset in combinations(nodes, k):
+        subset_set = set(subset)
+        induced = _get_induced_edges(edges, subset_set)
+        if not _is_connected(subset_set, induced):
+            continue
+        canonical = get_canonical_form(subset_set, induced)
         if canonical == target_canonical:
-            matches.append(induced)
-    _dfs_enumerate(adj_list, k, process)
+            matches.append(induced)  # global labels retained for routing
     return matches
 
 def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int, int]]) -> dict:
@@ -260,12 +385,33 @@ def extract_subtopology(involved_qbits, qbit_map, config ):
 # Distance & Cost Calculations
 # ============================================================================
 
-def calculate_dist_small(mini_topology, qbit_map, dist_matrix,pi):
-    dist_placeholder = 0
-    qbit_map_inv = { k:v for v,k in qbit_map.items()}
-    for u,v in mini_topology:
-        dist_placeholder += (dist_matrix[pi[qbit_map_inv[u]]][pi[qbit_map_inv[v]]]-1)*3
-    return dist_placeholder
+def calculate_dist_small(mini_topology, qbit_map, dist_matrix, pi):
+    """Estimate the routing cost needed to bring the partition qubits adjacent.
+
+    Minimises over all assignments of circuit qubits to local topology
+    positions so that, e.g., the hub qubit in a 3-qubit star topology is
+    always matched to the physically most central circuit qubit rather than
+    whichever qubit happened to be assigned local label 0 by the ILP.
+
+    Returns sum-of-edge-distances * 3 (three gates per SWAP) for the best
+    qubit-to-position assignment.
+    """
+    if not mini_topology:
+        return 0
+    # Build ordered list: circuit_qubits[j] = circuit qubit at local position j
+    k = len(qbit_map)
+    qbit_map_inv = {local: circ for circ, local in qbit_map.items()}
+    circuit_qubits = [qbit_map_inv[j] for j in range(k)]
+
+    # Try all k! permutations to find the cheapest qubit→position assignment
+    best = float('inf')
+    for perm in permutations(range(k)):
+        cost = 0
+        for u, v in mini_topology:
+            cost += (dist_matrix[pi[circuit_qubits[perm[u]]]][pi[circuit_qubits[perm[v]]]] - 1) * 3
+        if cost < best:
+            best = cost
+    return best
 
 # ============================================================================
 # Data Classes
@@ -410,7 +556,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi(self, pi, D, swap_cache=None):
+    def transform_pi(self, pi, D, swap_cache=None, free_routing=False):
         # Fixed: Use P_i^{-1} instead of P_i for input routing
         # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
         # For Original to see logical qubit q* at partition position q*, we need:
@@ -421,22 +567,35 @@ def transform_pi(self, pi, D, swap_cache=None):
         qbit_map_input = {k : self.node_mapping[P_i_inv[v]] for k,v in self.qbit_map.items()}
         # Convert pi to plain Python list of ints (may contain np.int64)
         pi_list = [int(x) for x in pi]
-        
-        # Check cache if provided
-        cache_key = None
-        if swap_cache is not None:
-            # Create cache key: (pi_tuple, frozenset of qbit_map_input items)
-            pi_tuple = tuple(pi_list)
-            qbit_map_frozen = frozenset(qbit_map_input.items())
-            cache_key = (pi_tuple, qbit_map_frozen)
-            if cache_key in swap_cache:
-                swaps, pi_init = swap_cache[cache_key]
+        n = len(pi_list)
+
+        if free_routing:
+            # Routing is free: build the ideal pi_init that places partition qubits
+            # at their target physical positions with zero SWAPs, then assign the
+            # remaining virtual qubits to the remaining physical positions.
+            used_physical = set(qbit_map_input.values())
+            pi_init = [0] * n
+            for k, target_P in qbit_map_input.items():
+                pi_init[k] = target_P
+            remaining_physical = sorted(p for p in range(n) if p not in used_physical)
+            remaining_logical  = sorted(q for q in range(n) if q not in qbit_map_input)
+            for q, p in zip(remaining_logical, remaining_physical):
+                pi_init[q] = p
+            swaps = []
+        else:
+            # Check cache if provided
+            if swap_cache is not None:
+                pi_tuple = tuple(pi_list)
+                qbit_map_frozen = frozenset(qbit_map_input.items())
+                cache_key = (pi_tuple, qbit_map_frozen)
+                if cache_key in swap_cache:
+                    swaps, pi_init = swap_cache[cache_key]
+                else:
+                    swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+                    swap_cache[cache_key] = (swaps, pi_init)
             else:
                 swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
-                swap_cache[cache_key] = (swaps, pi_init)
-        else:
-            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
-        
+
         pi_output = pi_init.copy()
         # Fixed: P_o should be indexed by partition virtual index q*, not physical index Q*
         # After the circuit, logical qubit k with qbit_map[k] = q* ends up at 
@@ -448,6 +607,22 @@ def transform_pi(self, pi, D, swap_cache=None):
                 pi_output[k] = self.node_mapping[self.P_o[q_star]]
         return swaps, pi_output
     
+    def estimate_swap_count(self, pi, D) -> int:
+        """O(n) lower-bound on the number of SWAPs needed to route this
+        partition's virtual qubits to their target physical positions.
+        Uses the same admissible heuristic as the A* search internaly:
+            floor(sum_of_distances / 2)
+        """
+        P_i_inv = [self.P_i.index(i) for i in range(len(self.P_i))]
+        total = 0.0
+        for k, v in self.qbit_map.items():
+            target_P = self.node_mapping[P_i_inv[v]]
+            current_P = int(pi[k])
+            d = D[current_P][target_P]
+            if not np.isinf(d):
+                total += d
+        return int(total / 2)
+
     def get_final_circuit(self,optimized_partitions,N):
         partition = optimized_partitions[self.partition_idx]
         part_parameters = partition.synthesised_parameters[self.topology_idx][self.permutation_idx]

From bfb4dc2754de27fa344546ddb6cead129396a443 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 1 Mar 2026 00:47:57 +0100
Subject: [PATCH 082/232] Clean up

---
 squander/synthesis/PartAM.py       | 743 +++++------------------------
 squander/synthesis/PartAM_utils.py | 222 ---------
 2 files changed, 130 insertions(+), 835 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 86d47ef82..fa8d1ee43 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -7,7 +7,6 @@
     qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
 )
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
-from concurrent.futures import ProcessPoolExecutor
 from itertools import permutations
 from squander.partitioning.ilp import (
     get_all_partitions,
@@ -19,11 +18,11 @@
 
 import numpy as np
 
-from typing import Callable, Dict, List, Optional, Set, Tuple, FrozenSet
+from typing import Dict, List, Optional, Tuple
 from dataclasses import dataclass
 
 import multiprocessing as mp
-from multiprocessing import Process, Pool
+from multiprocessing import Pool
 import os
 import logging
 from tqdm import tqdm
@@ -39,8 +38,6 @@
     PartitionCandidate,
     check_circuit_compatibility,
     construct_swap_circuit,
-    calculate_dist_small,
-    group_into_two_qubit_blocks,
 )
 
 
@@ -60,49 +57,6 @@ class PartitionScoreData:
     involved_qbits: Tuple[int, ...]
 
 
-# ============================================================================
-# Parallel Processing Setup
-# ============================================================================
-
-_WORKER_SCORING_PARTITIONS: Optional[List[Optional[PartitionScoreData]]] = None
-_WORKER_DISTANCE_MATRIX: Optional[np.ndarray] = None
-_WORKER_SWAP_CACHE: Optional[Dict] = None
-_WORKER_VIRTUAL_E: Optional[List[Tuple[int, int]]] = None
-
-
-def _init_scoring_worker(scoring_partitions, distance_matrix, virtual_E=None):
-    """Initializer for process-based scoring workers."""
-    global _WORKER_SCORING_PARTITIONS, _WORKER_DISTANCE_MATRIX, _WORKER_SWAP_CACHE, _WORKER_VIRTUAL_E
-    _WORKER_SCORING_PARTITIONS = scoring_partitions
-    _WORKER_DISTANCE_MATRIX = distance_matrix
-    _WORKER_SWAP_CACHE = {}
-    _WORKER_VIRTUAL_E = virtual_E
-
-
-def _score_candidate_worker(payload):
-    """
-    Worker wrapper that reconstructs scoring inputs from a lightweight payload.
-    Payload format: (PartitionCandidate, F_snapshot, pi_snapshot[, free_routing])
-    """
-    if (
-        _WORKER_SCORING_PARTITIONS is None
-        or _WORKER_DISTANCE_MATRIX is None
-    ):
-        raise RuntimeError("Scoring worker not initialized with shared data.")
-    partition_candidate, F_snapshot, pi_snapshot = payload[:3]
-    free_routing = payload[3] if len(payload) > 3 else False
-    return qgd_Partition_Aware_Mapping.score_partition_candidate(
-        partition_candidate,
-        F_snapshot,
-        pi_snapshot,
-        _WORKER_SCORING_PARTITIONS,
-        _WORKER_DISTANCE_MATRIX,
-        _WORKER_SWAP_CACHE,
-        _WORKER_VIRTUAL_E,
-        free_routing=free_routing,
-    )
-
-
 # ============================================================================
 # Main Class: qgd_Partition_Aware_Mapping
 # ============================================================================
@@ -134,8 +88,7 @@ def __init__(self, config):
         self.config.setdefault('bh_target_accept_rate', 0.5)
         self.config.setdefault('bh_stepwise_factor', 0.9)
         self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
-        self.config.setdefault('window_size', 0)  # 0 = full circuit (backward compat)
-        self.config.setdefault('use_osr',0)
+        self.config.setdefault('use_osr', 0)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -149,103 +102,7 @@ def __init__(self, config):
     # Scoring Methods
     # ------------------------------------------------------------------------
 
-    def compute_routing_aware_weight(self, result, pi_init, D, E, dag_start=0.0, dag_end=1.0):
-        """
-        Compute a routing-aware ILP weight for a partition synthesis result.
-
-        For each (topology, P_I, P_O, node_mapping) combination:
-        - Routing cost is computed using the specific P_I and node_mapping, so the
-          cost reflects where each partition qubit actually needs to go.
-        - E penalty is computed on the output layout after applying P_O, so it
-          correctly penalises layouts that leave future gates far from each other.
-
-        DAG-position-dependent weighting:
-        - Early partitions (dag_start~0): routing cost weighted higher
-        - Late partitions (dag_end~1): E penalty weighted higher
-
-        Args:
-            result: PartitionSynthesisResult or SingleQubitPartitionResult
-            pi_init: Current qubit layout (logical -> physical mapping).
-                     None when routing is free (first window).
-            D: Distance matrix between physical qubits
-            E: List of (q_a, q_b) tuples for virtual outgoing 2-qubit gates
-            dag_start: Float in [0, 1] — earliest DAG level of the partition
-            dag_end: Float in [0, 1] — latest DAG level of the partition
-
-        Returns:
-            float: Combined weight (lower is better)
-        """
-        if isinstance(result, SingleQubitPartitionResult):
-            return 0
-
-        routing_weight = 1.0 - dag_start
-        e_weight = dag_end
-
-        N = len(D)
-        k = result.N
-        qbit_map_inv = {v: q for q, v in result.qubit_map.items()}  # q* → circuit qubit q
-
-        best_score = np.inf
-
-        for tdx, mini_topology in enumerate(result.mini_topologies):
-            topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
-            if not topology_candidates:
-                continue
-
-            # Precompute node_mappings (Q* → Q) once per topology candidate — independent of pdx
-            node_mappings = [get_node_mapping(mini_topology, tc) for tc in topology_candidates]
-            node_mappings = [nm for nm in node_mappings if nm]
-            if not node_mappings:
-                continue
-
-            for pdx, (P_i, P_o) in enumerate(result.permutations_pairs[tdx]):
-                cnot_count = result.cnot_counts[tdx][pdx]
-                P_i_list = list(P_i)
-                P_o_list = list(P_o)
-                P_i_inv = [P_i_list.index(i) for i in range(k)]
-
-                for node_mapping in node_mappings:
-                    # --- Routing cost: bring each partition qubit from pi_init to its
-                    # target physical position determined by P_I and node_mapping ---
-                    routing_cost = 0
-                    if pi_init is not None:
-                        for q_star, q in qbit_map_inv.items():
-                            target_Q = node_mapping[P_i_inv[q_star]]
-                            dist = D[int(pi_init[q])][target_Q]
-                            if not np.isinf(dist):
-                                routing_cost += max(0, dist - 1) * 3
-
-                    # --- Output layout: start from pi_init then apply P_O ---
-                    if pi_init is not None:
-                        pi_out = [int(x) for x in pi_init]
-                    else:
-                        pi_out = list(range(N))
-                    for q_star in range(len(P_o_list)):
-                        if q_star in qbit_map_inv:
-                            q = qbit_map_inv[q_star]
-                            pi_out[q] = node_mapping[P_o_list[q_star]]
-
-                    # --- E penalty computed on the output layout after P_O ---
-                    e_penalty = 0.0
-                    if E:
-                        involved = set(result.involved_qbits)
-                        for (q_a, q_b) in E:
-                            if q_a in involved or q_b in involved:
-                                dist = D[pi_out[q_a]][pi_out[q_b]]
-                                if not np.isinf(dist):
-                                    e_penalty += max(0, (dist - 1)) * 3
-                                else:
-                                    e_penalty += 3.0
-
-                    score = cnot_count + routing_weight * routing_cost + e_weight * e_penalty
-                    best_score = min(best_score, score)
-
-        if np.isinf(best_score):
-            best_score = 0
-
-        return best_score
-
-    def compute_transition_cost(self, result_pred, result_succ, pi_init, D):
+    def compute_transition_cost(self, result_pred, result_succ, D):
         """
         Compute the minimum transition cost between two partitions over all
         (topology, P_o, node_mapping) configs of pred and (topology, P_i, node_mapping)
@@ -257,7 +114,6 @@ def compute_transition_cost(self, result_pred, result_succ, pi_init, D):
         Args:
             result_pred: PartitionSynthesisResult for the predecessor partition
             result_succ: PartitionSynthesisResult for the successor partition
-            pi_init: Current qubit layout (logical -> physical), or None
             D: Distance matrix between physical qubits
 
         Returns:
@@ -321,8 +177,6 @@ def compute_transition_cost(self, result_pred, result_succ, pi_init, D):
                 for q, target in in_pos.items():
                     if q in out_pos:
                         current = out_pos[q]
-                    elif pi_init is not None:
-                        current = int(pi_init[q])
                     else:
                         current = q
                     dist = D[current][target]
@@ -447,51 +301,6 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
             )
         return scoring_partitions
 
-    @staticmethod
-    def _group_circuit_for_levels(circuit):
-        """
-        Group a flat circuit into 2-qubit blocks for coarser DAG level generation.
-
-        Returns:
-            grouped_circ: Circuit with 2-qubit blocks as top-level elements
-            block_gate_orders: List of lists mapping each block index to the
-                               original flat gate indices it contains
-        """
-        gates = circuit.get_Gates()
-
-        # Track gate indices following group_into_two_qubit_blocks logic
-        pending = defaultdict(list)
-        block_gate_orders = []
-        last_block_for_qubit = {}
-
-        for gate_idx, gate in enumerate(gates):
-            qubits = gate.get_Involved_Qbits()
-            if len(qubits) == 1:
-                pending[qubits[0]].append(gate_idx)
-            else:
-                q0, q1 = qubits[0], qubits[1]
-                block_order = list(pending[q0]) + list(pending[q1]) + [gate_idx]
-                pending[q0].clear()
-                pending[q1].clear()
-                block_idx = len(block_gate_orders)
-                block_gate_orders.append(block_order)
-                last_block_for_qubit[q0] = block_idx
-                last_block_for_qubit[q1] = block_idx
-
-        # Trailing single-qubit gates
-        for q, gate_indices in pending.items():
-            if not gate_indices:
-                continue
-            if q in last_block_for_qubit:
-                block_gate_orders[last_block_for_qubit[q]].extend(gate_indices)
-            else:
-                # Qubit only has single-qubit gates — standalone block
-                block_gate_orders.append(list(gate_indices))
-
-        grouped_circ = group_into_two_qubit_blocks(circuit)
-
-        return grouped_circ, block_gate_orders
-
     # ------------------------------------------------------------------------
     # Partition Decomposition Methods
     # ------------------------------------------------------------------------
@@ -592,98 +401,22 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
     # Circuit Synthesis
     # ------------------------------------------------------------------------
 
-    def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
-                              DAG_start=0, DAG_end=0, window_gate_indices=None):
+    def SynthesizeWideCircuit(self, circ, orig_parameters):
         """
-        Partition and synthesize a circuit, optionally restricted to a window.
+        Partition and synthesize a full circuit.
 
         Args:
             circ: The full quantum circuit (must be flat — no subcircuit blocks)
             orig_parameters: Parameters for circ
-            pi_init: Current qubit permutation (logical->physical). When provided,
-                     enables routing-aware ILP scoring.
-            E: Virtual outgoing gates as List[(q_a, q_b)]. Computed automatically
-               when None and a window is active.
-            DAG_start: First DAG level to process (inclusive)
-            DAG_end: Last DAG level to process (exclusive). 0 means all levels.
-            window_gate_indices: Optional list of gate indices (into circ) to
-                process. When provided, overrides DAG_start/DAG_end.
 
         Returns:
             optimized_partitions: List of PartitionSynthesisResult / SingleQubitPartitionResult
         """
-        # ---- Phase 0: Window extraction ----
-        all_gates = circ.get_Gates()
+        working_circ = circ
+        working_parameters = orig_parameters
         qbit_num = circ.get_Qbit_Num()
 
-        if window_gate_indices is not None:
-            # Window specified by explicit gate indices
-            window_topo_order = list(window_gate_indices)
-            window_gate_set = set(window_topo_order)
-            full_circuit_mode = (len(window_gate_set) == len(all_gates))
-            has_gates_beyond_window = len(window_gate_set) < len(all_gates)
-        else:
-            # Window specified by DAG level range
-            levels = self.generate_DAG_levels(circ)
-            total_levels = len(levels)
-
-            if DAG_start == 0 and DAG_end == 0:
-                effective_end = total_levels
-            else:
-                effective_end = min(DAG_end, total_levels)
-            effective_start = DAG_start
-
-            if effective_start >= total_levels or effective_start >= effective_end:
-                self._last_synthesis_metadata = {
-                    'E': E if E is not None else [],
-                    'window_gates': 0,
-                    'total_gates': len(all_gates),
-                }
-                return []
-
-            window_topo_order = []
-            for level_idx in range(effective_start, effective_end):
-                window_topo_order.extend(levels[level_idx])
-            window_gate_set = set(window_topo_order)
-            full_circuit_mode = (len(window_gate_set) == len(all_gates))
-            has_gates_beyond_window = effective_end < total_levels
-
-        if full_circuit_mode:
-            working_circ = circ
-            working_parameters = orig_parameters
-        else:
-            # Build sub-circuit from window gates
-            working_circ = Circuit(qbit_num)
-            working_params_list = []
-            for orig_idx in window_topo_order:
-                gate = all_gates[orig_idx]
-                working_circ.add_Gate(gate)
-                start = gate.get_Parameter_Start_Index()
-                working_params_list.append(
-                    orig_parameters[start:start + gate.get_Parameter_Num()]
-                )
-            if working_params_list:
-                working_parameters = np.concatenate(working_params_list, axis=0)
-            else:
-                working_parameters = np.array([])
-
-        # ---- Phase 0b: Identify virtual outgoing gates (E) ----
-        if E is None and not full_circuit_mode and has_gates_beyond_window:
-            E = []
-            for orig_idx in window_topo_order:
-                gate = all_gates[orig_idx]
-                children = circ.get_Children(gate)
-                for child_idx in children:
-                    if child_idx not in window_gate_set:
-                        child_gate = all_gates[child_idx]
-                        child_qubits = child_gate.get_Involved_Qbits()
-                        if len(child_qubits) == 2:
-                            E.append((child_qubits[0], child_qubits[1]))
-            E = list(set(E))
-        elif E is None:
-            E = []
-
-        # ---- Phase 0c: Compute distance matrix ----
+        # ---- Phase 0: Compute distance matrix ----
         D = self.compute_distances_bfs(qbit_num)
 
         # ---- Phase 1: Partition enumeration ----
@@ -738,20 +471,13 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
 
-        # ---- Phase 3: ILP partition selection with routing-aware weights ----
-        gate_to_level = self.get_gate_DAG_level_map(working_circ)
-        max_level = max(gate_to_level.values()) if gate_to_level else 0
-
+        # ---- Phase 3: ILP partition selection with synthesis-cost weights ----
         weights = []
         for idx, result in enumerate(optimized_results[:len(allparts)]):
-            partition_gates = allparts[idx]
-            part_start = min(gate_to_level.get(g, 0) for g in partition_gates)
-            part_end = max(gate_to_level.get(g, 0) for g in partition_gates)
-            dag_start = part_start / max_level if max_level > 0 else 0.0
-            dag_end = part_end / max_level if max_level > 0 else 1.0
-            weights.append(
-                self.compute_routing_aware_weight(result, pi_init, D, E, dag_start, dag_end)
-            )
+            if isinstance(result, SingleQubitPartitionResult):
+                weights.append(0)
+            else:
+                weights.append(result.get_partition_synthesis_score())
 
         # ---- Phase 3b: Compute inter-partition transition costs ----
         transition_weight = self.config.setdefault('transition_weight', 1.0)
@@ -783,8 +509,8 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
                 result_pred = optimized_results[pred_idx]
                 result_succ = optimized_results[succ_idx]
                 # Compute both directions and take the minimum
-                cost_fwd = self.compute_transition_cost(result_pred, result_succ, pi_init, D)
-                cost_rev = self.compute_transition_cost(result_succ, result_pred, pi_init, D)
+                cost_fwd = self.compute_transition_cost(result_pred, result_succ, D)
+                cost_rev = self.compute_transition_cost(result_succ, result_pred, D)
                 cost = min(cost_fwd, cost_rev)
                 if cost > 0:
                     transition_costs[pair_key] = cost * transition_weight
@@ -820,13 +546,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
 
-        # ---- Phase 5: Store metadata and return ----
-        self._last_synthesis_metadata = {
-            'E': E,
-            'window_gates': len(window_gate_set),
-            'total_gates': len(all_gates),
-        }
-
         return optimized_partitions
 
     # ------------------------------------------------------------------------
@@ -836,120 +555,24 @@ def SynthesizeWideCircuit(self, circ, orig_parameters, pi_init=None, E=None,
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         N = circ.get_Qbit_Num()
 
-        # Pre-process: group circuit for coarser DAG levels (window boundaries)
-        has_2q_gates = any(len(g.get_Involved_Qbits()) >= 2 for g in circ.get_Gates())
-        if has_2q_gates:
-            grouped_circ, block_gate_orders = self._group_circuit_for_levels(circ)
-        else:
-            grouped_circ = circ
-            block_gate_orders = None
-
-        window_size = self.config.get('window_size', 0)
-        grouped_levels = self.generate_DAG_levels(grouped_circ)
-        total_levels = len(grouped_levels)
-
-        # ---- Full-circuit path (backward compat) ----
-        if window_size <= 0 or window_size >= total_levels:
-            # Pass the original flat circuit — no grouping needed for full circuit
-            optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
-
-            for partition in optimized_partitions:
-                if isinstance(partition, PartitionSynthesisResult):
-                    partition._topology = self.topology
-                    partition._topology_cache = self._topology_cache
+        optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
 
-            DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
-
-            D = self.compute_distances_bfs(N)
-            pi = np.arange(N)  # Dummy — free_initial_routing will derive pi_initial
-
-            F = self.get_initial_layer(IDAG, N, optimized_partitions)
-            scoring_partitions = self._build_scoring_partitions(optimized_partitions)
-
-            partition_order, pi, pi_initial = self.Heuristic_Search(F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, free_initial_routing=True)
-
-            final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
+        for partition in optimized_partitions:
+            if isinstance(partition, PartitionSynthesisResult):
+                partition._topology = self.topology
+                partition._topology_cache = self._topology_cache
 
-            return final_circuit, final_parameters, pi_initial, pi
+        DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
 
-        # ---- Windowed mode ----
         D = self.compute_distances_bfs(N)
-        pi = np.arange(N)  # Dummy for first window — free_initial_routing derives pi_initial
-        pi_initial = None
-
-        all_window_circuits = []
-        all_window_params = []
-
-        for window_start in range(0, total_levels, window_size):
-            window_end = min(window_start + window_size, total_levels)
-            is_first_window = (window_start == 0)
-
-            # Expand grouped block indices to flat gate indices
-            window_gate_indices = []
-            for level_idx in range(window_start, window_end):
-                for block_idx in grouped_levels[level_idx]:
-                    if block_gate_orders is not None:
-                        window_gate_indices.extend(block_gate_orders[block_idx])
-                    else:
-                        window_gate_indices.append(block_idx)
-
-            # a. Synthesize this window (pass original flat circuit)
-            window_partitions = self.SynthesizeWideCircuit(
-                circ, orig_parameters,
-                pi_init=pi if not is_first_window else None,
-                window_gate_indices=window_gate_indices
-            )
-
-            # Skip empty windows
-            if not window_partitions:
-                continue
-
-            # Retrieve virtual outgoing gates computed by SynthesizeWideCircuit
-            virtual_E = self._last_synthesis_metadata.get('E', []) or []
-
-            # b. Set topology info on partition results
-            for partition in window_partitions:
-                if isinstance(partition, PartitionSynthesisResult):
-                    partition._topology = self.topology
-                    partition._topology_cache = self._topology_cache
-
-            # c. Build per-window structures
-            DAG, IDAG = self.construct_DAG_and_IDAG(window_partitions)
-            F = self.get_initial_layer(IDAG, N, window_partitions)
-            scoring_partitions = self._build_scoring_partitions(window_partitions)
-
-            # d. Heuristic search for this window (pi carries forward)
-            partition_order, pi, window_pi_initial = self.Heuristic_Search(
-                F, pi.copy(), DAG, IDAG,
-                window_partitions, scoring_partitions, D,
-                virtual_E=virtual_E if virtual_E else None,
-                free_initial_routing=is_first_window,
-            )
-
-            if is_first_window:
-                pi_initial = window_pi_initial
-
-            # e. Construct window circuit
-            window_circuit, window_params = self.Construct_circuit_from_HS(
-                partition_order, window_partitions, N
-            )
-
-            # f. Append results
-            all_window_circuits.append(window_circuit)
-            all_window_params.append(window_params)
+        pi = np.arange(N)
 
-        # Concatenate all window circuits and parameters
-        final_circuit = Circuit(N)
-        for wc in all_window_circuits:
-            final_circuit.add_Circuit(wc)
+        F = self.get_initial_layer(IDAG, N, optimized_partitions)
+        scoring_partitions = self._build_scoring_partitions(optimized_partitions)
 
-        if all_window_params:
-            final_parameters = np.concatenate(all_window_params, axis=0)
-        else:
-            final_parameters = np.array([])
+        partition_order, pi, pi_initial = self.Heuristic_Search(F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, free_initial_routing=True)
 
-        if pi_initial is None:
-            pi_initial = np.arange(N)
+        final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
 
         return final_circuit, final_parameters, pi_initial, pi
 
@@ -957,7 +580,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
     # Heuristic Search
     # ------------------------------------------------------------------------
 
-    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, virtual_E=None, free_initial_routing=False):
+    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, free_initial_routing=False):
         pi_initial = pi.copy()
 
         resolved_partitions = [False] * len(DAG)
@@ -996,12 +619,23 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                    bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved',
                    disable=self.config.get('progressbar', 0) == False)
 
+        max_E_size = self.config.get('max_E_size', 20)
+        max_lookahead = self.config.get('max_lookahead', 4)
+        E_W = self.config.get('E_weight', 0.5)
+        E_alpha = self.config.get('E_alpha', 0.9)
+
         while len(F) != 0:
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
                     break
                 F_snapshot = tuple(F)
                 use_free_routing = not first_routing_done
+
+                E = self.generate_extended_set(
+                    F, DAG, IDAG, resolved_partitions, optimized_partitions,
+                    max_E_size=max_E_size, max_lookahead=max_lookahead
+                )
+
                 scores = [
                         self.score_partition_candidate(
                             partition_candidate,
@@ -1010,8 +644,10 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             scoring_partitions,
                             D,
                             self._swap_cache,
-                            virtual_E,
                             free_routing=use_free_routing,
+                            E=E,
+                            W=E_W,
+                            alpha=E_alpha,
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -1111,19 +747,20 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache, virtual_E=None, free_routing=False):
-        score_F = 0
+    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
+                                  free_routing=False, E=None, W=0.5, alpha=0.9):
+        score = 0
         swap_weight = 1
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, free_routing=free_routing)
         if not free_routing:
-            score_F += swap_weight * len(swaps) * 3
-        score_F += 0.1*len(partition_candidate.circuit_structure)
+            score += swap_weight * len(swaps) * 3
+        score += 0.1*len(partition_candidate.circuit_structure)
 
         for partition_idx in F:
             partition = scoring_partitions[partition_idx]
             if partition is None or partition_idx == partition_candidate.partition_idx:
                 continue
-            qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}  # q* → circuit qubit
+            qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
             mini_scores = []
             for tdx, mini_topology in enumerate(partition.mini_topologies):
                 for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
@@ -1137,20 +774,91 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                         routing_cost = 0
                     mini_scores.append(routing_cost + cnot_count)
             if mini_scores:
-                score_F += np.min(mini_scores)
+                score += np.min(mini_scores)
+
+        # Extended set look-ahead scoring
+        if E:
+            e_score = 0
+            for partition_idx, depth in E:
+                partition = scoring_partitions[partition_idx]
+                if partition is None or partition_idx == partition_candidate.partition_idx:
+                    continue
+                qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
+                mini_scores = []
+                for tdx, mini_topology in enumerate(partition.mini_topologies):
+                    for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
+                        cnot_count = len(partition.circuit_structures[tdx][pdx])
+                        if mini_topology:
+                            routing_cost = swap_weight * 3 * sum(
+                                max(0, D[int(output_perm[qbit_map_inv[P_i[u]]])][int(output_perm[qbit_map_inv[P_i[v]]])] - 1)
+                                for u, v in mini_topology
+                            )
+                        else:
+                            routing_cost = 0
+                        mini_scores.append(routing_cost + cnot_count)
+                if mini_scores:
+                    e_score += np.min(mini_scores) * (alpha ** depth)
+            if len(E) > 0:
+                score += W * e_score / len(E)
+
+        return score
+
+    # ------------------------------------------------------------------------
+    # Extended Set
+    # ------------------------------------------------------------------------
 
-        # Virtual outgoing gate penalty: cross-window look-ahead (always active)
-        virtual_e_score = 0.0
-        if virtual_E:
-            for (q_a, q_b) in virtual_E:
-                dist = D[int(output_perm[q_a])][int(output_perm[q_b])]
-                if not np.isinf(dist):
-                    virtual_e_score += max(0, (dist - 1)) * 3
+    @staticmethod
+    def generate_extended_set(F, DAG, IDAG, resolved_partitions, optimized_partitions,
+                              max_E_size=20, max_lookahead=4):
+        """
+        Generate SABRE-style extended set: multi-qubit partitions near the
+        front layer, up to ``max_lookahead`` levels deep and ``max_E_size``
+        entries.  Returns list of (partition_idx, depth) tuples.
+        """
+        E = []
+        E_set = set()
+        F_set = set(F)
+
+        for front_idx in F:
+            if len(E) >= max_E_size:
+                break
+
+            # BFS from front_idx through DAG children
+            queue = []  # (child_idx, depth)
+            for child in DAG[front_idx]:
+                queue.append((child, 1))
+
+            while queue and len(E) < max_E_size:
+                child_idx, depth = queue.pop(0)
+                if depth > max_lookahead:
+                    continue
+                if child_idx in E_set or child_idx in F_set:
+                    continue
+                if resolved_partitions[child_idx]:
+                    continue
+
+                # Check all parents resolved (except those still in F)
+                parents_resolved = all(
+                    resolved_partitions[p] or p in F_set
+                    for p in IDAG[child_idx]
+                )
+                if not parents_resolved:
+                    continue
+
+                # Skip single-qubit partitions — follow through them
+                if isinstance(optimized_partitions[child_idx], SingleQubitPartitionResult):
+                    for grandchild in DAG[child_idx]:
+                        queue.append((grandchild, depth))
+                    continue
 
-        E_score = 0.3 * virtual_e_score if virtual_e_score > 0.0 else 0.0
-        F_score = 0.7 * score_F
+                E.append((child_idx, depth))
+                E_set.add(child_idx)
 
-        return E_score + F_score
+                if depth < max_lookahead:
+                    for grandchild in DAG[child_idx]:
+                        queue.append((grandchild, depth + 1))
+
+        return E
 
     # ------------------------------------------------------------------------
     # Candidate Generation
@@ -1219,35 +927,6 @@ def construct_DAG_and_IDAG(self, optimized_partitions):
             IDAG.append(parents)
         return DAG, IDAG
     
-    def construct_sDAG(self, optimized_partitions):
-        sDAG = [[] for _ in range(len(optimized_partitions))]
-        
-        for idx in range(len(optimized_partitions)):
-            # Skip single-qubit partitions
-            if len(optimized_partitions[idx].involved_qbits) <= 1:
-                continue
-                
-            children = []
-            
-            if idx != len(optimized_partitions)-1:
-                involved_qbits_current = optimized_partitions[idx].involved_qbits.copy()
-                for next_idx in range(idx+1, len(optimized_partitions)):
-                    # Skip single-qubit partitions when searching for children
-                    if len(optimized_partitions[next_idx].involved_qbits) <= 1:
-                        continue
-                        
-                    involved_qbits_next = optimized_partitions[next_idx].involved_qbits
-                    intersection = [i for i in involved_qbits_current if i in involved_qbits_next]
-                    if len(intersection) > 0:
-                        children.append(next_idx)
-                        for intersection_qbit in intersection:
-                            involved_qbits_current.remove(intersection_qbit)
-                    if len(involved_qbits_current) == 0:
-                        break                        
-            sDAG[idx] = children
-            
-        return sDAG
-
     # ------------------------------------------------------------------------
     # Distance & Layout
     # ------------------------------------------------------------------------
@@ -1279,135 +958,6 @@ def compute_distances_bfs(self, N):
         return D #multiply by 3 to make it CNOT cost instead of SWAP cost
 
 
-    def _compute_smart_initial_layout(self, circuit, N, D):
-        """
-        Compute initial layout using interaction graph + simulated annealing.
-        Much better than the greedy approach.
-        """
-        # Build interaction graph: weight = number of CNOTs between qubits
-        interaction_graph = defaultdict(int)
-        gates = circuit.get_Gates()
-        
-        for gate in gates:
-            if gate.get_Control_Qbit() != -1:
-                q1, q2 = sorted([gate.get_Target_Qbit(), gate.get_Control_Qbit()])
-                if q1 < N and q2 < N:
-                    interaction_graph[(q1, q2)] += 1
-        
-        # If no 2-qubit gates, return identity
-        if not interaction_graph:
-            return np.arange(N)
-        
-        # Start with greedy mapping as baseline
-        pi_greedy = self._greedy_initial_layout(interaction_graph, N, D)
-        best_pi = pi_greedy.copy()
-        best_score = self._evaluate_layout_score(best_pi, interaction_graph, D)
-        
-        # Simulated annealing to improve
-        current_pi = best_pi.copy()
-        current_score = best_score
-        
-        # Temperature schedule
-        max_iter = 100 * N
-        for iteration in range(max_iter):
-            temp = 1.0 - (iteration / max_iter)
-            
-            # Propose swap of two physical qubits
-            p1, p2 = np.random.choice(N, 2, replace=False)
-            new_pi = current_pi.copy()
-            new_pi[p1], new_pi[p2] = new_pi[p2], new_pi[p1]  # Swap assignments
-            
-            # Evaluate new layout
-            new_score = self._evaluate_layout_score(new_pi, interaction_graph, D)
-            
-            # Accept if better or with probability
-            delta = new_score - current_score
-            if delta < 0 or np.random.random() < np.exp(-delta / (temp + 1e-6)):
-                current_pi = new_pi
-                current_score = new_score
-                
-                if current_score < best_score:
-                    best_score = current_score
-                    best_pi = current_pi.copy()
-        
-        return best_pi
-    
-    def _greedy_initial_layout(self, interaction_graph, N, D):
-        """Greedy baseline mapping - much simpler and reliable"""
-        pi = np.arange(N)
-        placed_logical = set()
-        placed_physical = set()
-        
-        # Sort interactions by weight (descending)
-        sorted_interactions = sorted(
-            interaction_graph.items(), 
-            key=lambda x: x[1], 
-            reverse=True
-        )
-        
-        # Place highest interaction pair first
-        if sorted_interactions:
-            (q1, q2), _ = sorted_interactions[0]
-            # Find closest physical pair
-            min_dist = float('inf')
-            best_pair = None
-            for p1 in range(N):
-                for p2 in range(p1 + 1, N):
-                    if D[p1][p2] < min_dist:
-                        min_dist = D[p1][p2]
-                        best_pair = (p1, p2)
-            
-            if best_pair:
-                p1, p2 = best_pair
-                pi[q1] = p1
-                pi[q2] = p2
-                placed_logical = {q1, q2}
-                placed_physical = {p1, p2}
-        
-        # Place remaining qubits
-        remaining_logical = [q for q in range(N) if q not in placed_logical]
-        for q in remaining_logical:
-            best_p = None
-            best_cost = float('inf')
-            
-            for p in range(N):
-                if p in placed_physical:
-                    continue
-                
-                # Cost = sum of distances to already placed interacting qubits
-                cost = 0
-                for other_q in placed_logical:
-                    weight = interaction_graph.get(tuple(sorted((q, other_q))), 0)
-                    if weight > 0:
-                        other_p = pi[other_q]
-                        cost += D[p][other_p] * weight
-                
-                if cost < best_cost:
-                    best_cost = cost
-                    best_p = p
-            
-            if best_p is not None:
-                pi[q] = best_p
-                placed_logical.add(q)
-                placed_physical.add(best_p)
-        
-        return pi
-    
-    def _evaluate_layout_score(self, pi, interaction_graph, D):
-        """
-        Evaluate layout quality: lower score is better.
-        Score = sum(distance(physical_q1, physical_q2) * interaction_weight)
-        """
-        score = 0.0
-        for (q1, q2), weight in interaction_graph.items():
-            p1, p2 = pi[q1], pi[q2]
-            distance = D[p1][p2]
-            if np.isinf(distance):
-                return float('inf')  # Invalid layout
-            score += distance * weight
-        
-        return score
-    
     def generate_DAG_levels(self, circuit):
         """
         Generate DAG levels - groups gates by their topological level.
@@ -1467,36 +1017,3 @@ def generate_DAG_levels(self, circuit):
         
         return levels
 
-    def get_gate_DAG_level(self, circuit, gate_idx):
-        """
-        Find the DAG level a specific gate belongs to.
-
-        Args:
-            circuit: The quantum circuit to analyze
-            gate_idx: Index of the gate within the circuit's gate list
-
-        Returns:
-            int: The DAG level the gate belongs to (0-indexed), or -1 if not found.
-        """
-        levels = self.generate_DAG_levels(circuit)
-        for level_idx, level_gates in enumerate(levels):
-            if gate_idx in level_gates:
-                return level_idx
-        return -1
-
-    def get_gate_DAG_level_map(self, circuit):
-        """
-        Build a mapping from gate index to its DAG level.
-
-        Args:
-            circuit: The quantum circuit to analyze
-
-        Returns:
-            dict: Mapping {gate_idx: level} for every gate in the circuit.
-        """
-        levels = self.generate_DAG_levels(circuit)
-        gate_to_level = {}
-        for level_idx, level_gates in enumerate(levels):
-            for gate_idx in level_gates:
-                gate_to_level[gate_idx] = level_idx
-        return gate_to_level
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 54ff2053a..a6cfa3e52 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -3,116 +3,13 @@
 from itertools import permutations, combinations
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
 import heapq
-import math
 import logging
-import pulp
 from collections import defaultdict
 
 
 # ============================================================================
 # SWAP Routing Algorithms
 # ============================================================================
-def find_constrained_swaps_A_star(pi_A, pi_B_dict, dist_matrix):
-    """
-    Find SWAP sequence to route subset of virtual qubits to targets.
-    
-    Args:
-        pi_A: List [Q0, Q1, ...] where pi_A[q] = Q (complete initial mapping)
-        pi_B_dict: Dict {q: Q} specifying only qubits that need routing
-        dist_matrix: Pre-computed distance matrix dist[i][j] between physical qubits
-    
-    Returns:
-        swaps: List of (i, j) SWAP operations on adjacent physical qubits
-        final_permutation: List showing final virtual→physical mapping
-    """
-    n = len(pi_A)
-    
-    # Build adjacency list from distance matrix
-    adj = [set() for _ in range(n)]
-    for i in range(n):
-        for j in range(i+1, n):
-            if dist_matrix[i][j] == 1:  # Adjacent in topology
-                adj[i].add(j)
-                adj[j].add(i)
-    
-    # Use physical-to-virtual representation for easier SWAP handling
-    # state[P] = q means physical qubit P contains virtual qubit q
-    def to_phys_to_virt(virt_to_phys):
-        """Convert virtual→physical list to physical→virtual list"""
-        p2v = [0] * n
-        for q in range(n):
-            P = virt_to_phys[q]
-            p2v[P] = q
-        return p2v
-    
-    def to_virt_to_phys(phys_to_virt):
-        """Convert physical→virtual list to virtual→physical list"""
-        v2p = [0] * n
-        for P in range(n):
-            q = phys_to_virt[P]
-            v2p[q] = P
-        return v2p
-    
-    start_state = tuple(to_phys_to_virt(pi_A))
-    
-    def is_goal(state):
-        """Check if target qubits are in correct physical positions"""
-        for q, target_P in pi_B_dict.items():
-            if state[target_P] != q:  # Physical position target_P should contain virtual q
-                return False
-        return True
-
-    def heuristic(state):
-        """Lower bound: sum of distances for qubits needing routing"""
-        total = 0.0
-        for q, target_P in pi_B_dict.items():
-            # Find where virtual qubit q currently is
-            current_P = state.index(q)
-            distance = dist_matrix[current_P][target_P]
-            if np.isinf(distance):
-                logging.warning(
-                    "Encountered unreachable qubit pair (%s, %s) in routing heuristic; returning inf cost.",
-                    current_P,
-                    target_P,
-                )
-                return math.inf
-            total += float(distance)
-        return math.floor(total / 2)  # Optimistic: each SWAP helps 2 qubits
-    
-    heap = [(heuristic(start_state), 0, start_state, [])]
-    visited = {start_state: 0}
-    
-    while heap:
-        f, g, current, path = heapq.heappop(heap)
-        
-        if is_goal(current):
-            # Convert final state back to virtual→physical mapping
-            final_permutation = to_virt_to_phys(current)
-            return path, final_permutation
-        
-        if visited.get(current, float('inf')) < g:
-            continue
-        
-        # Try all valid SWAPs on adjacent physical qubits
-        current_list = list(current)
-        for i in range(n):
-            for j in adj[i]:
-                if i < j:  # Avoid duplicate (i,j) and (j,i)
-                    # SWAP physical qubits i and j
-                    new_state = current_list[:]
-                    new_state[i], new_state[j] = new_state[j], new_state[i]
-                    new_state_tuple = tuple(new_state)
-                    
-                    new_g = g + 1
-                    
-                    if visited.get(new_state_tuple, float('inf')) > new_g:
-                        visited[new_state_tuple] = new_g
-                        new_f = new_g + heuristic(new_state_tuple)
-                        new_path = path + [(i, j)]
-                        heapq.heappush(heap, (new_f, new_g, new_state_tuple, new_path))
-    
-    return None, None  # No solution found
-
 def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
     """
     Route partition qubits to their target physical positions using A* over
@@ -230,28 +127,6 @@ def heuristic(positions):
     )
     return [], list(pi_A)
 
-def calculate_swaps_quick(P_i, qbit_map, node_mapping, pi, D, swap_cache=None):
-    P_i_inv = [P_i.index(i) for i in range(len(P_i))]  # Compute inverse
-    qbit_map_input = {k : node_mapping[P_i_inv[v]] for k,v in qbit_map.items()}
-    # Convert pi to plain Python list of ints (may contain np.int64)
-    pi_list = [int(x) for x in pi]
-
-    # Check cache if provided
-    cache_key = None
-    if swap_cache is not None:
-        # Create cache key: (pi_tuple, frozenset of qbit_map_input items)
-        pi_tuple = tuple(pi_list)
-        qbit_map_frozen = frozenset(qbit_map_input.items())
-        cache_key = (pi_tuple, qbit_map_frozen)
-        if cache_key in swap_cache:
-            swaps, pi_init = swap_cache[cache_key]
-        else:
-            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
-            swap_cache[cache_key] = (swaps, pi_init)
-    else:
-        swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
-    return len(swaps)
-
 
 # ============================================================================
 # Topology Utilities
@@ -373,46 +248,11 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
             return mapping
     return {}
 
-def extract_subtopology(involved_qbits, qbit_map, config ):
-    mini_topology = []
-    for edge in config["topology"]:
-        if edge[0] in involved_qbits and edge[1] in involved_qbits:
-            mini_topology.append((qbit_map[edge[0]],qbit_map[edge[1]]))
-    return mini_topology
-
 
 # ============================================================================
 # Distance & Cost Calculations
 # ============================================================================
 
-def calculate_dist_small(mini_topology, qbit_map, dist_matrix, pi):
-    """Estimate the routing cost needed to bring the partition qubits adjacent.
-
-    Minimises over all assignments of circuit qubits to local topology
-    positions so that, e.g., the hub qubit in a 3-qubit star topology is
-    always matched to the physically most central circuit qubit rather than
-    whichever qubit happened to be assigned local label 0 by the ILP.
-
-    Returns sum-of-edge-distances * 3 (three gates per SWAP) for the best
-    qubit-to-position assignment.
-    """
-    if not mini_topology:
-        return 0
-    # Build ordered list: circuit_qubits[j] = circuit qubit at local position j
-    k = len(qbit_map)
-    qbit_map_inv = {local: circ for circ, local in qbit_map.items()}
-    circuit_qubits = [qbit_map_inv[j] for j in range(k)]
-
-    # Try all k! permutations to find the cheapest qubit→position assignment
-    best = float('inf')
-    for perm in permutations(range(k)):
-        cost = 0
-        for u, v in mini_topology:
-            cost += (dist_matrix[pi[circuit_qubits[perm[u]]]][pi[circuit_qubits[perm[v]]]] - 1) * 3
-        if cost < best:
-            best = cost
-    return best
-
 # ============================================================================
 # Data Classes
 # ============================================================================
@@ -668,65 +508,3 @@ def construct_swap_circuit(swap_order, N):
         swap_circ.add_CNOT(swap[0],swap[1])
     return swap_circ
 
-def group_into_two_qubit_blocks(circuit: Circuit) -> Circuit:
-    """
-    Takes a flat circuit and returns an equivalent circuit whose top-level
-    elements are all 2-qubit Circuit blocks, each containing exactly one
-    2-qubit gate.
-
-    Single-qubit gates are buffered and flushed into the next 2-qubit block
-    on that qubit. Trailing single-qubit gates (after the last 2-qubit gate
-    on a qubit) are appended to the last block that involved that qubit.
-
-    Assumes the circuit contains only 1- and 2-qubit gates.
-
-    Args:
-        circuit: Flat input circuit with individual gates
-
-    Returns:
-        Circuit: Equivalent circuit whose top-level elements are all 2-qubit blocks
-    """
-    N = circuit.get_Qbit_Num()
-
-    pending = defaultdict(list)  # pending[q] = single-qubit gates waiting for next block on q
-    blocks = []                  # accumulated Circuit block objects
-    last_block_for_qubit = {}    # last_block_for_qubit[q] = index into blocks
-
-    for gate in circuit.get_Gates():
-        qubits = gate.get_Involved_Qbits()
-        if len(qubits) == 1:
-            pending[qubits[0]].append(gate)
-        else:  # 2-qubit gate
-            q0, q1 = qubits[0], qubits[1]
-            block = Circuit(N)
-            for g in pending[q0]:
-                block.add_Gate(g)
-            for g in pending[q1]:
-                block.add_Gate(g)
-            pending[q0].clear()
-            pending[q1].clear()
-            block.add_Gate(gate)
-            idx = len(blocks)
-            blocks.append(block)
-            last_block_for_qubit[q0] = idx
-            last_block_for_qubit[q1] = idx
-
-    # Append trailing single-qubit gates to the last block that touched that qubit
-    for q, gates_list in pending.items():
-        if not gates_list:
-            continue
-        if q in last_block_for_qubit:
-            block = blocks[last_block_for_qubit[q]]
-            for g in gates_list:
-                block.add_Gate(g)
-        else:
-            # Qubit only has single-qubit gates — create a standalone block
-            block = Circuit(N)
-            for g in gates_list:
-                block.add_Gate(g)
-            blocks.append(block)
-
-    result = Circuit(N)
-    for block in blocks:
-        result.add_Circuit(block)
-    return result

From f44519293ca36e5211e1f043e07bd5885f9eb3fc Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 2 Mar 2026 23:13:52 +0100
Subject: [PATCH 083/232] Add verbosity to wide circuit optimization

---
 .../qgd_Wide_Circuit_Optimization.py          | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 82d261d4d..753383cec 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -991,7 +991,7 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
 
         #subcircuits = subcircuits[9:10]
 
-        if parent_process() is None: print(len(subcircuits), "partitions found to optimize")
+        if parent_process() is None and self.config["verbosity"] > 0: print(len(subcircuits), "partitions found to optimize")
 
 
         # the list of optimized subcircuits
@@ -1020,12 +1020,11 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                 config = config if structures is None or partition_idx >= len(structures) else {**config, 'strategy': 'Custom', 'max_inner_iterations': 10000, 'max_iteration_loops': 4}
                 new_subcircuit, new_parameters = callback_fnc(self.PartitionDecompositionProcess( subcircuit, subcircuit_parameters, config,
                                                                                      None if structures is None or partition_idx >= len(structures) else structures[partition_idx] ))
-                if subcircuit != new_subcircuit:
+                if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
+                    print( "original subcircuit:    ", subcircuit.get_Gate_Nums())
+                    print( "reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums())
 
-                    print( "original subcircuit:    ", subcircuit.get_Gate_Nums()) 
-                    print( "reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums()) 
-
-                if partition_idx % 100 == 99: print(partition_idx+1, "partitions optimized")
+                if partition_idx % 100 == 99 and self.config["verbosity"] > 0: print(partition_idx+1, "partitions optimized")
                 optimized_subcircuits[ partition_idx ] = new_subcircuit
                 optimized_parameter_list[ partition_idx ] = new_parameters
         else:
@@ -1057,11 +1056,10 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                     callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, *(z[0] for z in x)], [subcircuit_parameters, *(z[1] for z in x)] )
                     new_subcircuit, new_parameters = callback_fnc(async_results[partition_idx].get( timeout = None ))
 
-                    if subcircuit != new_subcircuit:
-
-                        print( "original subcircuit:    ", subcircuit.get_Gate_Nums()) 
-                        print( "reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums()) 
-                    if partition_idx % 100 == 99: print(partition_idx+1, "partitions optimized")
+                    if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
+                        print( "original subcircuit:    ", subcircuit.get_Gate_Nums())
+                        print( "reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums())
+                    if partition_idx % 100 == 99 and self.config["verbosity"] > 0: print(partition_idx+1, "partitions optimized")
                     optimized_subcircuits[ partition_idx ] = new_subcircuit
                     optimized_parameter_list[ partition_idx ] = new_parameters
 
@@ -1074,8 +1072,8 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
         else:
             wide_circuit, wide_parameters = self.ConstructCircuitFromPartitions( optimized_subcircuits, optimized_parameter_list )
 
-        if parent_process() is None:
-            print( "original circuit:    ", circ.get_Gate_Nums()) 
+        if parent_process() is None and self.config["verbosity"] > 0:
+            print( "original circuit:    ", circ.get_Gate_Nums())
             print( "reoptimized circuit: ", wide_circuit.get_Gate_Nums()) 
 
 

From d06333243d80eb566554c1351e3e277baa271471 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 3 Mar 2026 00:10:36 +0100
Subject: [PATCH 084/232] Remove transition costs (no effect on results)

---
 squander/partitioning/ilp.py |  35 ++--------
 squander/synthesis/PartAM.py | 124 +----------------------------------
 2 files changed, 8 insertions(+), 151 deletions(-)

diff --git a/squander/partitioning/ilp.py b/squander/partitioning/ilp.py
index 22898ce1f..422090c9e 100644
--- a/squander/partitioning/ilp.py
+++ b/squander/partitioning/ilp.py
@@ -538,7 +538,7 @@ def sol_to_badsccs(g, allparts, L):
     _, scc = scc_tarjan_iterative(G_part)
     return {frozenset(v) for v in scc if len(v) > 1}
 
-def ilp_global_optimal(allparts, g, weighted_info=None, gurobi_direct=False, use_order=False, weights=None, transition_costs=None):
+def ilp_global_optimal(allparts, g, weighted_info=None, gurobi_direct=False, use_order=False, weights=None):
     """
     Select an optimal set of non-overlapping parts via ILP/MIP with cycle cuts.
 
@@ -578,19 +578,8 @@ def fortet_inequalities(x, y, z): #-z-x<=0 -z+x+y<=1 z-x<=0 z+x-y<=1
                 m.setParam(GRB.Param.LazyConstraints, 1)
                 x = m.addVars(range(N), lb=[0]*N, ub=[1]*N, vtype=[GRB.BINARY]*N, name=["x_" + str(i) for i in range(N)])
                 for i in g: m.addConstr(gp.quicksum(x[j] for j in gate_to_parts[i]) == 1)
-                transition_obj_gurobi = 0
-                if transition_costs:
-                    y_vars_g = {}
-                    for (i, j), cost in transition_costs.items():
-                        y_var = m.addVar(vtype=GRB.BINARY, name=f"y_{i}_{j}")
-                        m.update()
-                        m.addConstr(y_var >= x[i] + x[j] - 1)
-                        m.addConstr(y_var <= x[i])
-                        m.addConstr(y_var <= x[j])
-                        y_vars_g[(i, j)] = (y_var, cost)
-                    transition_obj_gurobi = gp.quicksum(cost * yv for yv, cost in y_vars_g.values())
-                if weights is not None: m.setObjective(gp.quicksum((weights[i]*N+1) * x[i] for i in range(N)) + transition_obj_gurobi, GRB.MINIMIZE)
-                elif weighted_info is None: m.setObjective(gp.quicksum(x[i] for i in range(N)) + transition_obj_gurobi, GRB.MINIMIZE)
+                if weights is not None: m.setObjective(gp.quicksum((weights[i]*N+1) * x[i] for i in range(N)), GRB.MINIMIZE)
+                elif weighted_info is None: m.setObjective(gp.quicksum(x[i] for i in range(N)), GRB.MINIMIZE)
                 else:
                     Npre, Npost, Nprepost = len(single_qubit_chains_pre), len(single_qubit_chains_post), len(single_qubit_chains_prepost)
                     pre = m.addVars(list(single_qubit_chains_pre), lb=[0]*Npre, ub=[1]*Npre, vtype=[GRB.BINARY]*Npre, name=["pre_" + str(i) for i in single_qubit_chains_pre])
@@ -669,7 +658,7 @@ def fortet_inequalities(x, y, z): #-z-x<=0 -z+x+y<=1 z-x<=0 z+x-y<=1
                     for s in post:                        
                         if not single_qubit_chains_post[s][0] in noprepost:
                             S.append((1-post[s])*(2**max_qubits_per_partition * (2 * (4 + 2) + 2)))
-                    m.setObjective(gp.quicksum(S)*N+gp.quicksum(x[i] for i in range(N)) + transition_obj_gurobi, GRB.MINIMIZE)
+                    m.setObjective(gp.quicksum(S)*N+gp.quicksum(x[i] for i in range(N)), GRB.MINIMIZE)
                 def cb(m, where):
                     if where == GRB.Callback.MIPSOL:
                         x_val = m.cbGetSolution([x[i] for i in range(N)])
@@ -694,18 +683,8 @@ def cb(m, where):
     #print(all_cycles_from_dag_edges(succ))
     #for u, v in two_cycles_from_dag_edges(g, gate_to_parts, allparts):
     #    prob += x[u] + x[v] <= 1 #constraint that no two cycles are included
-    transition_obj = 0
-    if transition_costs:
-        y_vars = {}
-        for (i, j), cost in transition_costs.items():
-            y_var = pulp.LpVariable(f"y_{i}_{j}", cat="Binary")
-            prob += y_var >= x[i] + x[j] - 1
-            prob += y_var <= x[i]
-            prob += y_var <= x[j]
-            y_vars[(i, j)] = y_var
-        transition_obj = pulp.lpSum(cost * y_vars[(i, j)] for (i, j), cost in transition_costs.items())
-    if weights is not None: prob.setObjective(pulp.lpSum((weights[i]*N+1) * x[i] for i in range(N)) + transition_obj)
-    elif weighted_info is None: prob.setObjective(pulp.lpSum(x[i] for i in range(N)) + transition_obj)
+    if weights is not None: prob.setObjective(pulp.lpSum((weights[i]*N+1) * x[i] for i in range(N)))
+    elif weighted_info is None: prob.setObjective(pulp.lpSum(x[i] for i in range(N)))
     else:
         Npre, Npost, Nprepost = len(single_qubit_chains_pre), len(single_qubit_chains_post), len(single_qubit_chains_prepost)
         pre = pulp.LpVariable.dicts("pre", list(single_qubit_chains_pre), cat="Binary")
@@ -781,7 +760,7 @@ def cb(m, where):
         for s in post:                        
             if not single_qubit_chains_post[s][0] in noprepost:
                 S.append((1-post[s])*(2**max_qubits_per_partition * (2 * (4 + 2) + 2)))
-        prob.setObjective(pulp.lpSum(S)*N+pulp.lpSum(x[i] for i in range(N)) + transition_obj)
+        prob.setObjective(pulp.lpSum(S)*N+pulp.lpSum(x[i] for i in range(N)))
     while True:
         from gurobipy import GRB
         import gurobipy as gp
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index fa8d1ee43..ad267ccff 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -102,92 +102,6 @@ def __init__(self, config):
     # Scoring Methods
     # ------------------------------------------------------------------------
 
-    def compute_transition_cost(self, result_pred, result_succ, D):
-        """
-        Compute the minimum transition cost between two partitions over all
-        (topology, P_o, node_mapping) configs of pred and (topology, P_i, node_mapping)
-        configs of succ.
-
-        The cost measures how far each qubit involved in succ needs to travel
-        from its position after pred's output to where succ's input requires it.
-
-        Args:
-            result_pred: PartitionSynthesisResult for the predecessor partition
-            result_succ: PartitionSynthesisResult for the successor partition
-            D: Distance matrix between physical qubits
-
-        Returns:
-            float: Minimum transition cost (lower is better)
-        """
-        if isinstance(result_pred, SingleQubitPartitionResult) or isinstance(result_succ, SingleQubitPartitionResult):
-            return 0
-
-        N = len(D)
-        involved_pred = set(result_pred.involved_qbits)
-        involved_succ = set(result_succ.involved_qbits)
-        qmap_pred_inv = {v: k for k, v in result_pred.qubit_map.items()}  # q* -> q
-        qmap_succ_inv = {v: k for k, v in result_succ.qubit_map.items()}  # q* -> q
-        k_succ = result_succ.N
-
-        # Precompute all output positions for pred: list of dicts {q: physical_pos}
-        pred_outputs = []
-        for tdx, mini_topo in enumerate(result_pred.mini_topologies):
-            topo_candidates = self._get_subtopologies_of_type_cached(mini_topo)
-            if not topo_candidates:
-                continue
-            node_mappings = [get_node_mapping(mini_topo, tc) for tc in topo_candidates]
-            node_mappings = [nm for nm in node_mappings if nm]
-            if not node_mappings:
-                continue
-            for pdx, (_, P_o) in enumerate(result_pred.permutations_pairs[tdx]):
-                P_o_list = list(P_o)
-                for nm in node_mappings:
-                    out_pos = {}
-                    for q_star, q in qmap_pred_inv.items():
-                        out_pos[q] = nm[P_o_list[q_star]]
-                    pred_outputs.append(out_pos)
-
-        # Precompute all input target positions for succ: list of dicts {q: physical_pos}
-        succ_inputs = []
-        for tdx, mini_topo in enumerate(result_succ.mini_topologies):
-            topo_candidates = self._get_subtopologies_of_type_cached(mini_topo)
-            if not topo_candidates:
-                continue
-            node_mappings = [get_node_mapping(mini_topo, tc) for tc in topo_candidates]
-            node_mappings = [nm for nm in node_mappings if nm]
-            if not node_mappings:
-                continue
-            for pdx, (P_i, _) in enumerate(result_succ.permutations_pairs[tdx]):
-                P_i_list = list(P_i)
-                P_i_inv = [P_i_list.index(i) for i in range(k_succ)]
-                for nm in node_mappings:
-                    in_pos = {}
-                    for q_star, q in qmap_succ_inv.items():
-                        in_pos[q] = nm[P_i_inv[q_star]]
-                    succ_inputs.append(in_pos)
-
-        if not pred_outputs or not succ_inputs:
-            return 0
-
-        # Find minimum transition cost over all (pred_output, succ_input) pairs
-        best_cost = np.inf
-        for out_pos in pred_outputs:
-            for in_pos in succ_inputs:
-                cost = 0
-                for q, target in in_pos.items():
-                    if q in out_pos:
-                        current = out_pos[q]
-                    else:
-                        current = q
-                    dist = D[current][target]
-                    if not np.isinf(dist):
-                        cost += max(0, dist - 1) * 3
-                    if cost >= best_cost:
-                        break
-                best_cost = min(best_cost, cost)
-
-        return best_cost if not np.isinf(best_cost) else 0
-
     # ------------------------------------------------------------------------
     # Caching Methods
     # ------------------------------------------------------------------------
@@ -479,43 +393,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             else:
                 weights.append(result.get_partition_synthesis_score())
 
-        # ---- Phase 3b: Compute inter-partition transition costs ----
-        transition_weight = self.config.setdefault('transition_weight', 1.0)
-        transition_costs = {}
-        if transition_weight > 0:
-            gate_to_part = {}
-            for idx, part in enumerate(allparts):
-                for gate in part:
-                    gate_to_part.setdefault(gate, []).append(idx)
-
-            # Build directed DAG-neighbor partition pairs (pred -> succ)
-            directed_neighbors = set()
-            for gate_u, successors in g.items():
-                for part_u in gate_to_part.get(gate_u, []):
-                    for gate_v in successors:
-                        for part_v in gate_to_part.get(gate_v, []):
-                            if part_u != part_v:
-                                directed_neighbors.add((part_u, part_v))
-
-            # Compute transition cost for each directed pair, keyed by (min, max)
-            seen_pairs = set()
-            for pred_idx, succ_idx in directed_neighbors:
-                pair_key = (min(pred_idx, succ_idx), max(pred_idx, succ_idx))
-                if pair_key in seen_pairs:
-                    continue
-                seen_pairs.add(pair_key)
-                if allparts[pred_idx] & allparts[succ_idx]:
-                    continue
-                result_pred = optimized_results[pred_idx]
-                result_succ = optimized_results[succ_idx]
-                # Compute both directions and take the minimum
-                cost_fwd = self.compute_transition_cost(result_pred, result_succ, D)
-                cost_rev = self.compute_transition_cost(result_succ, result_pred, D)
-                cost = min(cost_fwd, cost_rev)
-                if cost > 0:
-                    transition_costs[pair_key] = cost * transition_weight
-
-        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights, transition_costs=transition_costs)
+        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
         parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
         L = topo_sort_partitions(working_circ, self.config["max_partition_size"], parts)
         from squander.partitioning.kahn import kahn_partition_preparts

From 4dc8e76fd73db4fdb69c9d5eeef5dfce83d0991d Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 3 Mar 2026 23:56:09 +0100
Subject: [PATCH 085/232] Remove hard qubit limit

---
 squander/src-cpp/gates/Gate.cpp        | 9 +++++++--
 squander/src-cpp/gates/Gates_block.cpp | 5 -----
 squander/src-cpp/gates/U3.cpp          | 4 +++-
 squander/src-cpp/gates/X.cpp           | 4 +++-
 squander/src-cpp/gates/Y.cpp           | 4 +++-
 squander/src-cpp/gates/Z.cpp           | 4 +++-
 6 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/squander/src-cpp/gates/Gate.cpp b/squander/src-cpp/gates/Gate.cpp
index b508834fe..2997a6814 100644
--- a/squander/src-cpp/gates/Gate.cpp
+++ b/squander/src-cpp/gates/Gate.cpp
@@ -76,7 +76,9 @@ Gate::Gate(int qbit_num_in) {
     // number of qubits spanning the matrix of the operation
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
     // A string describing the type of the operation
     type = GENERAL_OPERATION;
     // The index of the qubit on which the operation acts (target_qbit >= 0)
@@ -109,7 +111,10 @@ Gate::Gate(int qbit_num_in, const std::vector<int>& target_qbits_in, const std::
     // number of qubits spanning the matrix of the operation
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
+   
     // A string describing the type of the operation
     type = GENERAL_OPERATION;
     // The number of parameters
diff --git a/squander/src-cpp/gates/Gates_block.cpp b/squander/src-cpp/gates/Gates_block.cpp
index 71879aa79..5938747c7 100644
--- a/squander/src-cpp/gates/Gates_block.cpp
+++ b/squander/src-cpp/gates/Gates_block.cpp
@@ -2539,11 +2539,6 @@ void Gates_block::set_min_fusion( int min_fusion ) {
 */
 void Gates_block::set_qbit_num( int qbit_num_in ) {
 
-    if (qbit_num_in > 30) {
-        std::string err("Gates_block::set_qbit_num: Number of qubits supported up to 30"); 
-        throw err;        
-    }
-
     // setting the number of qubits
     Gate::set_qbit_num(qbit_num_in);
 
diff --git a/squander/src-cpp/gates/U3.cpp b/squander/src-cpp/gates/U3.cpp
index efb297984..5d483ea4d 100644
--- a/squander/src-cpp/gates/U3.cpp
+++ b/squander/src-cpp/gates/U3.cpp
@@ -64,7 +64,9 @@ U3::U3(int qbit_num_in, int target_qbit_in) {
     // number of qubits spanning the matrix of the gate
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
     // A string describing the type of the gate
     type = U3_OPERATION;
 
diff --git a/squander/src-cpp/gates/X.cpp b/squander/src-cpp/gates/X.cpp
index a95b90ec6..a5f43caa1 100644
--- a/squander/src-cpp/gates/X.cpp
+++ b/squander/src-cpp/gates/X.cpp
@@ -69,7 +69,9 @@ X::X(int qbit_num_in, int target_qbit_in) {
     // number of qubits spanning the matrix of the gate
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
     // A string describing the type of the gate
     type = X_OPERATION;
 
diff --git a/squander/src-cpp/gates/Y.cpp b/squander/src-cpp/gates/Y.cpp
index 96b8a3ffc..86f2a414a 100644
--- a/squander/src-cpp/gates/Y.cpp
+++ b/squander/src-cpp/gates/Y.cpp
@@ -68,7 +68,9 @@ Y::Y(int qbit_num_in, int target_qbit_in) {
     // number of qubits spanning the matrix of the gate
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
     // A string describing the type of the gate
     type = Y_OPERATION;
 
diff --git a/squander/src-cpp/gates/Z.cpp b/squander/src-cpp/gates/Z.cpp
index 8c5341c67..465f57140 100644
--- a/squander/src-cpp/gates/Z.cpp
+++ b/squander/src-cpp/gates/Z.cpp
@@ -68,7 +68,9 @@ Z::Z(int qbit_num_in, int target_qbit_in) {
     // number of qubits spanning the matrix of the gate
     qbit_num = qbit_num_in;
     // the size of the matrix
-    matrix_size = Power_of_2(qbit_num);
+    if (qbit_num<31){
+        matrix_size = Power_of_2(qbit_num);
+    }
     // A string describing the type of the gate
     type = Z_OPERATION;
 

From 6fef61c88b151e0465f53cb9330eeb42e5ae54fa Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 3 Mar 2026 23:56:55 +0100
Subject: [PATCH 086/232] Switch back to classical forwards backwards initial
 mapping heuristic

---
 squander/synthesis/PartAM.py       | 276 ++++++++++++++++++++---------
 squander/synthesis/PartAM_utils.py |  36 ++--
 squander/synthesis/qgd_SABRE.py    |  10 +-
 3 files changed, 205 insertions(+), 117 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ad267ccff..3f80f6fe2 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -89,6 +89,10 @@ def __init__(self, config):
         self.config.setdefault('bh_stepwise_factor', 0.9)
         self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
         self.config.setdefault('use_osr', 0)
+        self.config.setdefault('n_layout_trials', 1)
+        self.config.setdefault('score_tolerance', 0.05)
+        self.config.setdefault('random_seed', 42)
+        self.config.setdefault('cleanup', True)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -128,45 +132,6 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
         
         return self._topology_cache[canonical_key]
 
-    @staticmethod
-    def _compute_ideal_pi_for_candidate(candidate, N):
-        """
-        Compute the ideal pi_initial such that the given candidate needs zero
-        SWAPs for input routing, plus the resulting pi_output after the partition.
-
-        Returns:
-            pi_initial: np.ndarray — layout where partition qubits are already
-                        at their required physical positions.
-            pi_output:  np.ndarray — layout after the partition circuit (P_o applied).
-        """
-        P_i_inv = [candidate.P_i.index(i) for i in range(len(candidate.P_i))]
-
-        # Required physical position for each partition qubit
-        required = {}
-        for k, v in candidate.qbit_map.items():
-            required[k] = candidate.node_mapping[P_i_inv[v]]
-
-        pi_initial = np.zeros(N, dtype=int)
-        used_physical = set(required.values())
-
-        for k, p in required.items():
-            pi_initial[k] = p
-
-        remaining_physical = sorted(p for p in range(N) if p not in used_physical)
-        remaining_logical = sorted(q for q in range(N) if q not in required)
-        for q, p in zip(remaining_logical, remaining_physical):
-            pi_initial[q] = p
-
-        # Apply P_o to get output permutation (mirrors transform_pi logic)
-        pi_output = np.array(pi_initial, dtype=int)
-        qbit_map_inverse = {v: k for k, v in candidate.qbit_map.items()}
-        for q_star in range(len(candidate.P_o)):
-            if q_star in qbit_map_inverse:
-                k = qbit_map_inverse[q_star]
-                pi_output[k] = candidate.node_mapping[candidate.P_o[q_star]]
-
-        return pi_initial, pi_output
-
     def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[PartitionScoreData]]:
         """
         Create lightweight, picklable views of partitions that contain only the
@@ -337,20 +302,25 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
         qbit_num_orig_circuit = working_circ.get_Qbit_Num()
         gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
+
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
+
         partitioned_circuit = Circuit( qbit_num_orig_circuit )
         params = []
+
         for part in allparts:
             surrounded_chains = {t for s in part for t in go[s] if t in single_qubit_chains_prepost and go[single_qubit_chains_prepost[t][-1]] and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
             gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded_chains))
             #topo sort part + surrounded chains
             c = Circuit( qbit_num_orig_circuit )
+
             for gate_idx in _get_topo_order({x: go[x] & gates for x in gates}, {x: rgo[x] & gates for x in gates}):
                 c.add_Gate( gate_dict[gate_idx] )
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
                 params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+
             partitioned_circuit.add_Circuit(c)
         # Only add single-qubit chains as separate partitions if minimum_partition_size allows it
         for chain in single_qubit_chains:
@@ -433,6 +403,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
     def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         N = circ.get_Qbit_Num()
 
+
         optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
 
         for partition in optimized_partitions:
@@ -443,42 +414,114 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions)
 
         D = self.compute_distances_bfs(N)
-        pi = np.arange(N)
-
-        F = self.get_initial_layer(IDAG, N, optimized_partitions)
         scoring_partitions = self._build_scoring_partitions(optimized_partitions)
 
-        partition_order, pi, pi_initial = self.Heuristic_Search(F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, free_initial_routing=True)
+        n_iterations = self.config.get('sabre_iterations', 1)
+        n_trials = self.config.get('n_layout_trials', 1)
+        random_seed = self.config.get('random_seed', 42)
+
+        if n_iterations == 0:
+            # Single forward pass from identity layout
+            F = self.get_initial_layer(IDAG, N, optimized_partitions)
+            partition_order, pi, pi_initial = self.Heuristic_Search(
+                F, pi=np.arange(N), DAG=DAG, IDAG=IDAG,
+                optimized_partitions=optimized_partitions,
+                scoring_partitions=scoring_partitions, D=D,
+            )
+        else:
+            best_pi = None
+            best_cost = float('inf')
+
+            for trial in range(max(1, n_trials)):
+                rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
+                pi = np.arange(N)
+
+                for iteration in range(n_iterations):
+                    # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
+                    F_rev = self.get_final_layer(DAG, N, optimized_partitions)
+                    pi, _ = self._heuristic_search_layout_only(
+                        F_rev, pi, IDAG, DAG, optimized_partitions, scoring_partitions, D,
+                        rng=rng,
+                    )
+
+                    # Forward layout-only pass (skip on last iteration — real pass follows)
+                    if iteration < n_iterations - 1:
+                        F_fwd = self.get_initial_layer(IDAG, N, optimized_partitions)
+                        pi, _ = self._heuristic_search_layout_only(
+                            F_fwd, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                            rng=rng,
+                        )
+
+                # Score this trial: deterministic forward layout-only pass
+                F_eval = self.get_initial_layer(IDAG, N, optimized_partitions)
+                _, cost = self._heuristic_search_layout_only(
+                    F_eval, pi.copy(), DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                    rng=None,
+                )
+
+                if cost < best_cost:
+                    best_cost = cost
+                    best_pi = pi.copy()
+
+            # Final forward pass — builds actual circuits
+            F = self.get_initial_layer(IDAG, N, optimized_partitions)
+            partition_order, pi, pi_initial = self.Heuristic_Search(
+                F, best_pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+            )
 
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
 
+        self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get('CNOT', 0)
+
+        # Cleanup phase: re-partition and resynthesize to eliminate
+        # redundancies at SWAP-partition boundaries
+        if self.config.get('cleanup', True):
+            from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
+            cleanup_config = dict(self.config)
+            cleanup_config['topology'] = self.topology
+            cleanup_config['routed'] = True
+            cleanup_config['test_subcircuits'] = False
+            cleanup_config['test_final_circuit'] = False
+            wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+            final_circuit, final_parameters = wco.OptimizeWideCircuit(
+                final_circuit.get_Flat_Circuit(), final_parameters
+            )
+
         return final_circuit, final_parameters, pi_initial, pi
 
     # ------------------------------------------------------------------------
     # Heuristic Search
     # ------------------------------------------------------------------------
 
-    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, free_initial_routing=False):
+    def _select_best_candidate(self, partition_candidates, scores, rng=None):
+        """Select best candidate, with optional stochastic tie-breaking."""
+        scores_array = np.array(scores)
+        min_score = np.min(scores_array)
+        tolerance = self.config.get('score_tolerance', 0.05)
+
+        if rng is not None and min_score > 0:
+            threshold = min_score * (1 + tolerance)
+            close_indices = np.where(scores_array <= threshold)[0]
+            if len(close_indices) > 1:
+                return partition_candidates[rng.choice(close_indices)]
+            return partition_candidates[close_indices[0]]
+        else:
+            return partition_candidates[np.argmin(scores_array)]
+
+    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D):
         pi_initial = pi.copy()
 
         resolved_partitions = [False] * len(DAG)
         partition_order = []
         step = 0
-        first_routing_done = not free_initial_routing
-        buffered_single_qubit = []
 
         for partition_idx in list(F):
             if isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
                 F.remove(partition_idx)
                 single_qubit_part = optimized_partitions[partition_idx]
-
-                if free_initial_routing and not first_routing_done:
-                    # Buffer — will remap after pi_initial is determined
-                    buffered_single_qubit.append(single_qubit_part)
-                else:
-                    qubit = single_qubit_part.circuit.get_Qbits()[0]
-                    single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
-                    partition_order.append(single_qubit_part)
+                qubit = single_qubit_part.circuit.get_Qbits()[0]
+                single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
+                partition_order.append(single_qubit_part)
 
                 resolved_partitions[partition_idx] = True
                 children = list(DAG[partition_idx])
@@ -490,7 +533,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                     if parents_resolved:
                         F.append(child)
 
-
         # Initialize progress bar
         total_partitions = len(DAG)
         pbar = tqdm(total=total_partitions, desc="Heuristic Search",
@@ -507,7 +549,6 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 if len(partition_candidates) == 0:
                     break
                 F_snapshot = tuple(F)
-                use_free_routing = not first_routing_done
 
                 E = self.generate_extended_set(
                     F, DAG, IDAG, resolved_partitions, optimized_partitions,
@@ -522,15 +563,13 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             scoring_partitions,
                             D,
                             self._swap_cache,
-                            free_routing=use_free_routing,
                             E=E,
                             W=E_W,
                             alpha=E_alpha,
                         )
                         for partition_candidate in partition_candidates
                     ]
-                min_idx = np.argmin(scores)
-                min_partition_candidate = partition_candidates[min_idx]
+                min_partition_candidate = self._select_best_candidate(partition_candidates, scores, rng=None)
 
                 F.remove(min_partition_candidate.partition_idx)
                 resolved_partitions[min_partition_candidate.partition_idx] = True
@@ -538,23 +577,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 pbar.n = resolved_count
                 pbar.refresh()
 
-                if not first_routing_done:
-                    # Derive pi_initial from chosen candidate — no SWAPs needed
-                    pi_initial, pi = self._compute_ideal_pi_for_candidate(
-                        min_partition_candidate, len(pi)
-                    )
-                    first_routing_done = True
-
-                    # Remap and insert buffered single-qubit partitions
-                    for sq_part in buffered_single_qubit:
-                        qubit = sq_part.circuit.get_Qbits()[0]
-                        sq_part.circuit.Remap_Qbits({int(qubit): int(pi_initial[qubit])}, max(D.shape))
-                        partition_order.append(sq_part)
-                    buffered_single_qubit = []
-                else:
-                    swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
-                    if len(swap_order)!=0:
-                        partition_order.append(construct_swap_circuit(swap_order, len(pi)))
+                swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
+                if len(swap_order)!=0:
+                    partition_order.append(construct_swap_circuit(swap_order, len(pi)))
 
                 partition_order.append(min_partition_candidate)
                 children = list(DAG[min_partition_candidate.partition_idx])
@@ -578,16 +603,81 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                         else:
                             F.append(child)
 
-        # If no multi-qubit partition was resolved, flush buffered single-qubit parts
-        if buffered_single_qubit:
-            for sq_part in buffered_single_qubit:
-                qubit = sq_part.circuit.get_Qbits()[0]
-                sq_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])}, max(D.shape))
-                partition_order.append(sq_part)
-
         pbar.close()
         return partition_order, pi, pi_initial
 
+    def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, rng=None):
+        """Run heuristic search but only track layout (pi). No circuit modification.
+
+        Returns:
+            (pi, total_swaps): final layout and total number of SWAPs accumulated.
+        """
+        resolved_partitions = [False] * len(DAG)
+        total_swaps = 0
+
+        # Resolve initial single-qubit partitions
+        for partition_idx in list(F):
+            if isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
+                F.remove(partition_idx)
+                resolved_partitions[partition_idx] = True
+                for child in DAG[partition_idx]:
+                    if all(resolved_partitions[p] for p in IDAG[child]):
+                        F.append(child)
+
+        max_E_size = self.config.get('max_E_size', 20)
+        max_lookahead = self.config.get('max_lookahead', 4)
+        E_W = self.config.get('E_weight', 0.5)
+        E_alpha = self.config.get('E_alpha', 0.9)
+
+        while F:
+            partition_candidates = self.obtain_partition_candidates(F, optimized_partitions)
+            if not partition_candidates:
+                break
+
+            F_snapshot = tuple(F)
+
+            E = self.generate_extended_set(
+                F, DAG, IDAG, resolved_partitions, optimized_partitions,
+                max_E_size=max_E_size, max_lookahead=max_lookahead
+            )
+
+            scores = [
+                self.score_partition_candidate(
+                    pc, F_snapshot, pi, scoring_partitions, D,
+                    self._swap_cache,
+                    E=E, W=E_W, alpha=E_alpha,
+                )
+                for pc in partition_candidates
+            ]
+
+            best = self._select_best_candidate(partition_candidates, scores, rng=rng)
+            F.remove(best.partition_idx)
+            resolved_partitions[best.partition_idx] = True
+
+            swaps, pi = best.transform_pi(pi, D, self._swap_cache)
+            total_swaps += len(swaps)
+
+            # Promote children
+            for child in DAG[best.partition_idx]:
+                if not resolved_partitions[child] and child not in F:
+                    if all(resolved_partitions[p] for p in IDAG[child]):
+                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                            resolved_partitions[child] = True
+                            stack = list(DAG[child])
+                            while stack:
+                                gc = stack.pop()
+                                if not resolved_partitions[gc] and gc not in F:
+                                    if all(resolved_partitions[p] for p in IDAG[gc]):
+                                        if isinstance(optimized_partitions[gc], SingleQubitPartitionResult):
+                                            resolved_partitions[gc] = True
+                                            stack.extend(DAG[gc])
+                                        else:
+                                            F.append(gc)
+                        else:
+                            F.append(child)
+
+        return pi, total_swaps
+
     # ------------------------------------------------------------------------
     # Circuit Construction
     # ------------------------------------------------------------------------
@@ -626,12 +716,11 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
 
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
-                                  free_routing=False, E=None, W=0.5, alpha=0.9):
+                                  E=None, W=0.5, alpha=0.9):
         score = 0
         swap_weight = 1
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, free_routing=free_routing)
-        if not free_routing:
-            score += swap_weight * len(swaps) * 3
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
+        score += swap_weight * len(swaps) * 3
         score += 0.1*len(partition_candidate.circuit_structure)
 
         for partition_idx in F:
@@ -772,6 +861,19 @@ def get_initial_layer(self, IDAG, N, optimized_partitions):
             if len(active_qbits) == 0:
                 break
         return initial_layer
+
+    def get_final_layer(self, DAG, N, optimized_partitions):
+        final_layer = []
+        active_qbits = list(range(N))
+        for idx in range(len(DAG) - 1, -1, -1):
+            if len(DAG[idx]) == 0:
+                final_layer.append(idx)
+                for qbit in optimized_partitions[idx].involved_qbits:
+                    if qbit in active_qbits:
+                        active_qbits.remove(qbit)
+            if len(active_qbits) == 0:
+                break
+        return final_layer
             
     def construct_DAG_and_IDAG(self, optimized_partitions):
         DAG = []
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index a6cfa3e52..c6ed98db0 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -396,7 +396,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi(self, pi, D, swap_cache=None, free_routing=False):
+    def transform_pi(self, pi, D, swap_cache=None):
         # Fixed: Use P_i^{-1} instead of P_i for input routing
         # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
         # For Original to see logical qubit q* at partition position q*, we need:
@@ -409,32 +409,18 @@ def transform_pi(self, pi, D, swap_cache=None, free_routing=False):
         pi_list = [int(x) for x in pi]
         n = len(pi_list)
 
-        if free_routing:
-            # Routing is free: build the ideal pi_init that places partition qubits
-            # at their target physical positions with zero SWAPs, then assign the
-            # remaining virtual qubits to the remaining physical positions.
-            used_physical = set(qbit_map_input.values())
-            pi_init = [0] * n
-            for k, target_P in qbit_map_input.items():
-                pi_init[k] = target_P
-            remaining_physical = sorted(p for p in range(n) if p not in used_physical)
-            remaining_logical  = sorted(q for q in range(n) if q not in qbit_map_input)
-            for q, p in zip(remaining_logical, remaining_physical):
-                pi_init[q] = p
-            swaps = []
-        else:
-            # Check cache if provided
-            if swap_cache is not None:
-                pi_tuple = tuple(pi_list)
-                qbit_map_frozen = frozenset(qbit_map_input.items())
-                cache_key = (pi_tuple, qbit_map_frozen)
-                if cache_key in swap_cache:
-                    swaps, pi_init = swap_cache[cache_key]
-                else:
-                    swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
-                    swap_cache[cache_key] = (swaps, pi_init)
+        # Check cache if provided
+        if swap_cache is not None:
+            pi_tuple = tuple(pi_list)
+            qbit_map_frozen = frozenset(qbit_map_input.items())
+            cache_key = (pi_tuple, qbit_map_frozen)
+            if cache_key in swap_cache:
+                swaps, pi_init = swap_cache[cache_key]
             else:
                 swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+                swap_cache[cache_key] = (swaps, pi_init)
+        else:
+            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
 
         pi_output = pi_init.copy()
         # Fixed: P_o should be indexed by partition virtual index q*, not physical index Q*
diff --git a/squander/synthesis/qgd_SABRE.py b/squander/synthesis/qgd_SABRE.py
index 8b91c2df2..e868fc42a 100644
--- a/squander/synthesis/qgd_SABRE.py
+++ b/squander/synthesis/qgd_SABRE.py
@@ -54,11 +54,11 @@ def _compute_smart_initial_layout(self, circuit):
         gates = circuit.get_Gates()
         
         for gate in gates:
-            if gate.get_Control_Qbit() != -1:
-                q1 = gate.get_Target_Qbit()
-                q2 = gate.get_Control_Qbit()
-                if q1 < self.circuit_qbit_num and q2 < self.circuit_qbit_num:
-                    key = (min(q1, q2), max(q1, q2))
+            q_control = gate.get_Control_Qbit()
+            if q_control != -1:
+                q_target = gate.get_Target_Qbit()
+                if q_target < self.circuit_qbit_num and q_control < self.circuit_qbit_num:
+                    key = (min(q_target, q_control), max(q_target, q_control))
                     interaction_count[key] += 1
         
         if not interaction_count:

From 07523d1d857e51b8dc8bce88d96db36f06708598 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 8 Mar 2026 09:46:53 +0100
Subject: [PATCH 087/232] Add routing time benchmarking tool

---
 squander/synthesis/PartAM.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3f80f6fe2..a7b644406 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -17,7 +17,7 @@
 )
 
 import numpy as np
-
+import time 
 from typing import Dict, List, Optional, Tuple
 from dataclasses import dataclass
 
@@ -389,7 +389,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
-                optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Full, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+                optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
@@ -419,7 +419,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         n_iterations = self.config.get('sabre_iterations', 1)
         n_trials = self.config.get('n_layout_trials', 1)
         random_seed = self.config.get('random_seed', 42)
-
+        routing_start = time.time()
         if n_iterations == 0:
             # Single forward pass from identity layout
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
@@ -470,7 +470,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
             )
 
         final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
-
+        self._routing_time = time.time() - routing_start
         self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get('CNOT', 0)
 
         # Cleanup phase: re-partition and resynthesize to eliminate

From d0aa53687f653e9f27b43328c24b5b870e647926 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 8 Mar 2026 09:47:11 +0100
Subject: [PATCH 088/232] modify gitignore

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index ba72e661e..5073a3243 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,5 @@ _skbuild/*
 squander.egg-info/*
 costfuncs_and_entropy.txt
 debugfile.txt
+
+.vscode/settings.json

From 4992d196905fecdd091e27f7da35bd67806128e8 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 9 Mar 2026 10:03:35 +0100
Subject: [PATCH 089/232] Fix routing logic

---
 squander/synthesis/PartAM.py       | 22 ++++++++++++++------
 squander/synthesis/PartAM_utils.py | 32 ++++++++++++++++++------------
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index a7b644406..795e6f486 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -442,6 +442,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                     pi, _ = self._heuristic_search_layout_only(
                         F_rev, pi, IDAG, DAG, optimized_partitions, scoring_partitions, D,
                         rng=rng,
+                        reverse=True,
                     )
 
                     # Forward layout-only pass (skip on last iteration — real pass follows)
@@ -606,9 +607,13 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         pbar.close()
         return partition_order, pi, pi_initial
 
-    def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, rng=None):
+    def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, rng=None, reverse=False):
         """Run heuristic search but only track layout (pi). No circuit modification.
 
+        Args:
+            reverse: When True, swap P_i/P_o roles in scoring and layout
+                     updates (used for backward passes in SABRE iterations).
+
         Returns:
             (pi, total_swaps): final layout and total number of SWAPs accumulated.
         """
@@ -646,6 +651,7 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
                     pc, F_snapshot, pi, scoring_partitions, D,
                     self._swap_cache,
                     E=E, W=E_W, alpha=E_alpha,
+                    reverse=reverse,
                 )
                 for pc in partition_candidates
             ]
@@ -654,7 +660,7 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
-            swaps, pi = best.transform_pi(pi, D, self._swap_cache)
+            swaps, pi = best.transform_pi(pi, D, self._swap_cache, reverse=reverse)
             total_swaps += len(swaps)
 
             # Promote children
@@ -716,10 +722,10 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
 
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
-                                  E=None, W=0.5, alpha=0.9):
+                                  E=None, W=0.5, alpha=0.9, reverse=False):
         score = 0
         swap_weight = 1
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, reverse=reverse)
         score += swap_weight * len(swaps) * 3
         score += 0.1*len(partition_candidate.circuit_structure)
 
@@ -732,9 +738,12 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             for tdx, mini_topology in enumerate(partition.mini_topologies):
                 for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
                     cnot_count = len(partition.circuit_structures[tdx][pdx])
+                    # In reverse pass, the "entry" side of neighbor partitions
+                    # is their output (P_o), not their input (P_i).
+                    P_route = P_o if reverse else P_i
                     if mini_topology:
                         routing_cost = swap_weight * 3 * sum(
-                            max(0, D[int(output_perm[qbit_map_inv[P_i[u]]])][int(output_perm[qbit_map_inv[P_i[v]]])] - 1)
+                            max(0, D[int(output_perm[qbit_map_inv[P_route[u]]])][int(output_perm[qbit_map_inv[P_route[v]]])] - 1)
                             for u, v in mini_topology
                         )
                     else:
@@ -755,9 +764,10 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 for tdx, mini_topology in enumerate(partition.mini_topologies):
                     for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
                         cnot_count = len(partition.circuit_structures[tdx][pdx])
+                        P_route = P_o if reverse else P_i
                         if mini_topology:
                             routing_cost = swap_weight * 3 * sum(
-                                max(0, D[int(output_perm[qbit_map_inv[P_i[u]]])][int(output_perm[qbit_map_inv[P_i[v]]])] - 1)
+                                max(0, D[int(output_perm[qbit_map_inv[P_route[u]]])][int(output_perm[qbit_map_inv[P_route[v]]])] - 1)
                                 for u, v in mini_topology
                             )
                         else:
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index c6ed98db0..cd6995a96 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -396,15 +396,24 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi(self, pi, D, swap_cache=None):
-        # Fixed: Use P_i^{-1} instead of P_i for input routing
+    def transform_pi(self, pi, D, swap_cache=None, reverse=False):
         # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
-        # For Original to see logical qubit q* at partition position q*, we need:
-        # - After P_i, position q* should have logical qubit q*'s data
-        # - Before P_i (= input to S), position P_i^{-1}[q*] should have logical qubit q*'s data
-        # So we route logical qubit k (with qbit_map[k] = q*) to partition position P_i^{-1}[q*]
-        P_i_inv = [self.P_i.index(i) for i in range(len(self.P_i))]  # Compute inverse
-        qbit_map_input = {k : self.node_mapping[P_i_inv[v]] for k,v in self.qbit_map.items()}
+        #
+        # Forward (reverse=False):
+        #   Route qubits to input positions derived from P_i_inv, then
+        #   update pi to output positions derived from P_o.
+        #
+        # Reverse (reverse=True):
+        #   We traverse the partition backwards, so the "entry" is the output
+        #   side and the "exit" is the input side.  Swap P_i <-> P_o roles.
+        if not reverse:
+            P_route_inv = [self.P_i.index(i) for i in range(len(self.P_i))]
+            P_exit = self.P_o
+        else:
+            P_route_inv = [self.P_o.index(i) for i in range(len(self.P_o))]
+            P_exit = self.P_i
+
+        qbit_map_input = {k : self.node_mapping[P_route_inv[v]] for k,v in self.qbit_map.items()}
         # Convert pi to plain Python list of ints (may contain np.int64)
         pi_list = [int(x) for x in pi]
         n = len(pi_list)
@@ -423,14 +432,11 @@ def transform_pi(self, pi, D, swap_cache=None):
             swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
 
         pi_output = pi_init.copy()
-        # Fixed: P_o should be indexed by partition virtual index q*, not physical index Q*
-        # After the circuit, logical qubit k with qbit_map[k] = q* ends up at 
-        # physical position node_mapping[P_o[q*]]
         qbit_map_inverse = {v: k for k, v in self.qbit_map.items()}
-        for q_star in range(len(self.P_o)):
+        for q_star in range(len(P_exit)):
             if q_star in qbit_map_inverse:
                 k = qbit_map_inverse[q_star]
-                pi_output[k] = self.node_mapping[self.P_o[q_star]]
+                pi_output[k] = self.node_mapping[P_exit[q_star]]
         return swaps, pi_output
     
     def estimate_swap_count(self, pi, D) -> int:

From 0cab2adeb806bc6167b3ee0e22d18f241754c1ff Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 9 Mar 2026 11:03:04 +0100
Subject: [PATCH 090/232] Try to remove second synthesis

---
 examples/decomposition/PartAM_example.py | 112 +++++++++++------------
 squander/synthesis/PartAM.py             |  71 ++++++++------
 2 files changed, 96 insertions(+), 87 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 42d9affb5..428872db4 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -17,16 +17,16 @@
 
 @author: Peter Rakyta, Ph.D.
 """
-## \file wide_circuit_optimization.py
-## \brief Simple example python code demonstrating a wide circuit optimization
+## \file PartAM_example.py
+## \brief Example demonstrating Partition Aware Mapping
 
-import squander.decomposition.qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization
 from squander import Partition_Aware_Mapping
 from squander import utils
-from squander import Qiskit_IO
-import time
 from squander import Circuit
 import numpy as np
+import time
+
+
 def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
     """Validate decomposition by applying both circuits to a random state."""
     num_qubits = circ.get_Qbit_Num()
@@ -54,72 +54,71 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
     return state_error, circ_Final
 
 
+def run_and_report(label, config, circ_orig, parameters_orig):
+    """Run PartAM with the given config and print results."""
+    print(f"\n{'='*70}")
+    print(label)
+    print(f"{'='*70}")
+
+    start_time = time.time()
+    pam = Partition_Aware_Mapping(config)
+    circ, params, input_perm, output_perm = pam.Partition_Aware_Mapping(circ_orig, parameters_orig)
+    elapsed = time.time() - start_time
+
+    error, circ_final = validate_result(
+        circ_orig, parameters_orig, circ, params, input_perm, output_perm
+    )
+    print(f"Decomposition error: {error:.10f}")
+    print(f"Gate counts: {circ_final.get_Gate_Nums()}")
+    print(f"Time: {elapsed:.2f}s")
+    return error, elapsed
+
+
 if __name__ == '__main__':
 
     filename = "bv_n14.qasm"
     circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
-    topology = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12),(12, 13)]
+    topology = [
+        (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
+        (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13),
+    ]
+
+    results = {}
 
     # ================================================================
-    # Full-circuit mode (default, window_size=0)
+    # Default: single forward pass (sabre_iterations=0)
     # ================================================================
-    print(f"\n{'='*70}")
-    print("Full-circuit mode (window_size=0)")
-    print(f"{'='*70}")
-
-    config_full = {
+    results['default'] = run_and_report("Default (single forward pass)", {
         'strategy': "TreeSearch",
-        'test_subcircuits': True,
-        'test_final_circuit': True,
         'max_partition_size': 4,
         'progressbar': True,
         'topology': topology,
-    }
-
-    start_time = time.time()
-    pam_full = Partition_Aware_Mapping(config_full)
-    circ_full, params_full, input_perm_full, output_perm_full = \
-        pam_full.Partition_Aware_Mapping(circ_orig, parameters_orig)
-    elapsed_full = time.time() - start_time
-
-    error_full, circ_final_full = validate_result(
-        circ_orig, parameters_orig,
-        circ_full, params_full, input_perm_full, output_perm_full
-    )
-    print(f"Decomposition error: {error_full:.10f}")
-    print(f"Gate counts: {circ_final_full.get_Gate_Nums()}")
-    print(f"Time: {elapsed_full:.2f}s")
+        'sabre_iterations': 0,
+    }, circ_orig, parameters_orig)
 
     # ================================================================
-    # Windowed mode (window_size=3)
+    # SABRE-style layout refinement (sabre_iterations=3)
     # ================================================================
-    print(f"\n{'='*70}")
-    print("Windowed mode (window_size=3)")
-    print(f"{'='*70}")
-
-    config_windowed = {
+    results['sabre'] = run_and_report("SABRE iterations=3", {
         'strategy': "TreeSearch",
-        'test_subcircuits': True,
-        'test_final_circuit': True,
         'max_partition_size': 4,
         'progressbar': True,
         'topology': topology,
-        'window_size': 7,
-    }
+        'sabre_iterations': 3,
+    }, circ_orig, parameters_orig)
 
-    start_time = time.time()
-    pam_windowed = Partition_Aware_Mapping(config_windowed)
-    circ_win, params_win, input_perm_win, output_perm_win = \
-        pam_windowed.Partition_Aware_Mapping(circ_orig, parameters_orig)
-    elapsed_win = time.time() - start_time
-
-    error_win, circ_final_win = validate_result(
-        circ_orig, parameters_orig,
-        circ_win, params_win, input_perm_win, output_perm_win
-    )
-    print(f"Decomposition error: {error_win:.10f}")
-    print(f"Gate counts: {circ_final_win.get_Gate_Nums()}")
-    print(f"Time: {elapsed_win:.2f}s")
+    # ================================================================
+    # Multiple layout trials with SABRE iterations
+    # ================================================================
+    results['trials'] = run_and_report("SABRE iterations=3, layout trials=5", {
+        'strategy': "TreeSearch",
+        'max_partition_size': 4,
+        'progressbar': True,
+        'topology': topology,
+        'sabre_iterations': 3,
+        'n_layout_trials': 5,
+        'random_seed': 42,
+    }, circ_orig, parameters_orig)
 
     # ================================================================
     # Summary
@@ -127,10 +126,7 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
     print(f"\n{'='*70}")
     print("Summary")
     print(f"{'='*70}")
-    print(f"{'Mode':<20} {'Error':<20} {'Time':<10}")
-    print(f"{'Full circuit':<20} {error_full:<20.10f} {elapsed_full:<10.2f}s")
-    print(f"{'Windowed (K=3)':<20} {error_win:<20.10f} {elapsed_win:<10.2f}s")
+    print(f"{'Mode':<40} {'Error':<20} {'Time':<10}")
+    for label, (error, elapsed) in results.items():
+        print(f"{label:<40} {error:<20.10f} {elapsed:<10.2f}s")
     print(f"{'='*70}\n")
-
-
-
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 795e6f486..8be14e0a1 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -13,7 +13,6 @@
     _get_topo_order,
     topo_sort_partitions,
     ilp_global_optimal,
-    recombine_single_qubit_chains,
 )
 
 import numpy as np
@@ -364,35 +363,49 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 weights.append(result.get_partition_synthesis_score())
 
         L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-        parts = recombine_single_qubit_chains(go, rgo, single_qubit_chains, gate_to_tqubit, [allparts[i] for i in L_parts], fusion_info)
-        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], parts)
-        from squander.partitioning.kahn import kahn_partition_preparts
-        from squander.partitioning.tools import translate_param_order
-        partitioned_circuit, param_order, _ = kahn_partition_preparts(working_circ, self.config["max_partition_size"], [parts[i] for i in L])
-        parameters = translate_param_order(working_parameters, param_order)
-
-        # ---- Phase 4: Stage 2 synthesis (Full) ----
-        subcircuits = partitioned_circuit.get_Gates()
-        optimized_partitions = [None] * len(subcircuits)
-
-        with Pool(processes=mp.cpu_count()) as pool:
-            for partition_idx, subcircuit in enumerate( subcircuits ):
-
-                start_idx = subcircuit.get_Parameter_Start_Index()
-                end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
-                subcircuit_parameters = parameters[ start_idx:end_idx ]
-                involved_qbits = subcircuit.get_Qbits()
 
-                qbit_num_sub = len( involved_qbits )
-                mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
-                qbit_map = {}
-                for idx in range( len(involved_qbits) ):
-                    qbit_map[ involved_qbits[idx] ] = idx
-                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
-                optimized_partitions[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
-
-            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="Second Synthesis",disable=self.config.get('progressbar', 0) == False) ):
-                optimized_partitions[partition_idx] = optimized_partitions[partition_idx].get()
+        # ---- Phase 4: Reuse Phase 2 results (no re-synthesis) ----
+        # Build non-overlapping parts from selected allparts + standalone chains.
+        # Phase 2 already synthesized each allpart (with surrounded chains included),
+        # so we reuse those results directly.
+        selected_surrounded_starts = set()
+        selected_parts_gates = []
+        for i in L_parts:
+            part = allparts[i]
+            surrounded = {t for s in part for t in go[s]
+                         if t in single_qubit_chains_prepost
+                         and go[single_qubit_chains_prepost[t][-1]]
+                         and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
+            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded))
+            selected_parts_gates.append(gates)
+            selected_surrounded_starts.update(surrounded)
+
+        # Non-surrounded chains become standalone SingleQubitPartitionResult entries
+        standalone_chains = []
+        for chain in single_qubit_chains:
+            if chain[0] not in selected_surrounded_starts:
+                selected_parts_gates.append(frozenset(chain))
+                standalone_chains.append(chain)
+
+        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], selected_parts_gates)
+
+        n_selected = len(L_parts)
+        optimized_partitions = []
+        for part_idx in L:
+            if part_idx < n_selected:
+                # Multi-qubit partition — reuse Phase 2 PartitionSynthesisResult
+                optimized_partitions.append(optimized_results[L_parts[part_idx]])
+            else:
+                # Standalone single-qubit chain
+                chain = standalone_chains[part_idx - n_selected]
+                c = Circuit(qbit_num_orig_circuit)
+                chain_params = []
+                for gate_idx in chain:
+                    c.add_Gate(gate_dict[gate_idx])
+                    start = gate_dict[gate_idx].get_Parameter_Start_Index()
+                    chain_params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+                chain_parameters = np.concatenate(chain_params) if chain_params else np.array([])
+                optimized_partitions.append(SingleQubitPartitionResult(c, chain_parameters))
 
         return optimized_partitions
 

From 51503705e63a2bc68e5bcf1ba38316edfc91ac50 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 9 Mar 2026 18:16:51 +0100
Subject: [PATCH 091/232] Fix reversal traversal single qubit logci

---
 squander/synthesis/PartAM.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 8be14e0a1..740623e67 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -534,7 +534,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 F.remove(partition_idx)
                 single_qubit_part = optimized_partitions[partition_idx]
                 qubit = single_qubit_part.circuit.get_Qbits()[0]
-                single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
+                single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
                 partition_order.append(single_qubit_part)
 
                 resolved_partitions[partition_idx] = True
@@ -607,7 +607,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                         if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
                             child_partition = optimized_partitions[child]
                             qubit = child_partition.circuit.get_Qbits()[0]
-                            child_partition.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
+                            child_partition.circuit = child_partition.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
                             partition_order.append(child_partition)
                             resolved_partitions[child] = True
                             resolved_count = sum(resolved_partitions)

From 7e5ffda0db6b99843792c2161b1cc82c6e7d11b0 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 10 Mar 2026 07:33:50 +0100
Subject: [PATCH 092/232] paritition automorphism

---
 examples/decomposition/PartAM_example.py |   6 +-
 squander/synthesis/PartAM.py             | 124 +++++++++++++++++++++--
 squander/synthesis/PartAM_utils.py       |  56 ++++++++++
 3 files changed, 172 insertions(+), 14 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index 428872db4..b3e1482a0 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -90,7 +90,7 @@ def run_and_report(label, config, circ_orig, parameters_orig):
     # ================================================================
     results['default'] = run_and_report("Default (single forward pass)", {
         'strategy': "TreeSearch",
-        'max_partition_size': 4,
+        'max_partition_size': 3,
         'progressbar': True,
         'topology': topology,
         'sabre_iterations': 0,
@@ -101,7 +101,7 @@ def run_and_report(label, config, circ_orig, parameters_orig):
     # ================================================================
     results['sabre'] = run_and_report("SABRE iterations=3", {
         'strategy': "TreeSearch",
-        'max_partition_size': 4,
+        'max_partition_size': 3,
         'progressbar': True,
         'topology': topology,
         'sabre_iterations': 3,
@@ -112,7 +112,7 @@ def run_and_report(label, config, circ_orig, parameters_orig):
     # ================================================================
     results['trials'] = run_and_report("SABRE iterations=3, layout trials=5", {
         'strategy': "TreeSearch",
-        'max_partition_size': 4,
+        'max_partition_size': 3,
         'progressbar': True,
         'topology': topology,
         'sabre_iterations': 3,
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 740623e67..4dce5a13d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -16,7 +16,8 @@
 )
 
 import numpy as np
-import time 
+import math
+import time
 from typing import Dict, List, Optional, Tuple
 from dataclasses import dataclass
 
@@ -32,6 +33,8 @@
     get_unique_subtopologies,
     get_canonical_form,
     get_node_mapping,
+    compute_automorphisms,
+    derive_result_from_automorphism,
     SingleQubitPartitionResult,
     PartitionSynthesisResult,
     PartitionCandidate,
@@ -186,7 +189,7 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
         """
-        Call to decompose a partition sequentially
+        Call to decompose a partition sequentially (no automorphism optimization).
         """
         N = Partition_circuit.get_Qbit_Num()
         if N !=1:
@@ -198,18 +201,18 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
                 P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
                 for P_i in perumations_all:
                     Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(list(P_i))  # Must convert tuple to list
+                    Partition_circuit_tmp.add_Permutation(list(P_i))
                     Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(list(P_o_initial))  # Must convert tuple to list
+                    Partition_circuit_tmp.add_Permutation(list(P_o_initial))
                     synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
                     result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
 
                 P_i_best, _ = result.get_best_result(topology_idx)[0]
                 for P_o in perumations_all:
                     Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(list(P_i_best))  # Must convert tuple to list
+                    Partition_circuit_tmp.add_Permutation(list(P_i_best))
                     Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(list(P_o))  # Must convert tuple to list
+                    Partition_circuit_tmp.add_Permutation(list(P_o))
                     synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
                     result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
         else:
@@ -219,27 +222,125 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
     @staticmethod
     def DecomposePartition_Full(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
         """
-        Call to decompose a partition sequentially
+        Call to decompose a partition exhaustively over all (P_i, P_o) pairs,
+        exploiting topology automorphisms to skip equivalent decompositions.
         """
         N = Partition_circuit.get_Qbit_Num()
         if N !=1:
             permutations_all = list(permutations(range(N)))
             result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
-            # Sequential permutation search
+            identity = tuple(range(N))
             for topology_idx in range(len(topologies)):
                 mini_topology = topologies[topology_idx]
+                auts = compute_automorphisms(mini_topology)
+                known_pairs = set()
                 for P_i in permutations_all:
                     for P_o in permutations_all:
+                        if (P_i, P_o) in known_pairs:
+                            continue
                         Partition_circuit_tmp = Circuit(N)
-                        Partition_circuit_tmp.add_Permutation(list(P_i))  # Must convert tuple to list
+                        Partition_circuit_tmp.add_Permutation(list(P_i))
                         Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                        Partition_circuit_tmp.add_Permutation(list(P_o))  # Must convert tuple to list
+                        Partition_circuit_tmp.add_Permutation(list(P_o))
                         synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
                         result.add_result((P_i, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
+                        known_pairs.add((P_i, P_o))
+                        for sigma in auts:
+                            if sigma == identity:
+                                continue
+                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o, synthesised_circuit, synthesised_parameters, N)
+                            if (new_P_i, new_P_o) not in known_pairs:
+                                result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
+                                known_pairs.add((new_P_i, new_P_o))
         else:
             result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
         return result
 
+    @staticmethod
+    def DecomposePartition_Auto(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
+        """
+        Auto-select between Sequential and orbit-reduced Full mode per topology.
+        Uses Full mode when |Aut(T)| > N!/2 (cheaper than Sequential), otherwise
+        uses Sequential with automorphism derivation.
+        """
+        N = Partition_circuit.get_Qbit_Num()
+        if N == 1:
+            return SingleQubitPartitionResult(Partition_circuit, Partition_parameters)
+
+        perumations_all = list(permutations(range(N)))
+        result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
+        identity = tuple(range(N))
+        factorial_N = math.factorial(N)
+
+        for topology_idx in range(len(topologies)):
+            mini_topology = topologies[topology_idx]
+            auts = compute_automorphisms(mini_topology)
+            aut_size = len(auts)
+            known_pairs = set()
+
+            # Choose mode: Full with orbits when |Aut(T)| > N!/2
+            use_full = (aut_size > factorial_N // 2)
+
+            if use_full:
+                # Orbit-reduced Full mode
+                for P_i in perumations_all:
+                    for P_o in perumations_all:
+                        if (P_i, P_o) in known_pairs:
+                            continue
+                        Partition_circuit_tmp = Circuit(N)
+                        Partition_circuit_tmp.add_Permutation(list(P_i))
+                        Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                        Partition_circuit_tmp.add_Permutation(list(P_o))
+                        synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                        result.add_result((P_i, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
+                        known_pairs.add((P_i, P_o))
+                        for sigma in auts:
+                            if sigma == identity:
+                                continue
+                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o, synthesised_circuit, synthesised_parameters, N)
+                            if (new_P_i, new_P_o) not in known_pairs:
+                                result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
+                                known_pairs.add((new_P_i, new_P_o))
+            else:
+                # Sequential mode with automorphism derivation
+                P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
+                for P_i in perumations_all:
+                    Partition_circuit_tmp = Circuit(N)
+                    Partition_circuit_tmp.add_Permutation(list(P_i))
+                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                    Partition_circuit_tmp.add_Permutation(list(P_o_initial))
+                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                    result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
+                    known_pairs.add((P_i, P_o_initial))
+                    for sigma in auts:
+                        if sigma == identity:
+                            continue
+                        new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o_initial, synthesised_circuit, synthesised_parameters, N)
+                        if (new_P_i, new_P_o) not in known_pairs:
+                            result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
+                            known_pairs.add((new_P_i, new_P_o))
+
+                P_i_best, _ = result.get_best_result(topology_idx)[0]
+                for P_o in perumations_all:
+                    if (tuple(P_i_best), P_o) in known_pairs:
+                        continue
+                    Partition_circuit_tmp = Circuit(N)
+                    Partition_circuit_tmp.add_Permutation(list(P_i_best))
+                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
+                    Partition_circuit_tmp.add_Permutation(list(P_o))
+                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
+                    result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
+                    known_pairs.add((tuple(P_i_best), P_o))
+                    for sigma in auts:
+                        if sigma == identity:
+                            continue
+                        new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i_best, P_o, synthesised_circuit, synthesised_parameters, N)
+                        if (new_P_i, new_P_o) not in known_pairs:
+                            result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
+                            known_pairs.add((new_P_i, new_P_o))
+
+        return result
+
     @staticmethod
     def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None, max_retries: int = 5) -> Circuit:
         """
@@ -349,7 +450,8 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
-                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+                decompose_fn = self.DecomposePartition_Auto if self.config.get('use_automorphisms', True) else self.DecomposePartition_Sequential
+                optimized_results[partition_idx] = pool.apply_async( decompose_fn, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index cd6995a96..8cbbcb7c3 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -249,6 +249,62 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
     return {}
 
 
+def compute_automorphisms(mini_topology: List[Tuple[int, int]]) -> List[Tuple[int, ...]]:
+    """Compute all automorphisms of a locally-labeled mini_topology (nodes 0..N-1).
+
+    An automorphism is a permutation sigma of {0,...,N-1} that preserves the
+    undirected edge set.  For N<=4 (typical partition size) brute-forcing all
+    N! permutations is at most 24 checks.
+
+    Returns:
+        List of permutation tuples. Always includes the identity as the first
+        element.
+    """
+    nodes = set()
+    for u, v in mini_topology:
+        nodes.add(u)
+        nodes.add(v)
+    if not nodes:
+        return [()]
+    N = max(nodes) + 1
+    edge_set = set()
+    for u, v in mini_topology:
+        edge_set.add((min(u, v), max(u, v)))
+
+    automorphisms = []
+    for perm in permutations(range(N)):
+        mapped = set()
+        for u, v in mini_topology:
+            mapped.add((min(perm[u], perm[v]), max(perm[u], perm[v])))
+        if mapped == edge_set:
+            automorphisms.append(perm)
+    return automorphisms
+
+
+def derive_result_from_automorphism(sigma, P_i, P_o, circuit, parameters, N):
+    """Derive an equivalent decomposition result from a topology automorphism.
+
+    Given that C(theta) approximates P_o . U . P_i on topology T, the circuit
+    sigma(C)(theta) approximates (sigma . P_o) . U . (P_i . sigma^-1) on T
+    (since sigma preserves T).
+
+    Returns:
+        (new_P_i, new_P_o, new_circuit, parameters)
+        Parameters are returned as-is (identical values, different qubit labels).
+    """
+    sigma_inv = [0] * N
+    for i in range(N):
+        sigma_inv[sigma[i]] = i
+
+    new_P_i = tuple(P_i[sigma_inv[j]] for j in range(N))
+    new_P_o = tuple(sigma[P_o[j]] for j in range(N))
+
+    remap = {i: sigma[i] for i in range(N)}
+    new_circuit = circuit.Remap_Qbits(remap, N)
+
+    return new_P_i, new_P_o, new_circuit, parameters
+
+
 # ============================================================================
 # Distance & Cost Calculations
 # ============================================================================

From 4398cf3925efaa1f0310ccf5882009cc481982f7 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 10 Mar 2026 09:04:42 +0100
Subject: [PATCH 093/232] Remove decomp auto

---
 squander/synthesis/PartAM.py | 146 +++++++----------------------------
 1 file changed, 27 insertions(+), 119 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 4dce5a13d..3f4f96241 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -16,7 +16,6 @@
 )
 
 import numpy as np
-import math
 import time
 from typing import Dict, List, Optional, Tuple
 from dataclasses import dataclass
@@ -189,15 +188,24 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
     @staticmethod
     def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
         """
-        Call to decompose a partition sequentially (no automorphism optimization).
+        Call to decompose a partition sequentially.
+        When use_automorphisms is enabled in config, derives equivalent
+        decompositions from topology automorphisms to skip redundant work.
         """
         N = Partition_circuit.get_Qbit_Num()
         if N !=1:
             perumations_all = list(permutations(range(N)))
             result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
-            # Sequential permutation search
+            use_auts = config.get('use_automorphisms', True)
+
             for topology_idx in range(len(topologies)):
                 mini_topology = topologies[topology_idx]
+                if use_auts:
+                    auts = compute_automorphisms(mini_topology)
+                    identity = tuple(range(N))
+                    known_pairs = set()
+
+                # Stage 1: fix P_o, sweep all P_i
                 P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
                 for P_i in perumations_all:
                     Partition_circuit_tmp = Circuit(N)
@@ -206,49 +214,33 @@ def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_paramete
                     Partition_circuit_tmp.add_Permutation(list(P_o_initial))
                     synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
                     result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
+                    if use_auts:
+                        known_pairs.add((P_i, P_o_initial))
+                        for sigma in auts:
+                            if sigma == identity:
+                                continue
+                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o_initial, synthesised_circuit, synthesised_parameters, N)
+                            if (new_P_i, new_P_o) not in known_pairs:
+                                result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
+                                known_pairs.add((new_P_i, new_P_o))
 
+                # Stage 2: fix P_i_best, sweep all P_o
                 P_i_best, _ = result.get_best_result(topology_idx)[0]
                 for P_o in perumations_all:
+                    if use_auts and (tuple(P_i_best), P_o) in known_pairs:
+                        continue
                     Partition_circuit_tmp = Circuit(N)
                     Partition_circuit_tmp.add_Permutation(list(P_i_best))
                     Partition_circuit_tmp.add_Circuit(Partition_circuit)
                     Partition_circuit_tmp.add_Permutation(list(P_o))
                     synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
                     result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
-        else:
-            result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
-        return result
-
-    @staticmethod
-    def DecomposePartition_Full(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
-        """
-        Call to decompose a partition exhaustively over all (P_i, P_o) pairs,
-        exploiting topology automorphisms to skip equivalent decompositions.
-        """
-        N = Partition_circuit.get_Qbit_Num()
-        if N !=1:
-            permutations_all = list(permutations(range(N)))
-            result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
-            identity = tuple(range(N))
-            for topology_idx in range(len(topologies)):
-                mini_topology = topologies[topology_idx]
-                auts = compute_automorphisms(mini_topology)
-                known_pairs = set()
-                for P_i in permutations_all:
-                    for P_o in permutations_all:
-                        if (P_i, P_o) in known_pairs:
-                            continue
-                        Partition_circuit_tmp = Circuit(N)
-                        Partition_circuit_tmp.add_Permutation(list(P_i))
-                        Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                        Partition_circuit_tmp.add_Permutation(list(P_o))
-                        synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                        result.add_result((P_i, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
-                        known_pairs.add((P_i, P_o))
+                    if use_auts:
+                        known_pairs.add((tuple(P_i_best), P_o))
                         for sigma in auts:
                             if sigma == identity:
                                 continue
-                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o, synthesised_circuit, synthesised_parameters, N)
+                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i_best, P_o, synthesised_circuit, synthesised_parameters, N)
                             if (new_P_i, new_P_o) not in known_pairs:
                                 result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
                                 known_pairs.add((new_P_i, new_P_o))
@@ -256,89 +248,6 @@ def DecomposePartition_Full(Partition_circuit: Circuit, Partition_parameters: np
             result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
         return result
 
-    @staticmethod
-    def DecomposePartition_Auto(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
-        """
-        Auto-select between Sequential and orbit-reduced Full mode per topology.
-        Uses Full mode when |Aut(T)| > N!/2 (cheaper than Sequential), otherwise
-        uses Sequential with automorphism derivation.
-        """
-        N = Partition_circuit.get_Qbit_Num()
-        if N == 1:
-            return SingleQubitPartitionResult(Partition_circuit, Partition_parameters)
-
-        perumations_all = list(permutations(range(N)))
-        result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
-        identity = tuple(range(N))
-        factorial_N = math.factorial(N)
-
-        for topology_idx in range(len(topologies)):
-            mini_topology = topologies[topology_idx]
-            auts = compute_automorphisms(mini_topology)
-            aut_size = len(auts)
-            known_pairs = set()
-
-            # Choose mode: Full with orbits when |Aut(T)| > N!/2
-            use_full = (aut_size > factorial_N // 2)
-
-            if use_full:
-                # Orbit-reduced Full mode
-                for P_i in perumations_all:
-                    for P_o in perumations_all:
-                        if (P_i, P_o) in known_pairs:
-                            continue
-                        Partition_circuit_tmp = Circuit(N)
-                        Partition_circuit_tmp.add_Permutation(list(P_i))
-                        Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                        Partition_circuit_tmp.add_Permutation(list(P_o))
-                        synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                        result.add_result((P_i, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
-                        known_pairs.add((P_i, P_o))
-                        for sigma in auts:
-                            if sigma == identity:
-                                continue
-                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o, synthesised_circuit, synthesised_parameters, N)
-                            if (new_P_i, new_P_o) not in known_pairs:
-                                result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
-                                known_pairs.add((new_P_i, new_P_o))
-            else:
-                # Sequential mode with automorphism derivation
-                P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
-                for P_i in perumations_all:
-                    Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(list(P_i))
-                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(list(P_o_initial))
-                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                    result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
-                    known_pairs.add((P_i, P_o_initial))
-                    for sigma in auts:
-                        if sigma == identity:
-                            continue
-                        new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o_initial, synthesised_circuit, synthesised_parameters, N)
-                        if (new_P_i, new_P_o) not in known_pairs:
-                            result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
-                            known_pairs.add((new_P_i, new_P_o))
-
-                P_i_best, _ = result.get_best_result(topology_idx)[0]
-                for P_o in perumations_all:
-                    if (tuple(P_i_best), P_o) in known_pairs:
-                        continue
-                    Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(list(P_i_best))
-                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(list(P_o))
-                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                    result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
-                    known_pairs.add((tuple(P_i_best), P_o))
-                    for sigma in auts:
-                        if sigma == identity:
-                            continue
-                        new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i_best, P_o, synthesised_circuit, synthesised_parameters, N)
-                        if (new_P_i, new_P_o) not in known_pairs:
-                            result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
-                            known_pairs.add((new_P_i, new_P_o))
-
         return result
 
     @staticmethod
@@ -450,8 +359,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
-                decompose_fn = self.DecomposePartition_Auto if self.config.get('use_automorphisms', True) else self.DecomposePartition_Sequential
-                optimized_results[partition_idx] = pool.apply_async( decompose_fn, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
                 optimized_results[partition_idx] = optimized_results[partition_idx].get()

From 96bd79a5ec2ba064f3260d422cc430fcb98bcb7c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 11 Mar 2026 16:47:54 +0100
Subject: [PATCH 094/232] Add in new parallel config

---
 .../N_Qubit_Decomposition_Tree_Search.cpp      | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp b/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
index feae4838e..703325a46 100644
--- a/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
+++ b/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
@@ -639,7 +639,14 @@ TreeSearchResult N_Qubit_Decomposition_Tree_Search::tree_search_over_gate_struct
     int64_t concurrency = (int64_t)nthreads;
     concurrency = concurrency < iteration_max ? concurrency : iteration_max;
 
-    int parallel = get_parallel_configuration();
+    int parallel;
+    if (config.count("parallel_tree_search") > 0) {
+        long long value;
+        config["parallel_tree_search"].get_property(value);
+        parallel = (int)value;
+    } else {
+        parallel = get_parallel_configuration();
+    }
 
     int64_t work_batch = 1;
     if (parallel == 0) {
@@ -893,7 +900,14 @@ GrayCode N_Qubit_Decomposition_Tree_Search::tree_search_over_gate_structures(int
     int64_t concurrency = (int64_t)nthreads;
     concurrency = concurrency < iteration_max ? concurrency : iteration_max;
 
-    int parallel = get_parallel_configuration();
+    int parallel;
+    if (config.count("parallel_tree_search") > 0) {
+        long long value;
+        config["parallel_tree_search"].get_property(value);
+        parallel = (int)value;
+    } else {
+        parallel = get_parallel_configuration();
+    }
 
     int64_t work_batch = 1;
     if (parallel == 0) {

From 6fb009f3924b97db4c4d598b66af796eba49c676 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 11 Mar 2026 16:47:54 +0100
Subject: [PATCH 095/232] Revert "Add in new parallel config"

This reverts commit 96bd79a5ec2ba064f3260d422cc430fcb98bcb7c.

From eb6f72df716fd6ad824954ea07c8eed5e2eabd97 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 17 Mar 2026 15:32:47 +0100
Subject: [PATCH 096/232] async Parallel for wide circuits

---
 .../qgd_Wide_Circuit_Optimization.py          | 27 +++++++++++++------
 squander/synthesis/PartAM.py                  | 18 +++++++++++--
 2 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 753383cec..a13de9b6f 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -18,6 +18,7 @@
 
 import multiprocessing as mp
 from multiprocessing import Process, Pool, parent_process
+from multiprocessing.pool import AsyncResult
 import os
 
 
@@ -1028,33 +1029,43 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                 optimized_subcircuits[ partition_idx ] = new_subcircuit
                 optimized_parameter_list[ partition_idx ] = new_parameters
         else:
-            # list of AsyncResult objects
+            # list of AsyncResult objects (for 2-qubit) or direct results (for 1-qubit and 3+ qubit)
             async_results = [None] * len(subcircuits)
             with Pool(processes=mp.cpu_count()) as pool:
 
                 #  code for iterate over partitions and optimize them
                 for partition_idx, subcircuit in enumerate( subcircuits ):
-            
+
 
                     # isolate the parameters corresponding to the given sub-circuit
                     start_idx = subcircuit.get_Parameter_Start_Index()
                     end_idx   = start_idx + subcircuit.get_Parameter_Num()
                     subcircuit_parameters = parameters[ start_idx:end_idx ]
-    
-        
-                
+
+
+
                     # call a process to decompose a subcircuit
                     config = {**self.config, 'tree_level_max': max(0, subcircuit.get_Gate_Nums().get('CNOT', 0)-1)}
                     config = config if structures is None or partition_idx >= len(structures) else {**config, 'strategy': 'Custom', 'max_inner_iterations': 10000, 'max_iteration_loops': 4}
-                    async_results[partition_idx]  = pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
-                                                                                                        None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
+
+                    qbit_num_sub = len(subcircuit.get_Qbits())
+                    if qbit_num_sub == 2:
+                        async_results[partition_idx]  = pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
+                                                                                                            None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
+                    else:
+                        # 1-qubit and 3+ qubit: run sequentially; use internal C++ parallelism for large partitions
+                        seq_config = {**config, 'parallel': 1} if qbit_num_sub >= 3 else config
+                        async_results[partition_idx] = self.PartitionDecompositionProcess(subcircuit, subcircuit_parameters, seq_config,
+                                                                                         None if structures is None or partition_idx >= len(structures) else structures[partition_idx])
+
                 #  code for iterate over async results and retrieve the new subcircuits
                 for partition_idx, subcircuit in enumerate( subcircuits ):
                     # callback function done on the master process to compare the new decomposed and the original suncircuit
                     start_idx = subcircuit.get_Parameter_Start_Index()
                     subcircuit_parameters = parameters[ start_idx:start_idx + subcircuit.get_Parameter_Num() ]
                     callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, *(z[0] for z in x)], [subcircuit_parameters, *(z[1] for z in x)] )
-                    new_subcircuit, new_parameters = callback_fnc(async_results[partition_idx].get( timeout = None ))
+                    result = async_results[partition_idx]
+                    new_subcircuit, new_parameters = callback_fnc(result.get( timeout = None ) if isinstance(result, AsyncResult) else result)
 
                     if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
                         print( "original subcircuit:    ", subcircuit.get_Gate_Nums())
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3f4f96241..2181a3700 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -22,6 +22,7 @@
 
 import multiprocessing as mp
 from multiprocessing import Pool
+from multiprocessing.pool import AsyncResult
 import os
 import logging
 from tqdm import tqdm
@@ -345,6 +346,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         subcircuits = partitioned_circuit.get_Gates()
         optimized_results = [None] * len(subcircuits)
 
+        # Config with parallel=1 for large partitions (use internal C++ parallelism)
+        large_partition_config = dict(self.config)
+        large_partition_config['parallel'] = 1
+
         with Pool(processes=mp.cpu_count()) as pool:
             for partition_idx, subcircuit in enumerate( subcircuits ):
 
@@ -359,10 +364,19 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 for idx in range( len(involved_qbits) ):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
-                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+
+                if qbit_num_sub == 2:
+                    optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+                elif qbit_num_sub >= 3:
+                    optimized_results[partition_idx] = self.DecomposePartition_Sequential(remapped_subcircuit, subcircuit_parameters, large_partition_config, mini_topologies, involved_qbits, qbit_map)
+                else:
+                    optimized_results[partition_idx] = self.DecomposePartition_Sequential(remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map)
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
-                optimized_results[partition_idx] = optimized_results[partition_idx].get()
+                result = optimized_results[partition_idx]
+                if isinstance(result, AsyncResult):
+                    optimized_results[partition_idx] = result.get()
+                # else: already a resolved result (sequential or single-qubit)
 
         # ---- Phase 3: ILP partition selection with synthesis-cost weights ----
         weights = []

From ed1c90ddfd1dc89459593e2569a33d159cca2e1f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 18 Mar 2026 00:26:16 +0100
Subject: [PATCH 097/232] c++ cpu pooling

---
 .../decomposition/qgd_Wide_Circuit_Optimization.py  | 13 +++++++++----
 squander/synthesis/PartAM.py                        | 10 ++++++++--
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index a13de9b6f..3fbd1c059 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1031,7 +1031,10 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
         else:
             # list of AsyncResult objects (for 2-qubit) or direct results (for 1-qubit and 3+ qubit)
             async_results = [None] * len(subcircuits)
-            with Pool(processes=mp.cpu_count()) as pool:
+            n_cpus = mp.cpu_count()
+            large_pool_size = max(1, n_cpus // 4)
+            with Pool(processes=n_cpus) as pool, \
+                 Pool(processes=large_pool_size) as large_pool:
 
                 #  code for iterate over partitions and optimize them
                 for partition_idx, subcircuit in enumerate( subcircuits ):
@@ -1052,10 +1055,12 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                     if qbit_num_sub == 2:
                         async_results[partition_idx]  = pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
                                                                                                             None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
+                    elif qbit_num_sub >= 3:
+                        large_config = {**config, 'parallel': 1}
+                        async_results[partition_idx]  = large_pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, large_config,
+                                                                                                            None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
                     else:
-                        # 1-qubit and 3+ qubit: run sequentially; use internal C++ parallelism for large partitions
-                        seq_config = {**config, 'parallel': 1} if qbit_num_sub >= 3 else config
-                        async_results[partition_idx] = self.PartitionDecompositionProcess(subcircuit, subcircuit_parameters, seq_config,
+                        async_results[partition_idx] = self.PartitionDecompositionProcess(subcircuit, subcircuit_parameters, config,
                                                                                          None if structures is None or partition_idx >= len(structures) else structures[partition_idx])
 
                 #  code for iterate over async results and retrieve the new subcircuits
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2181a3700..4be620e67 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -350,7 +350,13 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         large_partition_config = dict(self.config)
         large_partition_config['parallel'] = 1
 
-        with Pool(processes=mp.cpu_count()) as pool:
+        # Use fewer workers for 3+ qubit partitions to avoid oversubscription
+        # from C++ internal threads (parallel=1) competing with pool workers
+        n_cpus = mp.cpu_count()
+        large_pool_size = max(1, n_cpus // 4)
+
+        with Pool(processes=n_cpus) as pool, \
+             Pool(processes=large_pool_size) as large_pool:
             for partition_idx, subcircuit in enumerate( subcircuits ):
 
                 start_idx = subcircuit.get_Parameter_Start_Index()
@@ -368,7 +374,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 if qbit_num_sub == 2:
                     optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
                 elif qbit_num_sub >= 3:
-                    optimized_results[partition_idx] = self.DecomposePartition_Sequential(remapped_subcircuit, subcircuit_parameters, large_partition_config, mini_topologies, involved_qbits, qbit_map)
+                    optimized_results[partition_idx] = large_pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, large_partition_config, mini_topologies, involved_qbits, qbit_map) )
                 else:
                     optimized_results[partition_idx] = self.DecomposePartition_Sequential(remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map)
 

From 474ec5fcc1d66bb571db620eba46e00077fcd194 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 18 Mar 2026 09:36:58 +0100
Subject: [PATCH 098/232] Remove parallel experiments

---
 .../qgd_Wide_Circuit_Optimization.py          | 15 +++------------
 .../N_Qubit_Decomposition_Tree_Search.cpp     | 19 +++----------------
 squander/synthesis/PartAM.py                  | 18 ++----------------
 3 files changed, 8 insertions(+), 44 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 3fbd1c059..858427ccf 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1052,16 +1052,8 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                     config = config if structures is None or partition_idx >= len(structures) else {**config, 'strategy': 'Custom', 'max_inner_iterations': 10000, 'max_iteration_loops': 4}
 
                     qbit_num_sub = len(subcircuit.get_Qbits())
-                    if qbit_num_sub == 2:
-                        async_results[partition_idx]  = pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
-                                                                                                            None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
-                    elif qbit_num_sub >= 3:
-                        large_config = {**config, 'parallel': 1}
-                        async_results[partition_idx]  = large_pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, large_config,
-                                                                                                            None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
-                    else:
-                        async_results[partition_idx] = self.PartitionDecompositionProcess(subcircuit, subcircuit_parameters, config,
-                                                                                         None if structures is None or partition_idx >= len(structures) else structures[partition_idx])
+                    async_results[partition_idx]  = pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
+                                                                                                        None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
 
                 #  code for iterate over async results and retrieve the new subcircuits
                 for partition_idx, subcircuit in enumerate( subcircuits ):
@@ -1069,8 +1061,7 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
                     start_idx = subcircuit.get_Parameter_Start_Index()
                     subcircuit_parameters = parameters[ start_idx:start_idx + subcircuit.get_Parameter_Num() ]
                     callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, *(z[0] for z in x)], [subcircuit_parameters, *(z[1] for z in x)] )
-                    result = async_results[partition_idx]
-                    new_subcircuit, new_parameters = callback_fnc(result.get( timeout = None ) if isinstance(result, AsyncResult) else result)
+                    new_subcircuit, new_parameters = callback_fnc(async_results[partition_idx].get( timeout = None ))
 
                     if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
                         print( "original subcircuit:    ", subcircuit.get_Gate_Nums())
diff --git a/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp b/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
index 703325a46..8be801ddb 100644
--- a/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
+++ b/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
@@ -639,14 +639,7 @@ TreeSearchResult N_Qubit_Decomposition_Tree_Search::tree_search_over_gate_struct
     int64_t concurrency = (int64_t)nthreads;
     concurrency = concurrency < iteration_max ? concurrency : iteration_max;
 
-    int parallel;
-    if (config.count("parallel_tree_search") > 0) {
-        long long value;
-        config["parallel_tree_search"].get_property(value);
-        parallel = (int)value;
-    } else {
-        parallel = get_parallel_configuration();
-    }
+    int parallel = get_parallel_configuration();
 
     int64_t work_batch = 1;
     if (parallel == 0) {
@@ -900,14 +893,8 @@ GrayCode N_Qubit_Decomposition_Tree_Search::tree_search_over_gate_structures(int
     int64_t concurrency = (int64_t)nthreads;
     concurrency = concurrency < iteration_max ? concurrency : iteration_max;
 
-    int parallel;
-    if (config.count("parallel_tree_search") > 0) {
-        long long value;
-        config["parallel_tree_search"].get_property(value);
-        parallel = (int)value;
-    } else {
-        parallel = get_parallel_configuration();
-    }
+    int parallel = get_parallel_configuration();
+
 
     int64_t work_batch = 1;
     if (parallel == 0) {
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 4be620e67..766db88c1 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -22,7 +22,6 @@
 
 import multiprocessing as mp
 from multiprocessing import Pool
-from multiprocessing.pool import AsyncResult
 import os
 import logging
 from tqdm import tqdm
@@ -346,10 +345,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         subcircuits = partitioned_circuit.get_Gates()
         optimized_results = [None] * len(subcircuits)
 
-        # Config with parallel=1 for large partitions (use internal C++ parallelism)
-        large_partition_config = dict(self.config)
-        large_partition_config['parallel'] = 1
-
         # Use fewer workers for 3+ qubit partitions to avoid oversubscription
         # from C++ internal threads (parallel=1) competing with pool workers
         n_cpus = mp.cpu_count()
@@ -371,19 +366,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                     qbit_map[ involved_qbits[idx] ] = idx
                 remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
 
-                if qbit_num_sub == 2:
-                    optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
-                elif qbit_num_sub >= 3:
-                    optimized_results[partition_idx] = large_pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, large_partition_config, mini_topologies, involved_qbits, qbit_map) )
-                else:
-                    optimized_results[partition_idx] = self.DecomposePartition_Sequential(remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map)
+                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
 
             for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
-                result = optimized_results[partition_idx]
-                if isinstance(result, AsyncResult):
-                    optimized_results[partition_idx] = result.get()
-                # else: already a resolved result (sequential or single-qubit)
-
+                optimized_results[partition_idx] = optimized_results[partition_idx].get()
         # ---- Phase 3: ILP partition selection with synthesis-cost weights ----
         weights = []
         for idx, result in enumerate(optimized_results[:len(allparts)]):

From 2baa115ad9acfa88642f4e75df8decb0bfc3bd0f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 18 Mar 2026 12:13:58 +0100
Subject: [PATCH 099/232] remove stupid pool redundancy

---
 squander/decomposition/qgd_Wide_Circuit_Optimization.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 858427ccf..0018d3590 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1032,9 +1032,7 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
             # list of AsyncResult objects (for 2-qubit) or direct results (for 1-qubit and 3+ qubit)
             async_results = [None] * len(subcircuits)
             n_cpus = mp.cpu_count()
-            large_pool_size = max(1, n_cpus // 4)
-            with Pool(processes=n_cpus) as pool, \
-                 Pool(processes=large_pool_size) as large_pool:
+            with Pool(processes=n_cpus) as pool:
 
                 #  code for iterate over partitions and optimize them
                 for partition_idx, subcircuit in enumerate( subcircuits ):

From 7b74b1b37f4041d34c3e1ad6635b070fcde17891 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 20 Mar 2026 00:19:41 +0100
Subject: [PATCH 100/232] Add better granularity to PartAM

---
 squander/synthesis/PartAM.py | 309 +++++++++++++++++++++++++----------
 1 file changed, 223 insertions(+), 86 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 766db88c1..fe2907dfa 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -27,6 +27,21 @@
 from tqdm import tqdm
 from collections import deque, defaultdict
 
+# Module-level globals for pool workers (set via Pool initializer)
+_worker_config = None
+
+def _init_decompose_worker(config):
+    global _worker_config
+    _worker_config = config
+
+def _decompose_one(Umtx, mini_topology):
+    """Pool worker function. Uses config set once by initializer instead of
+    pickling it per task."""
+    from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping
+    return qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(
+        Umtx, _worker_config, mini_topology
+    )
+
 from squander.synthesis.PartAM_utils import (
     get_subtopologies_of_type,
     get_unique_subtopologies,
@@ -185,71 +200,6 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
     # Partition Decomposition Methods
     # ------------------------------------------------------------------------
 
-    @staticmethod
-    def DecomposePartition_Sequential(Partition_circuit: Circuit, Partition_parameters: np.ndarray, config: dict, topologies, involved_qbits, qbit_map) -> PartitionSynthesisResult:
-        """
-        Call to decompose a partition sequentially.
-        When use_automorphisms is enabled in config, derives equivalent
-        decompositions from topology automorphisms to skip redundant work.
-        """
-        N = Partition_circuit.get_Qbit_Num()
-        if N !=1:
-            perumations_all = list(permutations(range(N)))
-            result = PartitionSynthesisResult(N, topologies, involved_qbits, qbit_map, Partition_circuit)
-            use_auts = config.get('use_automorphisms', True)
-
-            for topology_idx in range(len(topologies)):
-                mini_topology = topologies[topology_idx]
-                if use_auts:
-                    auts = compute_automorphisms(mini_topology)
-                    identity = tuple(range(N))
-                    known_pairs = set()
-
-                # Stage 1: fix P_o, sweep all P_i
-                P_o_initial = perumations_all[np.random.choice(range(len(perumations_all)))]
-                for P_i in perumations_all:
-                    Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(list(P_i))
-                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(list(P_o_initial))
-                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                    result.add_result((P_i, P_o_initial), synthesised_circuit, synthesised_parameters, topology_idx)
-                    if use_auts:
-                        known_pairs.add((P_i, P_o_initial))
-                        for sigma in auts:
-                            if sigma == identity:
-                                continue
-                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i, P_o_initial, synthesised_circuit, synthesised_parameters, N)
-                            if (new_P_i, new_P_o) not in known_pairs:
-                                result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
-                                known_pairs.add((new_P_i, new_P_o))
-
-                # Stage 2: fix P_i_best, sweep all P_o
-                P_i_best, _ = result.get_best_result(topology_idx)[0]
-                for P_o in perumations_all:
-                    if use_auts and (tuple(P_i_best), P_o) in known_pairs:
-                        continue
-                    Partition_circuit_tmp = Circuit(N)
-                    Partition_circuit_tmp.add_Permutation(list(P_i_best))
-                    Partition_circuit_tmp.add_Circuit(Partition_circuit)
-                    Partition_circuit_tmp.add_Permutation(list(P_o))
-                    synthesised_circuit, synthesised_parameters = qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm(Partition_circuit_tmp.get_Matrix(Partition_parameters), config, mini_topology)
-                    result.add_result((P_i_best, P_o), synthesised_circuit, synthesised_parameters, topology_idx)
-                    if use_auts:
-                        known_pairs.add((tuple(P_i_best), P_o))
-                        for sigma in auts:
-                            if sigma == identity:
-                                continue
-                            new_P_i, new_P_o, new_circuit, new_params = derive_result_from_automorphism(sigma, P_i_best, P_o, synthesised_circuit, synthesised_parameters, N)
-                            if (new_P_i, new_P_o) not in known_pairs:
-                                result.add_result((new_P_i, new_P_o), new_circuit, new_params, topology_idx)
-                                known_pairs.add((new_P_i, new_P_o))
-        else:
-            result = SingleQubitPartitionResult(Partition_circuit,Partition_parameters)
-        return result
-
-        return result
-
     @staticmethod
     def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None, max_retries: int = 5) -> Circuit:
         """
@@ -341,35 +291,222 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             partitioned_circuit.add_Circuit(c)
         parameters = np.concatenate(params, axis=0)
 
-        # ---- Phase 2: Stage 1 synthesis (Sequential) ----
+        # ---- Phase 2: Fine-grained parallel synthesis ----
+        # Instead of 1 coarse task per partition (each doing ~12 sequential
+        # C++ decompositions), submit individual (partition, topology, permutation)
+        # tasks for much better pool load balancing.
         subcircuits = partitioned_circuit.get_Gates()
         optimized_results = [None] * len(subcircuits)
-
-        # Use fewer workers for 3+ qubit partitions to avoid oversubscription
-        # from C++ internal threads (parallel=1) competing with pool workers
         n_cpus = mp.cpu_count()
-        large_pool_size = max(1, n_cpus // 4)
+        use_auts = self.config.get('use_automorphisms', True)
+        disable_pbar = self.config.get('progressbar', 0) == False
+
+        # Pre-compute partition metadata and base unitaries
+        partition_meta = []
+        for partition_idx, subcircuit in enumerate(subcircuits):
+            start_idx = subcircuit.get_Parameter_Start_Index()
+            end_idx = start_idx + subcircuit.get_Parameter_Num()
+            subcircuit_parameters = parameters[start_idx:end_idx]
+            involved_qbits = subcircuit.get_Qbits()
+            qbit_num_sub = len(involved_qbits)
+            mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
+            qbit_map = {involved_qbits[idx]: idx for idx in range(len(involved_qbits))}
+            remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num_sub)
+
+            if qbit_num_sub == 1:
+                optimized_results[partition_idx] = SingleQubitPartitionResult(
+                    remapped_subcircuit, subcircuit_parameters
+                )
+                partition_meta.append(None)
+            else:
+                partition_meta.append({
+                    'N': qbit_num_sub,
+                    'circuit': remapped_subcircuit,
+                    'params': subcircuit_parameters,
+                    'mini_topologies': mini_topologies,
+                    'involved_qbits': involved_qbits,
+                    'qbit_map': qbit_map,
+                })
+
+        # Pre-compute automorphisms per unique mini_topology shape
+        aut_cache = {}
+        def _get_auts(mini_topo):
+            key = tuple(sorted(tuple(sorted(e)) for e in mini_topo))
+            if key not in aut_cache:
+                aut_cache[key] = compute_automorphisms(mini_topo)
+            return aut_cache[key]
+
+        def _build_permuted_unitary(meta, P_i, P_o):
+            N = meta['N']
+            circ_tmp = Circuit(N)
+            circ_tmp.add_Permutation(list(P_i))
+            circ_tmp.add_Circuit(meta['circuit'])
+            circ_tmp.add_Permutation(list(P_o))
+            return circ_tmp.get_Matrix(meta['params'])
+
+        def _topo_key(mini_topology):
+            return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
+
+        # Cache: (rounded_unitary_bytes, topo_key) -> (circuit, parameters)
+        # Avoids redundant C++ decompositions when different partitions or
+        # permutations produce the same unitary matrix.
+        decomp_cache = {}
+
+        def _cache_key(Umtx, mini_topology):
+            return (np.round(Umtx, decimals=10).tobytes(), _topo_key(mini_topology))
+
+        def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
+                                  topology_idx, N, mini_topology, known_pairs, pair_key):
+            """Add a synthesis result and derive automorphism equivalents."""
+            result.add_result(perm_pair, synth_circuit, synth_params, topology_idx)
+            if use_auts:
+                if pair_key not in known_pairs:
+                    known_pairs[pair_key] = set()
+                known_pairs[pair_key].add(perm_pair)
+                P_i, P_o = perm_pair
+                auts = _get_auts(mini_topology)
+                identity = tuple(range(N))
+                for sigma in auts:
+                    if sigma == identity:
+                        continue
+                    new_P_i, new_P_o, new_circ, new_params = derive_result_from_automorphism(
+                        sigma, P_i, P_o, synth_circuit, synth_params, N
+                    )
+                    if (new_P_i, new_P_o) not in known_pairs[pair_key]:
+                        result.add_result((new_P_i, new_P_o), new_circ, new_params, topology_idx)
+                        known_pairs[pair_key].add((new_P_i, new_P_o))
+
+        with Pool(processes=n_cpus, initializer=_init_decompose_worker,
+                  initargs=(self.config,)) as pool:
+            # Initialize PartitionSynthesisResult for each multi-qubit partition
+            results_map = {}
+            for partition_idx, meta in enumerate(partition_meta):
+                if meta is None:
+                    continue
+                results_map[partition_idx] = PartitionSynthesisResult(
+                    meta['N'], meta['mini_topologies'], meta['involved_qbits'],
+                    meta['qbit_map'], meta['circuit']
+                )
+
+            # ---- Stage 1: fix random P_o, sweep all P_i ----
+            stage1_futures = []
+            stage1_cached = []
+            stage1_P_o = {}
+            known_pairs = {}
+
+            for partition_idx, meta in enumerate(partition_meta):
+                if meta is None:
+                    continue
+                N = meta['N']
+                perms_all = list(permutations(range(N)))
+                for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
+                    P_o_initial = perms_all[np.random.choice(len(perms_all))]
+                    stage1_P_o[(partition_idx, topology_idx)] = P_o_initial
+                    for P_i in perms_all:
+                        Umtx = _build_permuted_unitary(meta, P_i, P_o_initial)
+                        ck = _cache_key(Umtx, mini_topology)
+                        if ck in decomp_cache:
+                            stage1_cached.append((partition_idx, topology_idx, P_i, ck))
+                        else:
+                            future = pool.apply_async(
+                                _decompose_one, (Umtx, mini_topology)
+                            )
+                            stage1_futures.append((partition_idx, topology_idx, P_i, ck, future))
+
+            # Process Stage 1 cache hits immediately
+            for partition_idx, topology_idx, P_i, ck in stage1_cached:
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                P_o_initial = stage1_P_o[(partition_idx, topology_idx)]
+                mini_topology = meta['mini_topologies'][topology_idx]
+                synth_circuit, synth_params = decomp_cache[ck]
+                pair_key = (partition_idx, topology_idx)
+                _add_result_with_auts(
+                    results_map[partition_idx], (P_i, P_o_initial),
+                    synth_circuit, synth_params, topology_idx,
+                    N, mini_topology, known_pairs, pair_key
+                )
 
-        with Pool(processes=n_cpus) as pool, \
-             Pool(processes=large_pool_size) as large_pool:
-            for partition_idx, subcircuit in enumerate( subcircuits ):
+            # Collect Stage 1 pool results
+            cache_hits_s1 = len(stage1_cached)
+            for partition_idx, topology_idx, P_i, ck, future in tqdm(
+                stage1_futures, desc=f"Stage 1 Synthesis ({cache_hits_s1} cached)",
+                disable=disable_pbar
+            ):
+                synth_circuit, synth_params = future.get()
+                decomp_cache[ck] = (synth_circuit, synth_params)
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                P_o_initial = stage1_P_o[(partition_idx, topology_idx)]
+                mini_topology = meta['mini_topologies'][topology_idx]
+                pair_key = (partition_idx, topology_idx)
+                _add_result_with_auts(
+                    results_map[partition_idx], (P_i, P_o_initial),
+                    synth_circuit, synth_params, topology_idx,
+                    N, mini_topology, known_pairs, pair_key
+                )
 
-                start_idx = subcircuit.get_Parameter_Start_Index()
-                end_idx   = subcircuit.get_Parameter_Start_Index() + subcircuit.get_Parameter_Num()
-                subcircuit_parameters = parameters[ start_idx:end_idx ]
-                involved_qbits = subcircuit.get_Qbits()
+            # ---- Stage 2: fix best P_i from Stage 1, sweep all P_o ----
+            stage2_futures = []
+            stage2_cached = []
 
-                qbit_num_sub = len( involved_qbits )
-                mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
-                qbit_map = {}
-                for idx in range( len(involved_qbits) ):
-                    qbit_map[ involved_qbits[idx] ] = idx
-                remapped_subcircuit = subcircuit.Remap_Qbits( qbit_map, qbit_num_sub )
+            for partition_idx, meta in enumerate(partition_meta):
+                if meta is None:
+                    continue
+                N = meta['N']
+                perms_all = list(permutations(range(N)))
+                result = results_map[partition_idx]
+                for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
+                    P_i_best, _ = result.get_best_result(topology_idx)[0]
+                    pair_key = (partition_idx, topology_idx)
+                    kp = known_pairs.get(pair_key, set()) if use_auts else set()
+                    for P_o in perms_all:
+                        if use_auts and (tuple(P_i_best), P_o) in kp:
+                            continue
+                        Umtx = _build_permuted_unitary(meta, P_i_best, P_o)
+                        ck = _cache_key(Umtx, mini_topology)
+                        if ck in decomp_cache:
+                            stage2_cached.append((partition_idx, topology_idx, P_i_best, P_o, ck))
+                        else:
+                            future = pool.apply_async(
+                                _decompose_one, (Umtx, mini_topology)
+                            )
+                            stage2_futures.append((partition_idx, topology_idx, P_i_best, P_o, ck, future))
+
+            # Process Stage 2 cache hits
+            for partition_idx, topology_idx, P_i_best, P_o, ck in stage2_cached:
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                mini_topology = meta['mini_topologies'][topology_idx]
+                synth_circuit, synth_params = decomp_cache[ck]
+                pair_key = (partition_idx, topology_idx)
+                _add_result_with_auts(
+                    results_map[partition_idx], (tuple(P_i_best), P_o),
+                    synth_circuit, synth_params, topology_idx,
+                    N, mini_topology, known_pairs, pair_key
+                )
 
-                optimized_results[partition_idx] = pool.apply_async( self.DecomposePartition_Sequential, (remapped_subcircuit, subcircuit_parameters, self.config, mini_topologies, involved_qbits, qbit_map) )
+            # Collect Stage 2 pool results
+            cache_hits_s2 = len(stage2_cached)
+            for partition_idx, topology_idx, P_i_best, P_o, ck, future in tqdm(
+                stage2_futures, desc=f"Stage 2 Synthesis ({cache_hits_s2} cached)",
+                disable=disable_pbar
+            ):
+                synth_circuit, synth_params = future.get()
+                decomp_cache[ck] = (synth_circuit, synth_params)
+                meta = partition_meta[partition_idx]
+                N = meta['N']
+                mini_topology = meta['mini_topologies'][topology_idx]
+                pair_key = (partition_idx, topology_idx)
+                _add_result_with_auts(
+                    results_map[partition_idx], (tuple(P_i_best), P_o),
+                    synth_circuit, synth_params, topology_idx,
+                    N, mini_topology, known_pairs, pair_key
+                )
 
-            for partition_idx, subcircuit in enumerate( tqdm(subcircuits, desc="First Synthesis",disable=self.config.get('progressbar', 0) == False) ):
-                optimized_results[partition_idx] = optimized_results[partition_idx].get()
+            # Store assembled results
+            for partition_idx, result in results_map.items():
+                optimized_results[partition_idx] = result
         # ---- Phase 3: ILP partition selection with synthesis-cost weights ----
         weights = []
         for idx, result in enumerate(optimized_results[:len(allparts)]):

From adf164e6d6237f7e7b0f276af2dfcccb5923795c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 20 Mar 2026 09:19:19 +0100
Subject: [PATCH 101/232] Organize and clean up PartAM

---
 squander/synthesis/PartAM.py       | 297 +++++++++++++++--------------
 squander/synthesis/PartAM_utils.py |  27 ++-
 2 files changed, 169 insertions(+), 155 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index fe2907dfa..f7c8a5bf5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1,13 +1,24 @@
 """
 This is an implementation of Partition Aware Mapping.
 """
+import logging
+import multiprocessing as mp
+import os
+import time
+from collections import deque, defaultdict
+from itertools import permutations
+from multiprocessing import Pool
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+from tqdm import tqdm
+
 from squander.decomposition.qgd_N_Qubit_Decompositions_Wrapper import (
     qgd_N_Qubit_Decomposition_adaptive as N_Qubit_Decomposition_adaptive,
     qgd_N_Qubit_Decomposition_Tree_Search as N_Qubit_Decomposition_Tree_Search,
     qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
 )
 from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
-from itertools import permutations
 from squander.partitioning.ilp import (
     get_all_partitions,
     _get_topo_order,
@@ -15,18 +26,6 @@
     ilp_global_optimal,
 )
 
-import numpy as np
-import time
-from typing import Dict, List, Optional, Tuple
-from dataclasses import dataclass
-
-import multiprocessing as mp
-from multiprocessing import Pool
-import os
-import logging
-from tqdm import tqdm
-from collections import deque, defaultdict
-
 # Module-level globals for pool workers (set via Pool initializer)
 _worker_config = None
 
@@ -52,27 +51,12 @@ def _decompose_one(Umtx, mini_topology):
     SingleQubitPartitionResult,
     PartitionSynthesisResult,
     PartitionCandidate,
+    PartitionScoreData,
     check_circuit_compatibility,
     construct_swap_circuit,
 )
 
 
-# ============================================================================
-# Data Classes
-# ============================================================================
-
-@dataclass(frozen=True)
-class PartitionScoreData:
-    mini_topologies: Tuple[Tuple[Tuple[int, int], ...], ...]
-    topology_candidates: Tuple[Tuple[Tuple[int, int], ...], ...]
-    permutations_pairs: Tuple[
-        Tuple[Tuple[Tuple[int, ...], Tuple[int, ...]], ...], ...
-    ]
-    circuit_structures: Tuple[Tuple[Tuple[int, ...], ...], ...]
-    qubit_map: Dict[int, int]
-    involved_qbits: Tuple[int, ...]
-
-
 # ============================================================================
 # Main Class: qgd_Partition_Aware_Mapping
 # ============================================================================
@@ -118,10 +102,6 @@ def __init__(self, config):
         self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
         self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
 
-    # ------------------------------------------------------------------------
-    # Scoring Methods
-    # ------------------------------------------------------------------------
-
     # ------------------------------------------------------------------------
     # Caching Methods
     # ------------------------------------------------------------------------
@@ -148,6 +128,58 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
         
         return self._topology_cache[canonical_key]
 
+    # ------------------------------------------------------------------------
+    # Static Synthesis Helpers (extracted from SynthesizeWideCircuit)
+    # ------------------------------------------------------------------------
+
+    @staticmethod
+    def _topo_key(mini_topology):
+        return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
+
+    @staticmethod
+    def _cache_key(Umtx, mini_topology):
+        topo_key = tuple(sorted(tuple(sorted(e)) for e in mini_topology))
+        return (np.round(Umtx, decimals=10).tobytes(), topo_key)
+
+    @staticmethod
+    def _get_auts(mini_topo, aut_cache):
+        key = tuple(sorted(tuple(sorted(e)) for e in mini_topo))
+        if key not in aut_cache:
+            aut_cache[key] = compute_automorphisms(mini_topo)
+        return aut_cache[key]
+
+    @staticmethod
+    def _build_permuted_unitary(meta, P_i, P_o):
+        N = meta['N']
+        circ_tmp = Circuit(N)
+        circ_tmp.add_Permutation(list(P_i))
+        circ_tmp.add_Circuit(meta['circuit'])
+        circ_tmp.add_Permutation(list(P_o))
+        return circ_tmp.get_Matrix(meta['params'])
+
+    @staticmethod
+    def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
+                              topology_idx, N, mini_topology, known_pairs, pair_key,
+                              use_auts, aut_cache):
+        """Add a synthesis result and derive automorphism equivalents."""
+        result.add_result(perm_pair, synth_circuit, synth_params, topology_idx)
+        if use_auts:
+            if pair_key not in known_pairs:
+                known_pairs[pair_key] = set()
+            known_pairs[pair_key].add(perm_pair)
+            P_i, P_o = perm_pair
+            auts = qgd_Partition_Aware_Mapping._get_auts(mini_topology, aut_cache)
+            identity = tuple(range(N))
+            for sigma in auts:
+                if sigma == identity:
+                    continue
+                new_P_i, new_P_o, new_circ, new_params = derive_result_from_automorphism(
+                    sigma, P_i, P_o, synth_circuit, synth_params, N
+                )
+                if (new_P_i, new_P_o) not in known_pairs[pair_key]:
+                    result.add_result((new_P_i, new_P_o), new_circ, new_params, topology_idx)
+                    known_pairs[pair_key].add((new_P_i, new_P_o))
+
     def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[PartitionScoreData]]:
         """
         Create lightweight, picklable views of partitions that contain only the
@@ -297,9 +329,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         # tasks for much better pool load balancing.
         subcircuits = partitioned_circuit.get_Gates()
         optimized_results = [None] * len(subcircuits)
-        n_cpus = mp.cpu_count()
-        use_auts = self.config.get('use_automorphisms', True)
-        disable_pbar = self.config.get('progressbar', 0) == False
 
         # Pre-compute partition metadata and base unitaries
         partition_meta = []
@@ -328,54 +357,84 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                     'qbit_map': qbit_map,
                 })
 
-        # Pre-compute automorphisms per unique mini_topology shape
-        aut_cache = {}
-        def _get_auts(mini_topo):
-            key = tuple(sorted(tuple(sorted(e)) for e in mini_topo))
-            if key not in aut_cache:
-                aut_cache[key] = compute_automorphisms(mini_topo)
-            return aut_cache[key]
-
-        def _build_permuted_unitary(meta, P_i, P_o):
-            N = meta['N']
-            circ_tmp = Circuit(N)
-            circ_tmp.add_Permutation(list(P_i))
-            circ_tmp.add_Circuit(meta['circuit'])
-            circ_tmp.add_Permutation(list(P_o))
-            return circ_tmp.get_Matrix(meta['params'])
-
-        def _topo_key(mini_topology):
-            return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
+        # ---- Phase 2: Fine-grained parallel synthesis ----
+        results_map = self._run_parallel_synthesis(partition_meta)
+        for partition_idx, result in results_map.items():
+            optimized_results[partition_idx] = result
+
+        # ---- Phase 3: ILP partition selection with synthesis-cost weights ----
+        weights = []
+        for idx, result in enumerate(optimized_results[:len(allparts)]):
+            if isinstance(result, SingleQubitPartitionResult):
+                weights.append(0)
+            else:
+                weights.append(result.get_partition_synthesis_score())
+
+        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
+
+        # ---- Phase 4: Reuse Phase 2 results (no re-synthesis) ----
+        # Build non-overlapping parts from selected allparts + standalone chains.
+        # Phase 2 already synthesized each allpart (with surrounded chains included),
+        # so we reuse those results directly.
+        selected_surrounded_starts = set()
+        selected_parts_gates = []
+        for i in L_parts:
+            part = allparts[i]
+            surrounded = {t for s in part for t in go[s]
+                         if t in single_qubit_chains_prepost
+                         and go[single_qubit_chains_prepost[t][-1]]
+                         and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
+            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded))
+            selected_parts_gates.append(gates)
+            selected_surrounded_starts.update(surrounded)
+
+        # Non-surrounded chains become standalone SingleQubitPartitionResult entries
+        standalone_chains = []
+        for chain in single_qubit_chains:
+            if chain[0] not in selected_surrounded_starts:
+                selected_parts_gates.append(frozenset(chain))
+                standalone_chains.append(chain)
+
+        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], selected_parts_gates)
+
+        n_selected = len(L_parts)
+        optimized_partitions = []
+        for part_idx in L:
+            if part_idx < n_selected:
+                # Multi-qubit partition — reuse Phase 2 PartitionSynthesisResult
+                optimized_partitions.append(optimized_results[L_parts[part_idx]])
+            else:
+                # Standalone single-qubit chain
+                chain = standalone_chains[part_idx - n_selected]
+                c = Circuit(qbit_num_orig_circuit)
+                chain_params = []
+                for gate_idx in chain:
+                    c.add_Gate(gate_dict[gate_idx])
+                    start = gate_dict[gate_idx].get_Parameter_Start_Index()
+                    chain_params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
+                chain_parameters = np.concatenate(chain_params) if chain_params else np.array([])
+                optimized_partitions.append(SingleQubitPartitionResult(c, chain_parameters))
+
+        return optimized_partitions
+
+    def _run_parallel_synthesis(self, partition_meta):
+        """Phase 2: Run parallel synthesis for all multi-qubit partitions.
 
+        Args:
+            partition_meta: List of per-partition dicts (None for single-qubit partitions).
+
+        Returns:
+            results_map: Dict mapping partition_idx to PartitionSynthesisResult.
+        """
+        n_cpus = mp.cpu_count()
+        use_auts = self.config.get('use_automorphisms', True)
+        disable_pbar = self.config.get('progressbar', 0) == False
+        aut_cache = {}
         # Cache: (rounded_unitary_bytes, topo_key) -> (circuit, parameters)
         # Avoids redundant C++ decompositions when different partitions or
         # permutations produce the same unitary matrix.
         decomp_cache = {}
 
-        def _cache_key(Umtx, mini_topology):
-            return (np.round(Umtx, decimals=10).tobytes(), _topo_key(mini_topology))
-
-        def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
-                                  topology_idx, N, mini_topology, known_pairs, pair_key):
-            """Add a synthesis result and derive automorphism equivalents."""
-            result.add_result(perm_pair, synth_circuit, synth_params, topology_idx)
-            if use_auts:
-                if pair_key not in known_pairs:
-                    known_pairs[pair_key] = set()
-                known_pairs[pair_key].add(perm_pair)
-                P_i, P_o = perm_pair
-                auts = _get_auts(mini_topology)
-                identity = tuple(range(N))
-                for sigma in auts:
-                    if sigma == identity:
-                        continue
-                    new_P_i, new_P_o, new_circ, new_params = derive_result_from_automorphism(
-                        sigma, P_i, P_o, synth_circuit, synth_params, N
-                    )
-                    if (new_P_i, new_P_o) not in known_pairs[pair_key]:
-                        result.add_result((new_P_i, new_P_o), new_circ, new_params, topology_idx)
-                        known_pairs[pair_key].add((new_P_i, new_P_o))
-
         with Pool(processes=n_cpus, initializer=_init_decompose_worker,
                   initargs=(self.config,)) as pool:
             # Initialize PartitionSynthesisResult for each multi-qubit partition
@@ -403,8 +462,8 @@ def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                     P_o_initial = perms_all[np.random.choice(len(perms_all))]
                     stage1_P_o[(partition_idx, topology_idx)] = P_o_initial
                     for P_i in perms_all:
-                        Umtx = _build_permuted_unitary(meta, P_i, P_o_initial)
-                        ck = _cache_key(Umtx, mini_topology)
+                        Umtx = self._build_permuted_unitary(meta, P_i, P_o_initial)
+                        ck = self._cache_key(Umtx, mini_topology)
                         if ck in decomp_cache:
                             stage1_cached.append((partition_idx, topology_idx, P_i, ck))
                         else:
@@ -421,10 +480,10 @@ def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                 mini_topology = meta['mini_topologies'][topology_idx]
                 synth_circuit, synth_params = decomp_cache[ck]
                 pair_key = (partition_idx, topology_idx)
-                _add_result_with_auts(
+                self._add_result_with_auts(
                     results_map[partition_idx], (P_i, P_o_initial),
                     synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key
+                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                 )
 
             # Collect Stage 1 pool results
@@ -440,10 +499,10 @@ def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                 P_o_initial = stage1_P_o[(partition_idx, topology_idx)]
                 mini_topology = meta['mini_topologies'][topology_idx]
                 pair_key = (partition_idx, topology_idx)
-                _add_result_with_auts(
+                self._add_result_with_auts(
                     results_map[partition_idx], (P_i, P_o_initial),
                     synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key
+                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                 )
 
             # ---- Stage 2: fix best P_i from Stage 1, sweep all P_o ----
@@ -463,8 +522,8 @@ def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                     for P_o in perms_all:
                         if use_auts and (tuple(P_i_best), P_o) in kp:
                             continue
-                        Umtx = _build_permuted_unitary(meta, P_i_best, P_o)
-                        ck = _cache_key(Umtx, mini_topology)
+                        Umtx = self._build_permuted_unitary(meta, P_i_best, P_o)
+                        ck = self._cache_key(Umtx, mini_topology)
                         if ck in decomp_cache:
                             stage2_cached.append((partition_idx, topology_idx, P_i_best, P_o, ck))
                         else:
@@ -480,10 +539,10 @@ def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                 mini_topology = meta['mini_topologies'][topology_idx]
                 synth_circuit, synth_params = decomp_cache[ck]
                 pair_key = (partition_idx, topology_idx)
-                _add_result_with_auts(
+                self._add_result_with_auts(
                     results_map[partition_idx], (tuple(P_i_best), P_o),
                     synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key
+                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                 )
 
             # Collect Stage 2 pool results
@@ -498,69 +557,13 @@ def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                 N = meta['N']
                 mini_topology = meta['mini_topologies'][topology_idx]
                 pair_key = (partition_idx, topology_idx)
-                _add_result_with_auts(
+                self._add_result_with_auts(
                     results_map[partition_idx], (tuple(P_i_best), P_o),
                     synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key
+                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                 )
 
-            # Store assembled results
-            for partition_idx, result in results_map.items():
-                optimized_results[partition_idx] = result
-        # ---- Phase 3: ILP partition selection with synthesis-cost weights ----
-        weights = []
-        for idx, result in enumerate(optimized_results[:len(allparts)]):
-            if isinstance(result, SingleQubitPartitionResult):
-                weights.append(0)
-            else:
-                weights.append(result.get_partition_synthesis_score())
-
-        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-
-        # ---- Phase 4: Reuse Phase 2 results (no re-synthesis) ----
-        # Build non-overlapping parts from selected allparts + standalone chains.
-        # Phase 2 already synthesized each allpart (with surrounded chains included),
-        # so we reuse those results directly.
-        selected_surrounded_starts = set()
-        selected_parts_gates = []
-        for i in L_parts:
-            part = allparts[i]
-            surrounded = {t for s in part for t in go[s]
-                         if t in single_qubit_chains_prepost
-                         and go[single_qubit_chains_prepost[t][-1]]
-                         and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
-            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded))
-            selected_parts_gates.append(gates)
-            selected_surrounded_starts.update(surrounded)
-
-        # Non-surrounded chains become standalone SingleQubitPartitionResult entries
-        standalone_chains = []
-        for chain in single_qubit_chains:
-            if chain[0] not in selected_surrounded_starts:
-                selected_parts_gates.append(frozenset(chain))
-                standalone_chains.append(chain)
-
-        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], selected_parts_gates)
-
-        n_selected = len(L_parts)
-        optimized_partitions = []
-        for part_idx in L:
-            if part_idx < n_selected:
-                # Multi-qubit partition — reuse Phase 2 PartitionSynthesisResult
-                optimized_partitions.append(optimized_results[L_parts[part_idx]])
-            else:
-                # Standalone single-qubit chain
-                chain = standalone_chains[part_idx - n_selected]
-                c = Circuit(qbit_num_orig_circuit)
-                chain_params = []
-                for gate_idx in chain:
-                    c.add_Gate(gate_dict[gate_idx])
-                    start = gate_dict[gate_idx].get_Parameter_Start_Index()
-                    chain_params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
-                chain_parameters = np.concatenate(chain_params) if chain_params else np.array([])
-                optimized_partitions.append(SingleQubitPartitionResult(c, chain_parameters))
-
-        return optimized_partitions
+        return results_map
 
     # ------------------------------------------------------------------------
     # Main Public API
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 8cbbcb7c3..4a55f9cf6 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -1,10 +1,13 @@
-import numpy as np
-from typing import List, Tuple, Set, FrozenSet
-from itertools import permutations, combinations
-from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
 import heapq
 import logging
 from collections import defaultdict
+from dataclasses import dataclass
+from itertools import combinations, permutations
+from typing import Dict, FrozenSet, List, Set, Tuple
+
+import numpy as np
+
+from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
 
 
 # ============================================================================
@@ -305,10 +308,6 @@ def derive_result_from_automorphism(sigma, P_i, P_o, circuit, parameters, N):
     return new_P_i, new_P_o, new_circuit, parameters
 
 
-# ============================================================================
-# Distance & Cost Calculations
-# ============================================================================
-
 # ============================================================================
 # Data Classes
 # ============================================================================
@@ -519,6 +518,18 @@ def get_final_circuit(self,optimized_partitions,N):
         return part_circuit, part_parameters
 
 
+@dataclass(frozen=True)
+class PartitionScoreData:
+    mini_topologies: Tuple[Tuple[Tuple[int, int], ...], ...]
+    topology_candidates: Tuple[Tuple[Tuple[int, int], ...], ...]
+    permutations_pairs: Tuple[
+        Tuple[Tuple[Tuple[int, ...], Tuple[int, ...]], ...], ...
+    ]
+    circuit_structures: Tuple[Tuple[Tuple[int, ...], ...], ...]
+    qubit_map: Dict[int, int]
+    involved_qbits: Tuple[int, ...]
+
+
 # ============================================================================
 # Circuit Utilities
 # ============================================================================

From 341e7496b8cb31ce3df812275de1ef9c4c9ba00c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 20 Mar 2026 10:02:35 +0100
Subject: [PATCH 102/232] Routing speedup

---
 squander/synthesis/PartAM.py       | 158 +++++++++++++++++++++++------
 squander/synthesis/PartAM_utils.py |  10 ++
 2 files changed, 138 insertions(+), 30 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index f7c8a5bf5..71ba1dcd4 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -714,6 +714,8 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         E_W = self.config.get('E_weight', 0.5)
         E_alpha = self.config.get('E_alpha', 0.9)
 
+        neighbor_data = self._precompute_neighbor_data(scoring_partitions, reverse=False)
+
         while len(F) != 0:
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
@@ -736,6 +738,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             E=E,
                             W=E_W,
                             alpha=E_alpha,
+                            neighbor_data=neighbor_data,
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -803,6 +806,8 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
         E_W = self.config.get('E_weight', 0.5)
         E_alpha = self.config.get('E_alpha', 0.9)
 
+        neighbor_data = self._precompute_neighbor_data(scoring_partitions, reverse=reverse)
+
         while F:
             partition_candidates = self.obtain_partition_candidates(F, optimized_partitions)
             if not partition_candidates:
@@ -821,6 +826,7 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
                     self._swap_cache,
                     E=E, W=E_W, alpha=E_alpha,
                     reverse=reverse,
+                    neighbor_data=neighbor_data,
                 )
                 for pc in partition_candidates
             ]
@@ -890,41 +896,110 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
-                                  E=None, W=0.5, alpha=0.9, reverse=False):
-        score = 0
-        swap_weight = 1
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, reverse=reverse)
-        score += swap_weight * len(swaps) * 3
-        score += 0.1*len(partition_candidate.circuit_structure)
+    def _precompute_neighbor_data(scoring_partitions, reverse=False):
+        """Precompute resolved virtual qubit edges for all scoring partitions.
 
-        for partition_idx in F:
-            partition = scoring_partitions[partition_idx]
-            if partition is None or partition_idx == partition_candidate.partition_idx:
+        Returns a dict mapping partition_idx to (cnot_arr, q_u_arr, q_v_arr)
+        where arrays are padded numpy arrays for vectorized scoring.
+        Partitions that are None are skipped.
+        """
+        neighbor_data = {}
+        for idx, partition in enumerate(scoring_partitions):
+            if partition is None:
                 continue
             qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
-            mini_scores = []
+            cnot_list = []
+            q_u_list = []
+            q_v_list = []
+            edge_counts = []
+
             for tdx, mini_topology in enumerate(partition.mini_topologies):
                 for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
-                    cnot_count = len(partition.circuit_structures[tdx][pdx])
-                    # In reverse pass, the "entry" side of neighbor partitions
-                    # is their output (P_o), not their input (P_i).
+                    cnot_list.append(len(partition.circuit_structures[tdx][pdx]))
                     P_route = P_o if reverse else P_i
+                    eu = []
+                    ev = []
                     if mini_topology:
-                        routing_cost = swap_weight * 3 * sum(
-                            max(0, D[int(output_perm[qbit_map_inv[P_route[u]]])][int(output_perm[qbit_map_inv[P_route[v]]])] - 1)
-                            for u, v in mini_topology
-                        )
+                        for u, v in mini_topology:
+                            eu.append(qbit_map_inv[P_route[u]])
+                            ev.append(qbit_map_inv[P_route[v]])
+                    q_u_list.append(eu)
+                    q_v_list.append(ev)
+                    edge_counts.append(len(eu))
+
+            if not cnot_list:
+                continue
+
+            n_combos = len(cnot_list)
+            max_edges = max(edge_counts)
+            cnot_arr = np.array(cnot_list, dtype=np.float64)
+
+            if max_edges > 0:
+                # Pad with 0: output_perm[0] maps to some physical qubit p,
+                # D[p][p] = 0, so max(0, 0-1) = 0 — padding contributes nothing.
+                q_u_arr = np.zeros((n_combos, max_edges), dtype=np.intp)
+                q_v_arr = np.zeros((n_combos, max_edges), dtype=np.intp)
+                for i in range(n_combos):
+                    ne = edge_counts[i]
+                    if ne > 0:
+                        q_u_arr[i, :ne] = q_u_list[i]
+                        q_v_arr[i, :ne] = q_v_list[i]
+                neighbor_data[idx] = (cnot_arr, q_u_arr, q_v_arr)
+            else:
+                neighbor_data[idx] = (cnot_arr, None, None)
+
+        return neighbor_data
+
+    @staticmethod
+    def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
+                                  E=None, W=0.5, alpha=0.9, reverse=False,
+                                  neighbor_data=None):
+        score = 0
+        swap_weight = 1
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, reverse=reverse)
+        score += swap_weight * len(swaps) * 3
+        score += 0.1*len(partition_candidate.circuit_structure)
+
+        if neighbor_data is not None:
+            output_perm_arr = np.asarray(output_perm, dtype=np.intp)
+            D_arr = np.asarray(D)
+
+            for partition_idx in F:
+                if partition_idx == partition_candidate.partition_idx:
+                    continue
+                entry = neighbor_data.get(partition_idx)
+                if entry is None:
+                    continue
+                cnot_arr, q_u_arr, q_v_arr = entry
+                if q_u_arr is not None:
+                    phys_u = output_perm_arr[q_u_arr]
+                    phys_v = output_perm_arr[q_v_arr]
+                    routing = 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum(axis=1)
+                    score += float((routing + cnot_arr).min())
+                else:
+                    score += float(cnot_arr.min())
+
+            if E:
+                e_score = 0.0
+                for partition_idx, depth in E:
+                    if partition_idx == partition_candidate.partition_idx:
+                        continue
+                    entry = neighbor_data.get(partition_idx)
+                    if entry is None:
+                        continue
+                    cnot_arr, q_u_arr, q_v_arr = entry
+                    if q_u_arr is not None:
+                        phys_u = output_perm_arr[q_u_arr]
+                        phys_v = output_perm_arr[q_v_arr]
+                        routing = 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum(axis=1)
+                        e_score += float((routing + cnot_arr).min()) * (alpha ** depth)
                     else:
-                        routing_cost = 0
-                    mini_scores.append(routing_cost + cnot_count)
-            if mini_scores:
-                score += np.min(mini_scores)
-
-        # Extended set look-ahead scoring
-        if E:
-            e_score = 0
-            for partition_idx, depth in E:
+                        e_score += float(cnot_arr.min()) * (alpha ** depth)
+                if len(E) > 0:
+                    score += W * e_score / len(E)
+        else:
+            # Fallback: original Python loop (no precomputed data)
+            for partition_idx in F:
                 partition = scoring_partitions[partition_idx]
                 if partition is None or partition_idx == partition_candidate.partition_idx:
                     continue
@@ -943,9 +1018,32 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                             routing_cost = 0
                         mini_scores.append(routing_cost + cnot_count)
                 if mini_scores:
-                    e_score += np.min(mini_scores) * (alpha ** depth)
-            if len(E) > 0:
-                score += W * e_score / len(E)
+                    score += min(mini_scores)
+
+            if E:
+                e_score = 0
+                for partition_idx, depth in E:
+                    partition = scoring_partitions[partition_idx]
+                    if partition is None or partition_idx == partition_candidate.partition_idx:
+                        continue
+                    qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
+                    mini_scores = []
+                    for tdx, mini_topology in enumerate(partition.mini_topologies):
+                        for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
+                            cnot_count = len(partition.circuit_structures[tdx][pdx])
+                            P_route = P_o if reverse else P_i
+                            if mini_topology:
+                                routing_cost = swap_weight * 3 * sum(
+                                    max(0, D[int(output_perm[qbit_map_inv[P_route[u]]])][int(output_perm[qbit_map_inv[P_route[v]]])] - 1)
+                                    for u, v in mini_topology
+                                )
+                            else:
+                                routing_cost = 0
+                            mini_scores.append(routing_cost + cnot_count)
+                    if mini_scores:
+                        e_score += min(mini_scores) * (alpha ** depth)
+                if len(E) > 0:
+                    score += W * e_score / len(E)
 
         return score
 
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 4a55f9cf6..bf8478edb 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -227,7 +227,14 @@ def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: Lis
             matches.append(induced)  # global labels retained for routing
     return matches
 
+_node_mapping_cache = {}
+
 def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int, int]]) -> dict:
+    cache_key = (tuple(tuple(e) for e in topology1), tuple(tuple(e) for e in topology2))
+    cached = _node_mapping_cache.get(cache_key)
+    if cached is not None:
+        return cached
+
     qubits1 = set()
     for u, v in topology1:
         qubits1.add(u)
@@ -237,6 +244,7 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
         qubits2.add(u)
         qubits2.add(v)
     if len(qubits1) != len(qubits2):
+        _node_mapping_cache[cache_key] = {}
         return {}
     sorted_qubits1 = sorted(qubits1)
     sorted_qubits2 = sorted(qubits2)
@@ -248,7 +256,9 @@ def get_node_mapping(topology1: List[Tuple[int, int]], topology2: List[Tuple[int
             mapped_edges.add(tuple(sorted([mapping[u], mapping[v]])))
         original_edges = set(tuple(sorted([u, v])) for u, v in topology2)
         if mapped_edges == original_edges:
+            _node_mapping_cache[cache_key] = mapping
             return mapping
+    _node_mapping_cache[cache_key] = {}
     return {}
 
 

From 7fe9983e675976eedd3fb1cb80fb18a34d64a214 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 20 Mar 2026 10:08:03 +0100
Subject: [PATCH 103/232] Add cleanup in between routing passes

---
 squander/synthesis/PartAM.py | 170 +++++++++++++++++++++++++----------
 1 file changed, 121 insertions(+), 49 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 71ba1dcd4..1ae0b8d34 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -588,6 +588,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         n_iterations = self.config.get('sabre_iterations', 1)
         n_trials = self.config.get('n_layout_trials', 1)
         random_seed = self.config.get('random_seed', 42)
+        do_cleanup = self.config.get('cleanup', True)
         routing_start = time.time()
         if n_iterations == 0:
             # Single forward pass from identity layout
@@ -598,64 +599,135 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 scoring_partitions=scoring_partitions, D=D,
             )
         else:
-            best_pi = None
-            best_cost = float('inf')
-
-            for trial in range(max(1, n_trials)):
-                rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
-                pi = np.arange(N)
-
-                for iteration in range(n_iterations):
-                    # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
-                    F_rev = self.get_final_layer(DAG, N, optimized_partitions)
-                    pi, _ = self._heuristic_search_layout_only(
-                        F_rev, pi, IDAG, DAG, optimized_partitions, scoring_partitions, D,
-                        rng=rng,
-                        reverse=True,
+            if do_cleanup:
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
+                cleanup_config = dict(self.config)
+                cleanup_config['topology'] = self.topology
+                cleanup_config['routed'] = True
+                cleanup_config['test_subcircuits'] = False
+                cleanup_config['test_final_circuit'] = False
+                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+
+                # Save single-qubit partition circuits before trial loop
+                saved_sq_circuits = {i: p.circuit for i, p in enumerate(optimized_partitions)
+                                     if isinstance(p, SingleQubitPartitionResult)}
+
+                best_circuit = best_params = best_pi_init = best_pi = None
+                best_cost = float('inf')
+
+                for trial in range(max(1, n_trials)):
+                    rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
+                    pi = np.arange(N)
+
+                    for iteration in range(n_iterations):
+                        # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
+                        F_rev = self.get_final_layer(DAG, N, optimized_partitions)
+                        pi, _ = self._heuristic_search_layout_only(
+                            F_rev, pi, IDAG, DAG, optimized_partitions, scoring_partitions, D,
+                            rng=rng,
+                            reverse=True,
+                        )
+
+                        # Forward layout-only pass (skip on last iteration)
+                        if iteration < n_iterations - 1:
+                            F_fwd = self.get_initial_layer(IDAG, N, optimized_partitions)
+                            pi, _ = self._heuristic_search_layout_only(
+                                F_fwd, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                                rng=rng,
+                            )
+
+                    # Restore single-qubit partition circuits before full forward pass
+                    for i, orig in saved_sq_circuits.items():
+                        optimized_partitions[i].circuit = orig.copy()
+
+                    # Full forward pass
+                    F_trial = self.get_initial_layer(IDAG, N, optimized_partitions)
+                    partition_order, pi_out, pi_init = self.Heuristic_Search(
+                        F_trial, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
                     )
 
-                    # Forward layout-only pass (skip on last iteration — real pass follows)
-                    if iteration < n_iterations - 1:
-                        F_fwd = self.get_initial_layer(IDAG, N, optimized_partitions)
+                    # Build circuit + cleanup
+                    trial_circuit, trial_params = self.Construct_circuit_from_HS(
+                        partition_order, optimized_partitions, N,
+                    )
+                    pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0)
+                    trial_circuit, trial_params = wco.OptimizeWideCircuit(
+                        trial_circuit.get_Flat_Circuit(), trial_params,
+                    )
+
+                    cost = trial_circuit.get_Gate_Nums().get('CNOT', 0)
+
+                    if cost < best_cost:
+                        best_cost = cost
+                        best_pre_cleanup = pre_cleanup_cnots
+                        best_circuit, best_params = trial_circuit, trial_params
+                        best_pi_init, best_pi = pi_init, pi_out
+
+                final_circuit, final_parameters = best_circuit, best_params
+                pi_initial, pi = best_pi_init, best_pi
+
+            else:
+                best_pi = None
+                best_cost = float('inf')
+
+                for trial in range(max(1, n_trials)):
+                    rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
+                    pi = np.arange(N)
+
+                    for iteration in range(n_iterations):
+                        # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
+                        F_rev = self.get_final_layer(DAG, N, optimized_partitions)
                         pi, _ = self._heuristic_search_layout_only(
-                            F_fwd, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                            F_rev, pi, IDAG, DAG, optimized_partitions, scoring_partitions, D,
                             rng=rng,
+                            reverse=True,
                         )
 
-                # Score this trial: deterministic forward layout-only pass
-                F_eval = self.get_initial_layer(IDAG, N, optimized_partitions)
-                _, cost = self._heuristic_search_layout_only(
-                    F_eval, pi.copy(), DAG, IDAG, optimized_partitions, scoring_partitions, D,
-                    rng=None,
-                )
+                        # Forward layout-only pass (skip on last iteration — real pass follows)
+                        if iteration < n_iterations - 1:
+                            F_fwd = self.get_initial_layer(IDAG, N, optimized_partitions)
+                            pi, _ = self._heuristic_search_layout_only(
+                                F_fwd, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                                rng=rng,
+                            )
 
-                if cost < best_cost:
-                    best_cost = cost
-                    best_pi = pi.copy()
+                    # Score this trial: deterministic forward layout-only pass
+                    F_eval = self.get_initial_layer(IDAG, N, optimized_partitions)
+                    _, cost = self._heuristic_search_layout_only(
+                        F_eval, pi.copy(), DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                        rng=None,
+                    )
 
-            # Final forward pass — builds actual circuits
-            F = self.get_initial_layer(IDAG, N, optimized_partitions)
-            partition_order, pi, pi_initial = self.Heuristic_Search(
-                F, best_pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
-            )
+                    if cost < best_cost:
+                        best_cost = cost
+                        best_pi = pi.copy()
 
-        final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
-        self._routing_time = time.time() - routing_start
-        self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get('CNOT', 0)
-
-        # Cleanup phase: re-partition and resynthesize to eliminate
-        # redundancies at SWAP-partition boundaries
-        if self.config.get('cleanup', True):
-            from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
-            cleanup_config = dict(self.config)
-            cleanup_config['topology'] = self.topology
-            cleanup_config['routed'] = True
-            cleanup_config['test_subcircuits'] = False
-            cleanup_config['test_final_circuit'] = False
-            wco = qgd_Wide_Circuit_Optimization(cleanup_config)
-            final_circuit, final_parameters = wco.OptimizeWideCircuit(
-                final_circuit.get_Flat_Circuit(), final_parameters
-            )
+                # Final forward pass — builds actual circuits
+                F = self.get_initial_layer(IDAG, N, optimized_partitions)
+                partition_order, pi, pi_initial = self.Heuristic_Search(
+                    F, best_pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                )
+
+        if do_cleanup and n_iterations > 0:
+            # Cleanup already done per-trial
+            self._routing_time = time.time() - routing_start
+            self._cnot_pre_cleanup = best_pre_cleanup
+        else:
+            final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
+            self._routing_time = time.time() - routing_start
+            self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get('CNOT', 0)
+
+            if self.config.get('cleanup', True):
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
+                cleanup_config = dict(self.config)
+                cleanup_config['topology'] = self.topology
+                cleanup_config['routed'] = True
+                cleanup_config['test_subcircuits'] = False
+                cleanup_config['test_final_circuit'] = False
+                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+                final_circuit, final_parameters = wco.OptimizeWideCircuit(
+                    final_circuit.get_Flat_Circuit(), final_parameters
+                )
 
         return final_circuit, final_parameters, pi_initial, pi
 

From 3ea5ac7bde7c356c77460110d27214c701626f89 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 17 Apr 2026 22:38:24 +0200
Subject: [PATCH 104/232] Add in improvements

---
 .../qgd_Wide_Circuit_Optimization.py          |  75 +---
 squander/synthesis/PartAM.py                  | 393 +++++++++++++++++-
 squander/synthesis/PartAM_utils.py            |  88 +++-
 3 files changed, 466 insertions(+), 90 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 0018d3590..3b252155d 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1001,72 +1001,33 @@ def OptimizeWideCircuit( self, circ: Circuit, orig_parameters: np.ndarray, globa
         # the list of parameters associated with the optimized subcircuits
         optimized_parameter_list = [None] * len(subcircuits)
 
-        if parent_process() is not None:
-            #  code for iterate over partitions and optimize them
-            for partition_idx, subcircuit in enumerate( subcircuits ):
-        
+        async_results = [None] * len(subcircuits)
+        n_cpus = mp.cpu_count()
+        with Pool(processes=n_cpus) as pool:
 
-                # isolate the parameters corresponding to the given sub-circuit
+            for partition_idx, subcircuit in enumerate(subcircuits):
                 start_idx = subcircuit.get_Parameter_Start_Index()
                 end_idx   = start_idx + subcircuit.get_Parameter_Num()
-                subcircuit_parameters = parameters[ start_idx:end_idx ]
-    
-        
-            
-                # callback function done on the master process to compare the new decomposed and the original suncircuit
-                callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, *(z[0] for z in x)], [subcircuit_parameters, *(z[1] for z in x)] )
+                subcircuit_parameters = parameters[start_idx:end_idx]
 
-                # call a process to decompose a subcircuit
                 config = {**self.config, 'tree_level_max': max(0, subcircuit.get_Gate_Nums().get('CNOT', 0)-1)}
                 config = config if structures is None or partition_idx >= len(structures) else {**config, 'strategy': 'Custom', 'max_inner_iterations': 10000, 'max_iteration_loops': 4}
-                new_subcircuit, new_parameters = callback_fnc(self.PartitionDecompositionProcess( subcircuit, subcircuit_parameters, config,
-                                                                                     None if structures is None or partition_idx >= len(structures) else structures[partition_idx] ))
-                if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
-                    print( "original subcircuit:    ", subcircuit.get_Gate_Nums())
-                    print( "reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums())
-
-                if partition_idx % 100 == 99 and self.config["verbosity"] > 0: print(partition_idx+1, "partitions optimized")
-                optimized_subcircuits[ partition_idx ] = new_subcircuit
-                optimized_parameter_list[ partition_idx ] = new_parameters
-        else:
-            # list of AsyncResult objects (for 2-qubit) or direct results (for 1-qubit and 3+ qubit)
-            async_results = [None] * len(subcircuits)
-            n_cpus = mp.cpu_count()
-            with Pool(processes=n_cpus) as pool:
-
-                #  code for iterate over partitions and optimize them
-                for partition_idx, subcircuit in enumerate( subcircuits ):
 
+                async_results[partition_idx] = pool.apply_async(self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
+                                                                                                    None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
 
-                    # isolate the parameters corresponding to the given sub-circuit
-                    start_idx = subcircuit.get_Parameter_Start_Index()
-                    end_idx   = start_idx + subcircuit.get_Parameter_Num()
-                    subcircuit_parameters = parameters[ start_idx:end_idx ]
-
-
-
-                    # call a process to decompose a subcircuit
-                    config = {**self.config, 'tree_level_max': max(0, subcircuit.get_Gate_Nums().get('CNOT', 0)-1)}
-                    config = config if structures is None or partition_idx >= len(structures) else {**config, 'strategy': 'Custom', 'max_inner_iterations': 10000, 'max_iteration_loops': 4}
-
-                    qbit_num_sub = len(subcircuit.get_Qbits())
-                    async_results[partition_idx]  = pool.apply_async( self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
-                                                                                                        None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
-
-                #  code for iterate over async results and retrieve the new subcircuits
-                for partition_idx, subcircuit in enumerate( subcircuits ):
-                    # callback function done on the master process to compare the new decomposed and the original suncircuit
-                    start_idx = subcircuit.get_Parameter_Start_Index()
-                    subcircuit_parameters = parameters[ start_idx:start_idx + subcircuit.get_Parameter_Num() ]
-                    callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, *(z[0] for z in x)], [subcircuit_parameters, *(z[1] for z in x)] )
-                    new_subcircuit, new_parameters = callback_fnc(async_results[partition_idx].get( timeout = None ))
+            for partition_idx, subcircuit in enumerate(subcircuits):
+                start_idx = subcircuit.get_Parameter_Start_Index()
+                subcircuit_parameters = parameters[start_idx:start_idx + subcircuit.get_Parameter_Num()]
+                callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, *(z[0] for z in x)], [subcircuit_parameters, *(z[1] for z in x)] )
+                new_subcircuit, new_parameters = callback_fnc(async_results[partition_idx].get(timeout=None))
 
-                    if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
-                        print( "original subcircuit:    ", subcircuit.get_Gate_Nums())
-                        print( "reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums())
-                    if partition_idx % 100 == 99 and self.config["verbosity"] > 0: print(partition_idx+1, "partitions optimized")
-                    optimized_subcircuits[ partition_idx ] = new_subcircuit
-                    optimized_parameter_list[ partition_idx ] = new_parameters
+                if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
+                    print("original subcircuit:    ", subcircuit.get_Gate_Nums())
+                    print("reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums())
+                if partition_idx % 100 == 99 and self.config["verbosity"] > 0: print(partition_idx+1, "partitions optimized")
+                optimized_subcircuits[partition_idx] = new_subcircuit
+                optimized_parameter_list[partition_idx] = new_parameters
 
 
         # construct the wide circuit from the optimized suncircuits
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 1ae0b8d34..3dbe858a6 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -25,7 +25,6 @@
     topo_sort_partitions,
     ilp_global_optimal,
 )
-
 # Module-level globals for pool workers (set via Pool initializer)
 _worker_config = None
 
@@ -93,6 +92,13 @@ def __init__(self, config):
         self.config.setdefault('score_tolerance', 0.05)
         self.config.setdefault('random_seed', 42)
         self.config.setdefault('cleanup', True)
+        self.config.setdefault('prefilter_top_k', 50)
+        self.config.setdefault('neighbor_weight', 0.5)
+        self.config.setdefault('E_overlap_floor', 0.2)
+        self.config.setdefault('branch_budget', 3)
+        self.config.setdefault('branch_threshold', 0.1)
+        self.config.setdefault('congestion_weight', 0.1)
+        self.config.setdefault('congestion_decay', 0.9)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -101,6 +107,7 @@ def __init__(self, config):
         # Initialize caches for performance optimization
         self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
         self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
+        self._adj = None          # Precomputed adjacency list (built by compute_distances_bfs)
 
     # ------------------------------------------------------------------------
     # Caching Methods
@@ -430,9 +437,6 @@ def _run_parallel_synthesis(self, partition_meta):
         use_auts = self.config.get('use_automorphisms', True)
         disable_pbar = self.config.get('progressbar', 0) == False
         aut_cache = {}
-        # Cache: (rounded_unitary_bytes, topo_key) -> (circuit, parameters)
-        # Avoids redundant C++ decompositions when different partitions or
-        # permutations produce the same unitary matrix.
         decomp_cache = {}
 
         with Pool(processes=n_cpus, initializer=_init_decompose_worker,
@@ -652,7 +656,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                     )
                     pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0)
                     trial_circuit, trial_params = wco.OptimizeWideCircuit(
-                        trial_circuit.get_Flat_Circuit(), trial_params,
+                        trial_circuit.get_Flat_Circuit(), trial_params, global_min = False
                     )
 
                     cost = trial_circuit.get_Gate_Nums().get('CNOT', 0)
@@ -726,7 +730,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 cleanup_config['test_final_circuit'] = False
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
                 final_circuit, final_parameters = wco.OptimizeWideCircuit(
-                    final_circuit.get_Flat_Circuit(), final_parameters
+                    final_circuit.get_Flat_Circuit(), final_parameters, global_min = False
                 )
 
         return final_circuit, final_parameters, pi_initial, pi
@@ -750,6 +754,223 @@ def _select_best_candidate(self, partition_candidates, scores, rng=None):
         else:
             return partition_candidates[np.argmin(scores_array)]
 
+    def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=False):
+        """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
+        if len(partition_candidates) <= top_k:
+            return partition_candidates
+        estimates = np.array([
+            pc.estimate_swap_count(pi, D, reverse=reverse) * 3 + 0.1 * len(pc.circuit_structure)
+            for pc in partition_candidates
+        ])
+        top_k_indices = np.argpartition(estimates, top_k)[:top_k]
+        return [partition_candidates[i] for i in top_k_indices]
+
+    def _select_with_lookahead(self, partition_candidates, scores, pi, F,
+                               DAG, IDAG, resolved_partitions,
+                               optimized_partitions, scoring_partitions, D,
+                               E_W, E_alpha, E_overlap_floor,
+                               neighbor_data, neighbor_weight,
+                               reverse=False, rng=None):
+        """1-step lookahead branching: when top candidates are close in score,
+        tentatively commit each, score one step ahead, and pick the best 2-step total.
+
+        Falls back to _select_best_candidate when there is a clear winner or
+        branching is disabled (branch_budget <= 1).
+        """
+        branch_budget = self.config.get('branch_budget', 3)
+        branch_threshold = self.config.get('branch_threshold', 0.1)
+
+        if branch_budget <= 1:
+            return self._select_best_candidate(partition_candidates, scores, rng=rng)
+
+        scores_array = np.array(scores)
+        min_score = np.min(scores_array)
+
+        # Find candidates within threshold of best
+        if min_score > 0:
+            threshold = min_score * (1 + branch_threshold)
+        else:
+            threshold = branch_threshold
+        close_indices = np.where(scores_array <= threshold)[0]
+
+        if len(close_indices) <= 1:
+            return self._select_best_candidate(partition_candidates, scores, rng=rng)
+
+        # Limit to branch_budget
+        if len(close_indices) > branch_budget:
+            # Keep the top branch_budget by score
+            sorted_close = close_indices[np.argsort(scores_array[close_indices])]
+            close_indices = sorted_close[:branch_budget]
+
+        # Evaluate each branch one step ahead
+        best_branch_score = float('inf')
+        best_candidate = None
+        top_k = self.config.get('prefilter_top_k', 50)
+
+        for idx in close_indices:
+            candidate = partition_candidates[idx]
+            candidate_score = scores_array[idx]
+
+            # Tentatively apply this candidate's routing
+            temp_swap_cache = {}
+            neighbor_info = self._compute_neighbor_info(
+                candidate, tuple(F), None, neighbor_data, pi,
+                alpha=E_alpha, weight=neighbor_weight,
+            )
+            swaps, pi_next = candidate.transform_pi(
+                pi, D, temp_swap_cache, reverse=reverse,
+                adj=self._adj, neighbor_info=neighbor_info,
+            )
+
+            # Compute tentative front layer after committing this candidate
+            F_next = [p for p in F if p != candidate.partition_idx]
+            temp_resolved = list(resolved_partitions)
+            temp_resolved[candidate.partition_idx] = True
+
+            # Promote children (skip single-qubit partitions)
+            for child in DAG[candidate.partition_idx]:
+                if not temp_resolved[child] and child not in F_next:
+                    if all(temp_resolved[p] for p in IDAG[child]):
+                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                            temp_resolved[child] = True
+                            stack = list(DAG[child])
+                            while stack:
+                                gc = stack.pop()
+                                if not temp_resolved[gc] and gc not in F_next:
+                                    if all(temp_resolved[p] for p in IDAG[gc]):
+                                        if isinstance(optimized_partitions[gc], SingleQubitPartitionResult):
+                                            temp_resolved[gc] = True
+                                            stack.extend(DAG[gc])
+                                        else:
+                                            F_next.append(gc)
+                        else:
+                            F_next.append(child)
+
+            if not F_next:
+                # No next step — use candidate score alone
+                branch_score = candidate_score
+            else:
+                # Generate and score next-step candidates
+                next_candidates = self.obtain_partition_candidates(F_next, optimized_partitions)
+                if next_candidates:
+                    next_candidates = self._prefilter_candidates(
+                        next_candidates, pi_next, D, top_k, reverse=reverse
+                    )
+                    F_next_snapshot = tuple(F_next)
+                    E_next = self.generate_extended_set(
+                        F_next, DAG, IDAG, temp_resolved, optimized_partitions,
+                        max_E_size=self.config.get('max_E_size', 20),
+                        max_lookahead=self.config.get('max_lookahead', 4),
+                    )
+                    next_scores = [
+                        self.score_partition_candidate(
+                            pc, F_next_snapshot, pi_next, scoring_partitions, D,
+                            temp_swap_cache,
+                            E=E_next, W=E_W, alpha=E_alpha,
+                            reverse=reverse,
+                            neighbor_data=neighbor_data,
+                            adj=self._adj,
+                            neighbor_info=self._compute_neighbor_info(
+                                pc, F_next_snapshot, E_next, neighbor_data, pi_next,
+                                alpha=E_alpha, weight=neighbor_weight,
+                            ),
+                            E_overlap_floor=E_overlap_floor,
+                        )
+                        for pc in next_candidates
+                    ]
+                    branch_score = candidate_score + min(next_scores)
+                else:
+                    branch_score = candidate_score
+
+            if branch_score < best_branch_score:
+                best_branch_score = branch_score
+                best_candidate = candidate
+
+        return best_candidate
+
+    @staticmethod
+    def _compute_neighbor_info(partition_candidate, F, E, neighbor_data, pi,
+                               alpha=0.9, weight=0.01):
+        """Build neighbor_info dict for SABRE-aware A* tiebreaker.
+
+        Collects virtual qubit edges from front-layer and extended-set partitions
+        (excluding the current partition) so the A* can prefer SWAP paths that
+        leave future-partition qubits closer together.
+        """
+        if weight == 0 or neighbor_data is None:
+            return None
+
+        own_qubits = set(partition_candidate.involved_qbits)
+        # Collect weighted edges: (virtual_q_u, virtual_q_v, edge_weight)
+        raw_edges = []
+
+        # Front layer partitions (weight 1.0)
+        for part_idx in F:
+            if part_idx == partition_candidate.partition_idx:
+                continue
+            entry = neighbor_data.get(part_idx)
+            if entry is None:
+                continue
+            cnot_arr, q_u_arr, q_v_arr = entry
+            if q_u_arr is None:
+                continue
+            # Use the best (min-CNOT) permutation's edges
+            best_pdx = int(np.argmin(cnot_arr))
+            for e in range(q_u_arr.shape[1]):
+                qu, qv = int(q_u_arr[best_pdx, e]), int(q_v_arr[best_pdx, e])
+                if qu == qv:  # padding
+                    continue
+                if qu not in own_qubits or qv not in own_qubits:
+                    raw_edges.append((qu, qv, 1.0))
+
+        # Extended set partitions (weight alpha^depth)
+        if E:
+            for part_idx, depth in E:
+                if part_idx == partition_candidate.partition_idx:
+                    continue
+                entry = neighbor_data.get(part_idx)
+                if entry is None:
+                    continue
+                cnot_arr, q_u_arr, q_v_arr = entry
+                if q_u_arr is None:
+                    continue
+                best_pdx = int(np.argmin(cnot_arr))
+                ew = alpha ** depth
+                for e in range(q_u_arr.shape[1]):
+                    qu, qv = int(q_u_arr[best_pdx, e]), int(q_v_arr[best_pdx, e])
+                    if qu == qv:
+                        continue
+                    if qu not in own_qubits or qv not in own_qubits:
+                        raw_edges.append((qu, qv, ew))
+
+        if not raw_edges:
+            return None
+
+        # Build ordered list of unique neighbor virtual qubits
+        vq_set = set()
+        for qu, qv, _ in raw_edges:
+            vq_set.add(qu)
+            vq_set.add(qv)
+        neighbor_vqs = sorted(vq_set)
+        vq_to_idx = {vq: i for i, vq in enumerate(neighbor_vqs)}
+
+        # Convert edges to index-based, dedup by summing weights
+        edge_map = {}
+        for qu, qv, ew in raw_edges:
+            iu, iv = vq_to_idx[qu], vq_to_idx[qv]
+            key = (min(iu, iv), max(iu, iv))
+            edge_map[key] = edge_map.get(key, 0.0) + ew
+
+        edges = [(iu, iv, w) for (iu, iv), w in edge_map.items()]
+        initial_pos = tuple(int(pi[vq]) for vq in neighbor_vqs)
+
+        return {
+            'neighbor_vqs': neighbor_vqs,
+            'initial_pos': initial_pos,
+            'edges': edges,
+            'weight': weight,
+        }
+
     def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D):
         pi_initial = pi.copy()
 
@@ -785,13 +1006,24 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         max_lookahead = self.config.get('max_lookahead', 4)
         E_W = self.config.get('E_weight', 0.5)
         E_alpha = self.config.get('E_alpha', 0.9)
+        E_overlap_floor = self.config.get('E_overlap_floor', 0.2)
 
         neighbor_data = self._precompute_neighbor_data(scoring_partitions, reverse=False)
+        neighbor_weight = self.config.get('neighbor_weight', 0.5)
+
+        congestion_weight = self.config.get('congestion_weight', 0.1)
+        congestion_decay = self.config.get('congestion_decay', 0.9)
+        congestion = np.zeros(len(pi))
+        betweenness = getattr(self, '_betweenness', None)
 
         while len(F) != 0:
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
                     break
+
+                top_k = self.config.get('prefilter_top_k', 50)
+                partition_candidates = self._prefilter_candidates(partition_candidates, pi, D, top_k)
+
                 F_snapshot = tuple(F)
 
                 E = self.generate_extended_set(
@@ -811,10 +1043,25 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             W=E_W,
                             alpha=E_alpha,
                             neighbor_data=neighbor_data,
+                            adj=self._adj,
+                            neighbor_info=self._compute_neighbor_info(
+                                partition_candidate, F_snapshot, E, neighbor_data, pi,
+                                alpha=E_alpha, weight=neighbor_weight,
+                            ),
+                            E_overlap_floor=E_overlap_floor,
+                            congestion=congestion,
+                            betweenness=betweenness,
+                            congestion_weight=congestion_weight,
                         )
                         for partition_candidate in partition_candidates
                     ]
-                min_partition_candidate = self._select_best_candidate(partition_candidates, scores, rng=None)
+                min_partition_candidate = self._select_with_lookahead(
+                    partition_candidates, scores, pi, F,
+                    DAG, IDAG, resolved_partitions,
+                    optimized_partitions, scoring_partitions, D,
+                    E_W, E_alpha, E_overlap_floor,
+                    neighbor_data, neighbor_weight,
+                )
 
                 F.remove(min_partition_candidate.partition_idx)
                 resolved_partitions[min_partition_candidate.partition_idx] = True
@@ -822,9 +1069,20 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 pbar.n = resolved_count
                 pbar.refresh()
 
-                swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache)
+                neighbor_info = self._compute_neighbor_info(
+                    min_partition_candidate, F_snapshot, E, neighbor_data, pi,
+                    alpha=E_alpha, weight=neighbor_weight,
+                )
+                swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache, adj=self._adj, neighbor_info=neighbor_info)
                 if len(swap_order)!=0:
                     partition_order.append(construct_swap_circuit(swap_order, len(pi)))
+                    # Update congestion: increment for nodes used in SWAPs
+                    for p1, p2 in swap_order:
+                        congestion[p1] += 1.0
+                        congestion[p2] += 1.0
+
+                # Decay congestion each step
+                congestion *= congestion_decay
 
                 partition_order.append(min_partition_candidate)
                 children = list(DAG[min_partition_candidate.partition_idx])
@@ -877,14 +1135,25 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
         max_lookahead = self.config.get('max_lookahead', 4)
         E_W = self.config.get('E_weight', 0.5)
         E_alpha = self.config.get('E_alpha', 0.9)
+        E_overlap_floor = self.config.get('E_overlap_floor', 0.2)
 
         neighbor_data = self._precompute_neighbor_data(scoring_partitions, reverse=reverse)
+        neighbor_weight = self.config.get('neighbor_weight', 0.5)
+
+        congestion_weight = self.config.get('congestion_weight', 0.1)
+        congestion_decay = self.config.get('congestion_decay', 0.9)
+        N_layout = len(pi)
+        congestion = np.zeros(N_layout)
+        betweenness = getattr(self, '_betweenness', None)
 
         while F:
             partition_candidates = self.obtain_partition_candidates(F, optimized_partitions)
             if not partition_candidates:
                 break
 
+            top_k = self.config.get('prefilter_top_k', 50)
+            partition_candidates = self._prefilter_candidates(partition_candidates, pi, D, top_k, reverse=reverse)
+
             F_snapshot = tuple(F)
 
             E = self.generate_extended_set(
@@ -899,17 +1168,43 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
                     E=E, W=E_W, alpha=E_alpha,
                     reverse=reverse,
                     neighbor_data=neighbor_data,
+                    adj=self._adj,
+                    neighbor_info=self._compute_neighbor_info(
+                        pc, F_snapshot, E, neighbor_data, pi,
+                        alpha=E_alpha, weight=neighbor_weight,
+                    ),
+                    E_overlap_floor=E_overlap_floor,
+                    congestion=congestion,
+                    betweenness=betweenness,
+                    congestion_weight=congestion_weight,
                 )
                 for pc in partition_candidates
             ]
 
-            best = self._select_best_candidate(partition_candidates, scores, rng=rng)
+            best = self._select_with_lookahead(
+                partition_candidates, scores, pi, F,
+                DAG, IDAG, resolved_partitions,
+                optimized_partitions, scoring_partitions, D,
+                E_W, E_alpha, E_overlap_floor,
+                neighbor_data, neighbor_weight,
+                reverse=reverse, rng=rng,
+            )
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
-            swaps, pi = best.transform_pi(pi, D, self._swap_cache, reverse=reverse)
+            neighbor_info = self._compute_neighbor_info(
+                best, F_snapshot, E, neighbor_data, pi,
+                alpha=E_alpha, weight=neighbor_weight,
+            )
+            swaps, pi = best.transform_pi(pi, D, self._swap_cache, reverse=reverse, adj=self._adj, neighbor_info=neighbor_info)
             total_swaps += len(swaps)
 
+            # Update and decay congestion
+            for p1, p2 in swaps:
+                congestion[p1] += 1.0
+                congestion[p2] += 1.0
+            congestion *= congestion_decay
+
             # Promote children
             for child in DAG[best.partition_idx]:
                 if not resolved_partitions[child] and child not in F:
@@ -1025,13 +1320,23 @@ def _precompute_neighbor_data(scoring_partitions, reverse=False):
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
-                                  neighbor_data=None):
+                                  neighbor_data=None, adj=None, neighbor_info=None,
+                                  E_overlap_floor=0.2,
+                                  congestion=None, betweenness=None, congestion_weight=0.0):
         score = 0
         swap_weight = 1
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, reverse=reverse)
+        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, reverse=reverse, adj=adj, neighbor_info=neighbor_info)
         score += swap_weight * len(swaps) * 3
         score += 0.1*len(partition_candidate.circuit_structure)
 
+        # Congestion penalty: penalize SWAP paths through congested bottleneck nodes
+        if congestion is not None and betweenness is not None and congestion_weight > 0 and swaps:
+            cong_penalty = 0.0
+            for p1, p2 in swaps:
+                cong_penalty += congestion[p1] * betweenness[p1]
+                cong_penalty += congestion[p2] * betweenness[p2]
+            score += congestion_weight * cong_penalty
+
         if neighbor_data is not None:
             output_perm_arr = np.asarray(output_perm, dtype=np.intp)
             D_arr = np.asarray(D)
@@ -1053,20 +1358,31 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
 
             if E:
                 e_score = 0.0
+                cand_qubits = set(partition_candidate.involved_qbits)
                 for partition_idx, depth in E:
                     if partition_idx == partition_candidate.partition_idx:
                         continue
                     entry = neighbor_data.get(partition_idx)
                     if entry is None:
                         continue
+                    # Overlap-aware decay: partitions sharing qubits with
+                    # the candidate are weighted more heavily.
+                    e_part = scoring_partitions[partition_idx]
+                    if e_part is not None and e_part.involved_qbits:
+                        e_qubits = set(e_part.involved_qbits)
+                        overlap = len(cand_qubits & e_qubits)
+                        relevance = overlap / len(e_qubits)
+                    else:
+                        relevance = 0.0
+                    decay = (alpha ** depth) * (E_overlap_floor + (1 - E_overlap_floor) * relevance)
                     cnot_arr, q_u_arr, q_v_arr = entry
                     if q_u_arr is not None:
                         phys_u = output_perm_arr[q_u_arr]
                         phys_v = output_perm_arr[q_v_arr]
                         routing = 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum(axis=1)
-                        e_score += float((routing + cnot_arr).min()) * (alpha ** depth)
+                        e_score += float((routing + cnot_arr).min()) * decay
                     else:
-                        e_score += float(cnot_arr.min()) * (alpha ** depth)
+                        e_score += float(cnot_arr.min()) * decay
                 if len(E) > 0:
                     score += W * e_score / len(E)
         else:
@@ -1094,10 +1410,19 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
 
             if E:
                 e_score = 0
+                cand_qubits = set(partition_candidate.involved_qbits)
                 for partition_idx, depth in E:
                     partition = scoring_partitions[partition_idx]
                     if partition is None or partition_idx == partition_candidate.partition_idx:
                         continue
+                    # Overlap-aware decay
+                    if partition.involved_qbits:
+                        e_qubits = set(partition.involved_qbits)
+                        overlap = len(cand_qubits & e_qubits)
+                        relevance = overlap / len(e_qubits)
+                    else:
+                        relevance = 0.0
+                    decay = (alpha ** depth) * (E_overlap_floor + (1 - E_overlap_floor) * relevance)
                     qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
                     mini_scores = []
                     for tdx, mini_topology in enumerate(partition.mini_topologies):
@@ -1113,7 +1438,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                 routing_cost = 0
                             mini_scores.append(routing_cost + cnot_count)
                     if mini_scores:
-                        e_score += min(mini_scores) * (alpha ** depth)
+                        e_score += min(mini_scores) * decay
                 if len(E) > 0:
                     score += W * e_score / len(E)
 
@@ -1284,6 +1609,44 @@ def compute_distances_bfs(self, N):
                         D[start][neighbor] = dist + 1
                         queue.append((neighbor, dist + 1))
         
+        # Store adjacency list for reuse by A* routing
+        self._adj = [list(adj[i]) for i in range(N)]
+
+        # Compute betweenness centrality for congestion-aware scoring.
+        # Brandes' algorithm adapted for unweighted BFS graphs: O(N * E).
+        bc = np.zeros(N)
+        for s in range(N):
+            # BFS from s
+            S = []  # stack of nodes in order of non-decreasing distance
+            P = [[] for _ in range(N)]  # predecessors on shortest paths
+            sigma = np.zeros(N)  # number of shortest paths from s
+            sigma[s] = 1.0
+            d = np.full(N, -1)
+            d[s] = 0
+            Q = deque([s])
+            while Q:
+                v = Q.popleft()
+                S.append(v)
+                for w in adj[v]:
+                    if d[w] < 0:
+                        Q.append(w)
+                        d[w] = d[v] + 1
+                    if d[w] == d[v] + 1:
+                        sigma[w] += sigma[v]
+                        P[w].append(v)
+            delta = np.zeros(N)
+            while S:
+                w = S.pop()
+                for v in P[w]:
+                    delta[v] += (sigma[v] / sigma[w]) * (1.0 + delta[w])
+                if w != s:
+                    bc[w] += delta[w]
+        # Normalize to [0, 1]
+        max_bc = bc.max()
+        if max_bc > 0:
+            bc /= max_bc
+        self._betweenness = bc
+
         return D #multiply by 3 to make it CNOT cost instead of SWAP cost
 
 
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index bf8478edb..3dd768e8d 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -13,7 +13,7 @@
 # ============================================================================
 # SWAP Routing Algorithms
 # ============================================================================
-def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
+def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix, adj=None, neighbor_info=None):
     """
     Route partition qubits to their target physical positions using A* over
     the k-dimensional state space of partition qubit positions only.
@@ -39,13 +39,14 @@ def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix):
     """
     n = len(pi_A)
 
-    # Build adjacency list from dist_matrix
-    adj = [[] for _ in range(n)]
-    for i in range(n):
-        for j in range(i + 1, n):
-            if dist_matrix[i][j] == 1:
-                adj[i].append(j)
-                adj[j].append(i)
+    # Build adjacency list from dist_matrix if not provided
+    if adj is None:
+        adj = [[] for _ in range(n)]
+        for i in range(n):
+            for j in range(i + 1, n):
+                if dist_matrix[i][j] == 1:
+                    adj[i].append(j)
+                    adj[j].append(i)
 
     partition_qubits = sorted(pi_B_dict.keys())
     k = len(partition_qubits)
@@ -60,6 +61,36 @@ def heuristic(positions):
         # Admissible lower bound: sum of individual distances / 2
         return sum(dist_matrix[positions[i]][target_positions[i]] for i in range(k)) / 2
 
+    # SABRE-aware tiebreaker: prefer SWAP paths that keep future-partition
+    # qubits closer together.  The weight is small enough to never override
+    # optimality (same SWAP count), only break ties among equal-length paths.
+    if neighbor_info is not None and neighbor_info['edges']:
+        n_vqs = neighbor_info['neighbor_vqs']
+        n_edges = neighbor_info['edges']       # list of (idx_u, idx_v, edge_weight)
+        n_weight = neighbor_info['weight']
+        initial_n_pos = neighbor_info['initial_pos']
+        # Reverse map: physical position → index in n_vqs (for displacement tracking)
+        _n_len = len(n_vqs)
+        use_neighbor = True
+
+        # Normalize so neighbor_heuristic returns values in [0, 1].
+        # This guarantees n_weight * neighbor_heuristic < 1 (for n_weight < 1),
+        # so the tiebreaker never overrides SWAP-count optimality.
+        _total_edge_weight = sum(w for _, _, w in n_edges)
+        _diameter = int(np.max(dist_matrix[dist_matrix < np.inf])) if n > 1 else 1
+        _norm = max(1.0, _total_edge_weight * _diameter)
+
+        def neighbor_heuristic(n_pos):
+            return sum(w * dist_matrix[n_pos[i]][n_pos[j]] for i, j, w in n_edges) / _norm
+    else:
+        initial_n_pos = ()
+        n_weight = 0.0
+        _n_len = 0
+        use_neighbor = False
+
+        def neighbor_heuristic(n_pos):
+            return 0.0
+
     # A* over k-dimensional state space.
     # Each state is a tuple of physical positions, one per partition qubit.
     # Paths are reconstructed via a parent-pointer dict to avoid copying lists
@@ -68,12 +99,14 @@ def heuristic(positions):
     parent = {}  # state → (parent_state, swap) for path reconstruction
     parent[initial_positions] = None
 
+    h0 = heuristic(initial_positions)
+    nh0 = n_weight * neighbor_heuristic(initial_n_pos) if use_neighbor else 0.0
     heap = []
-    heapq.heappush(heap, (heuristic(initial_positions), 0, counter, initial_positions))
+    heapq.heappush(heap, (h0 + nh0, 0, counter, initial_positions, initial_n_pos))
     visited = {initial_positions: 0}
 
     while heap:
-        f, g, _, positions = heapq.heappop(heap)
+        f, g, _, positions, n_pos = heapq.heappop(heap)
 
         if positions == target_positions:
             # Reconstruct swap path via parent pointers
@@ -102,6 +135,10 @@ def heuristic(positions):
         # Quick lookup: physical position → index within partition_qubits list
         pos_to_k_idx = {p: i for i, p in enumerate(positions)}
 
+        # Build reverse map for neighbor displacement tracking
+        if use_neighbor:
+            n_phys_to_idx = {n_pos[idx]: idx for idx in range(_n_len)}
+
         # Expand: try every SWAP that moves at least one partition qubit
         for i, p in enumerate(positions):
             for nb in adj[p]:
@@ -117,12 +154,26 @@ def heuristic(positions):
                 if visited.get(new_positions, float('inf')) <= new_g:
                     continue
 
+                # Update neighbor qubit positions: partition qubit at p
+                # moves to nb, displacing whatever was at nb to p.
+                if use_neighbor:
+                    if nb in n_phys_to_idx:
+                        new_n_pos = list(n_pos)
+                        new_n_pos[n_phys_to_idx[nb]] = p
+                        new_n_pos = tuple(new_n_pos)
+                    else:
+                        new_n_pos = n_pos
+                    new_nh = n_weight * neighbor_heuristic(new_n_pos)
+                else:
+                    new_n_pos = n_pos
+                    new_nh = 0.0
+
                 visited[new_positions] = new_g
                 swap_key = (min(p, nb), max(p, nb))
                 parent[new_positions] = (positions, swap_key)
                 counter += 1
-                heapq.heappush(heap, (new_g + heuristic(new_positions), new_g,
-                                      counter, new_positions))
+                heapq.heappush(heap, (new_g + heuristic(new_positions) + new_nh,
+                                      new_g, counter, new_positions, new_n_pos))
 
     logging.warning(
         "find_constrained_swaps_partial: failed to route %s → %s",
@@ -461,7 +512,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
-    def transform_pi(self, pi, D, swap_cache=None, reverse=False):
+    def transform_pi(self, pi, D, swap_cache=None, reverse=False, adj=None, neighbor_info=None):
         # The synthesized circuit S implements: add_Permutation(P_i) -> Original -> add_Permutation(P_o)
         #
         # Forward (reverse=False):
@@ -491,10 +542,10 @@ def transform_pi(self, pi, D, swap_cache=None, reverse=False):
             if cache_key in swap_cache:
                 swaps, pi_init = swap_cache[cache_key]
             else:
-                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
                 swap_cache[cache_key] = (swaps, pi_init)
         else:
-            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D)
+            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
 
         pi_output = pi_init.copy()
         qbit_map_inverse = {v: k for k, v in self.qbit_map.items()}
@@ -504,13 +555,14 @@ def transform_pi(self, pi, D, swap_cache=None, reverse=False):
                 pi_output[k] = self.node_mapping[P_exit[q_star]]
         return swaps, pi_output
     
-    def estimate_swap_count(self, pi, D) -> int:
+    def estimate_swap_count(self, pi, D, reverse=False) -> int:
         """O(n) lower-bound on the number of SWAPs needed to route this
         partition's virtual qubits to their target physical positions.
         Uses the same admissible heuristic as the A* search internaly:
             floor(sum_of_distances / 2)
         """
-        P_i_inv = [self.P_i.index(i) for i in range(len(self.P_i))]
+        P_route = self.P_o if reverse else self.P_i
+        P_i_inv = [P_route.index(i) for i in range(len(P_route))]
         total = 0.0
         for k, v in self.qbit_map.items():
             target_P = self.node_mapping[P_i_inv[v]]
@@ -523,7 +575,7 @@ def estimate_swap_count(self, pi, D) -> int:
     def get_final_circuit(self,optimized_partitions,N):
         partition = optimized_partitions[self.partition_idx]
         part_parameters = partition.synthesised_parameters[self.topology_idx][self.permutation_idx]
-        part_circuit = partition.synthesised_circuits[self.topology_idx][self.permutation_idx]
+        part_circuit = partition.synthesised_circuits[self.topology_idx][self.permutation_idx].get_Flat_Circuit()
         part_circuit = part_circuit.Remap_Qbits(self.node_mapping, N)
         return part_circuit, part_parameters
 

From 5e61d5f330c54f65dc51148a4ebff0ddb64b97cb Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 17 Apr 2026 22:57:46 +0200
Subject: [PATCH 105/232] WCO merge

---
 .../qgd_Wide_Circuit_Optimization.py          | 501 +++++++++++++-----
 1 file changed, 378 insertions(+), 123 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 8f9af837d..d9648fa09 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -19,8 +19,7 @@
 
 import multiprocessing as mp
 from multiprocessing import Process, Pool, parent_process
-from multiprocessing.pool import AsyncResult
-import os
+import os, contextlib, collections, time
 
 
 from squander.partitioning.partition import PartitionCircuit
@@ -45,10 +44,6 @@ def extract_subtopology(involved_qbits, qbit_map, config):
             mini_topology.append((qbit_map[edge[0]], qbit_map[edge[1]]))
     return mini_topology
 
-def CNOTGateCount( circ: Circuit ) -> int :
-    """
-    Call to get the number of CNOT-equivalent gates in the circuit.
-    Counts all two-qubit gates, converting them to CNOT equivalents.
 
 CNOT_COUNT_DICT = {
     "CNOT": 1,
@@ -78,7 +73,9 @@ def CNOTGateCount(circ: Circuit, max_gates: int = 0) -> int:
     When ``max_gates > 0``, the function returns a lexicographic-style score:
     ``two_qubit_cost * max_gates + single_qubit_gate_count``.
 
-        Returns with the CNOT-equivalent gate count (all two-qubit gates counted)
+    Args:
+        circ: Squander circuit representation.
+        max_gates: Weight multiplier for the two-qubit cost term.
 
     Returns:
         Integer gate-cost score used by optimization heuristics.
@@ -1291,6 +1288,7 @@ class qgd_Wide_Circuit_Optimization:
 
     Supports multiple decomposition strategies, optional global recombination (ILP),
     and routing when the circuit does not match the target topology.
+
     """
 
     def __init__(self, config):
@@ -1393,7 +1391,8 @@ def ConstructCircuitFromPartitions(
     def DecomposePartition(
         Umtx: np.ndarray, config: dict, mini_topology=None, structure=None
     ) -> list[tuple[Circuit, np.ndarray]]:
-        """Decompose a unitary ``Umtx`` (e.g. from a partition) using ``config['strategy']``.
+        """
+        Decompose a unitary ``Umtx`` (e.g. from a partition) using ``config['strategy']``.
 
         Args:
             Umtx: Complex unitary matrix.
@@ -1402,11 +1401,7 @@ def DecomposePartition(
             structure: Required gate structure when ``strategy == "Custom"``.
 
         Returns:
-            Normally ``[(circuit, parameters)]`` on success, or ``[]`` if the
-            decomposition error exceeds ``tolerance``. If
-            ``config.get('stop_first_solution')`` is false, returns
-            ``cDecompose.all_solutions`` from the underlying decomposer instead of
-            a single best pair.
+            List of ``(squander_circuit, parameters)`` on success, or ``[]`` if error exceeds tolerance.
         """
         strategy = config["strategy"]
         if strategy == "TreeSearch":
@@ -1489,15 +1484,25 @@ def CompareAndPickCircuits(
         parameter_arrs: List[np.ndarray],
         metric: Callable[[Circuit], int] = CNOTGateCount,
     ) -> tuple[Circuit, np.ndarray]:
-        """Select the circuit with the lowest ``metric`` value.
+        """
+        Call to pick the most optimal circuit corresponding a specific metric. Looks for the circuit
+        with the minimal metric value.
+
 
         Args:
-            circs: Candidate Squander circuits (same length as ``parameter_arrs``).
-            parameter_arrs: Parameter vectors aligned with ``circs``.
-            metric: Scalar cost functional; lower is better. Defaults to ``CNOTGateCount``.
 
-        Returns:
-            ``(best_circuit, best_parameters)`` for the minimizing index.
+            circs ( List[Circuit] ) A list of Squander circuits to be compared
+
+            parameter_arrs ( List[np.ndarray] ) A list of parameter arrays associated with the sqaunder circuits
+
+            metric (optional) The metric function to decide which input circuit is better.
+
+
+        Return:
+
+            Returns with the chosen circuit and the corresponding parameter array
+
+
         """
 
         if not isinstance(circs, list):
@@ -1524,10 +1529,8 @@ def PartitionDecompositionProcess(
         config: dict,
         structure=None,
     ) -> Tuple[Circuit, np.ndarray]:
-        """Decompose one partition subcircuit (multiprocessing-safe entry point).
-
-        For ``TreeGuided`` on large registers, may recursively partition and
-        enumerate combinations before returning remapped results.
+        """
+        Worker-friendly entry: decompose a partition subcircuit (optionally nested for TreeGuided).
 
         Args:
             subcircuit: Subcircuit acting on a subset of the wide register.
@@ -1536,8 +1539,7 @@ def PartitionDecompositionProcess(
             structure: Optional fixed gate structure when ``strategy == "Custom"``.
 
         Returns:
-            Tuple of ``(decomposed_circuit, decomposed_parameters)`` pairs, each
-            remapped back to the original qubit indices of ``subcircuit``.
+            List of ``(Circuit, parameters)`` pairs (or empty list on failure), remapped to the original register.
         """
 
         qbit_num_orig_circuit = subcircuit.get_Qbit_Num()
@@ -1693,15 +1695,7 @@ def PartitionDecompositionProcess(
 
     @staticmethod
     def build_partition_topo_deps(allparts):
-        """Order partition gate-sets by dependencies and build a reverse-dependency map.
-
-        Args:
-            allparts: List of sets of gate indices, one per partition.
-
-        Returns:
-            ``(ordered_parts, rg_new)`` where ``ordered_parts`` lists partitions in
-            topological order and ``rg_new`` maps each new index to predecessors.
-        """
+        """Topological sort of partition gate-sets; returns ordered partitions and reverse-dependency map."""
         gate_to_parts = {}
         for i, part in enumerate(allparts):
             for gate in part:
@@ -1815,15 +1809,7 @@ def make_all_partition_circuit(circ, orig_parameters, max_partition_size):
 
     @staticmethod
     def strip_single_qubit_head_tails(circ, params):
-        """Drop single-qubit gates that sit only at the head or tail of the dependency DAG.
-
-        Args:
-            circ: Input circuit.
-            params: Flat parameter array for ``circ``.
-
-        Returns:
-            ``(new_circuit, new_params)`` with head/tail single-qubit gates removed.
-        """
+        """Remove single-qubit gates that are purely at the head/tail of the dependency graph."""
         gate_dict, g, rg, gate_to_qubit, _ = build_dependency(circ)
         newcirc = Circuit(circ.get_Qbit_Num())
         new_params = []
@@ -1842,15 +1828,7 @@ def strip_single_qubit_head_tails(circ, params):
 
     @staticmethod
     def get_fingerprint(circ, params):
-        """Hashable signature of gate layout and parameters (for decomposition caching).
-
-        Args:
-            circ: Squander circuit.
-            params: Parameter array associated with ``circ``.
-
-        Returns:
-            Tuple usable as a dict key for memoizing decompositions.
-        """
+        """Hashable signature of gate types, qubits, and parameters (for decomposition caching)."""
         return tuple(
             (gate.get_Name(), tuple(gate.get_Involved_Qbits()))
             for gate in circ.get_Gates()
@@ -1860,16 +1838,10 @@ def get_fingerprint(circ, params):
     def recombine_all_partition_circuit(
         circ, optimized_subcircuits, optimized_parameter_list, recombine_info
     ):
-        """Reorder optimized partitions to respect global gate dependencies.
+        """Reorder partition results to satisfy global dependencies.
 
-        Args:
-            circ: Original flat circuit (for topological ordering context).
-            optimized_subcircuits: One optimized subcircuit per partition slot.
-            optimized_parameter_list: Parameter lists aligned with ``optimized_subcircuits``.
-            recombine_info: Tuple from ``make_all_partition_circuit`` (ILP metadata).
-
-        Returns:
-            ``(reordered_circuits, reordered_parameter_lists)`` in execution order.
+        Uses ILP-based ordering and a final topological sort, then returns
+        reordered subcircuits and parameter arrays aligned by structure index.
         """
         from squander.partitioning.ilp import (
             topo_sort_partitions,
@@ -2132,9 +2104,6 @@ def InnerOptimizeWideCircuit(
         if not in_parent:
             print(len(subcircuits), "partitions found to optimize")
 
-        if parent_process() is None and self.config["verbosity"] > 0: print(len(subcircuits), "partitions found to optimize")
-
-
         # the list of optimized subcircuits
         optimized_subcircuits: List[Optional[Circuit]] = [None] * len(subcircuits)
 
@@ -2143,34 +2112,154 @@ def InnerOptimizeWideCircuit(
             subcircuits
         )
 
+        # list of AsyncResult objects
         async_results = [None] * len(subcircuits)
-        n_cpus = mp.cpu_count()
-        with Pool(processes=n_cpus) as pool:
 
-            for partition_idx, subcircuit in enumerate(subcircuits):
-                start_idx = subcircuit.get_Parameter_Start_Index()
-                end_idx   = start_idx + subcircuit.get_Parameter_Num()
-                subcircuit_parameters = parameters[start_idx:end_idx]
-
-                config = {**self.config, 'tree_level_max': max(0, subcircuit.get_Gate_Nums().get('CNOT', 0)-1)}
-                config = config if structures is None or partition_idx >= len(structures) else {**config, 'strategy': 'Custom', 'max_inner_iterations': 10000, 'max_iteration_loops': 4}
-
-                async_results[partition_idx] = pool.apply_async(self.PartitionDecompositionProcess, (subcircuit, subcircuit_parameters, config,
-                                                                                                    None if structures is None or partition_idx >= len(structures) else structures[partition_idx]))
-
-            for partition_idx, subcircuit in enumerate(subcircuits):
-                start_idx = subcircuit.get_Parameter_Start_Index()
-                subcircuit_parameters = parameters[start_idx:start_idx + subcircuit.get_Parameter_Num()]
-                callback_fnc = lambda  x : self.CompareAndPickCircuits( [subcircuit, *(z[0] for z in x)], [subcircuit_parameters, *(z[1] for z in x)] )
-                new_subcircuit, new_parameters = callback_fnc(async_results[partition_idx].get(timeout=None))
+        total_opt = [0]
+
+        def process_result(partition_idx):
+            """Finalize async decomposition for partition ``partition_idx`` and update caches / lists."""
+            if optimized_subcircuits[partition_idx] is not None:
+                return
+            subcircuit = subcircuits[partition_idx]
+            # callback function done on the master process to compare the new decomposed and the original suncircuit
+            start_idx = subcircuit.get_Parameter_Start_Index()
+            subcircuit_parameters = parameters[
+                start_idx : start_idx + subcircuit.get_Parameter_Num()
+            ]
+            fingerprint = (
+                None
+                if fingerprint_dict is None
+                else qgd_Wide_Circuit_Optimization.get_fingerprint(
+                    subcircuit, subcircuit_parameters
+                )
+            )
+            callback_fnc = lambda x: self.CompareAndPickCircuits(
+                [subcircuit, *(z[0] for z in x)],
+                [subcircuit_parameters, *(z[1] for z in x)],
+                lambda c: CNOTGateCount(c, max_gates),
+            )
+            if fingerprint_dict is not None and fingerprint in fingerprint_dict:
+                new_subcircuit, new_parameters = fingerprint_dict[fingerprint]
+            else:
+                new_subcircuit, new_parameters = callback_fnc(
+                    async_results[partition_idx][0](*async_results[partition_idx][1])
+                    if in_parent
+                    else async_results[partition_idx].get(timeout=None)
+                )
 
-                if subcircuit != new_subcircuit and self.config["verbosity"] > 0:
-                    print("original subcircuit:    ", subcircuit.get_Gate_Nums())
+                if subcircuit != new_subcircuit:
+                    print(
+                        "original subcircuit:    ",
+                        subcircuit.get_Gate_Nums(),
+                        partition_idx,
+                    )
                     print("reoptimized subcircuit: ", new_subcircuit.get_Gate_Nums())
-                if partition_idx % 100 == 99 and self.config["verbosity"] > 0: print(partition_idx+1, "partitions optimized")
-                optimized_subcircuits[partition_idx] = new_subcircuit
-                optimized_parameter_list[partition_idx] = new_parameters
-
+                if fingerprint_dict is not None:
+                    fingerprint_dict[fingerprint] = (new_subcircuit, new_parameters)
+                    fingerprint_dict[
+                        qgd_Wide_Circuit_Optimization.get_fingerprint(
+                            new_subcircuit, new_parameters
+                        )
+                    ] = (new_subcircuit, new_parameters)
+                    trim_subcirc, trim_parameters = (
+                        qgd_Wide_Circuit_Optimization.strip_single_qubit_head_tails(
+                            new_subcircuit, new_parameters
+                        )
+                    )
+                    fingerprint_dict[
+                        qgd_Wide_Circuit_Optimization.get_fingerprint(
+                            trim_subcirc, trim_parameters
+                        )
+                    ] = (trim_subcirc, trim_parameters)
+            if total_opt[0] % 100 == 99:
+                print(total_opt[0] + 1, "partitions optimized")
+            total_opt[0] += 1
+            optimized_subcircuits[partition_idx] = new_subcircuit
+            optimized_parameter_list[partition_idx] = new_parameters
+
+        with (
+            contextlib.nullcontext()
+            if in_parent
+            else Pool(processes=mp.cpu_count())
+        ) as pool:
+            remaining = list(range(len(subcircuits)))
+            while remaining:
+                still_remaining = []
+                #  code for iterate over partitions and optimize them
+                for partition_idx in remaining:
+                    subcircuit = subcircuits[partition_idx]
+
+                    # isolate the parameters corresponding to the given sub-circuit
+                    start_idx = subcircuit.get_Parameter_Start_Index()
+                    end_idx = start_idx + subcircuit.get_Parameter_Num()
+                    subcircuit_parameters = parameters[start_idx:end_idx]
+
+                    fingerprint = (
+                        None
+                        if fingerprint_dict is None
+                        else qgd_Wide_Circuit_Optimization.get_fingerprint(
+                            subcircuit, subcircuit_parameters
+                        )
+                    )
+                    if fingerprint_dict is not None and fingerprint in fingerprint_dict:
+                        (
+                            optimized_subcircuits[partition_idx],
+                            optimized_parameter_list[partition_idx],
+                        ) = fingerprint_dict[fingerprint]
+                        continue
+                    if part_deps is not None and partition_idx in part_deps:
+                        any_optimized, any_remaining = False, False
+                        for dep_idx in part_deps[partition_idx]:
+                            if optimized_subcircuits[dep_idx] is None and (
+                                async_results[dep_idx] is None
+                                or not isinstance(async_results[dep_idx], tuple)
+                                and not async_results[dep_idx].ready()
+                            ):
+                                any_remaining = True
+                                continue
+                            elif optimized_subcircuits[dep_idx] is None:
+                                process_result(dep_idx)
+
+                            optimized_subcircuits_loc = optimized_subcircuits[dep_idx]
+                            assert isinstance(optimized_subcircuits_loc, Circuit)
+                            assert optimized_subcircuits_loc is not None
+
+                            if CNOTGateCount(optimized_subcircuits_loc) < CNOTGateCount(
+                                subcircuits[dep_idx]
+                            ):  # if the dependency partition was optimized, skip
+                                any_optimized = True
+                                break
+                        if any_optimized:
+                            optimized_subcircuits[partition_idx] = subcircuit
+                            optimized_parameter_list[partition_idx] = (
+                                subcircuit_parameters
+                            )
+                            continue
+                        if any_remaining:
+                            still_remaining.append(partition_idx)
+                            continue
+                    # call a process to decompose a subcircuit
+                    config = {
+                        **self.config,
+                        "tree_level_max": max(0, CNOTGateCount(subcircuit, 0) - 1),
+                    }
+                    fargs = (
+                        self.PartitionDecompositionProcess,
+                        (subcircuit, subcircuit_parameters, config, None),
+                    )
+                    # print("Dispatching", subcircuit.get_Involved_Qubits(), "qubits with", CNOGateCount(subcircuit, 0), "CNOT gates, partition ", partition_idx)
+                    async_results[partition_idx] = (
+                        fargs
+                        if in_parent
+                        else pool.apply_async(*fargs)
+                    )
+                if len(remaining) == len(still_remaining):
+                    time.sleep(0.1)
+                remaining = still_remaining
+            #  code for iterate over async results and retrieve the new subcircuits
+            for partition_idx in range(len(subcircuits)):
+                process_result(partition_idx)
 
         # construct the wide circuit from the optimized suncircuits
         if global_min:
@@ -2183,9 +2272,16 @@ def InnerOptimizeWideCircuit(
                 )
             )
 
-        if parent_process() is None and self.config["verbosity"] > 0:
-            print( "original circuit:    ", circ.get_Gate_Nums())
-            print( "reoptimized circuit: ", wide_circuit.get_Gate_Nums()) 
+        if any(c is None for c in optimized_subcircuits) or any(
+            p is None for p in optimized_parameter_list
+        ):
+            raise RuntimeError(
+                "Internal error: some partitions were not optimized before reconstruction."
+            )
+        wide_circuit, wide_parameters = self.ConstructCircuitFromPartitions(
+            cast(List[Circuit], optimized_subcircuits),
+            cast(List[List[np.ndarray]], optimized_parameter_list),
+        )
 
         if not in_parent:
             print("original circuit:    ", circ.get_Gate_Nums())
@@ -2236,16 +2332,15 @@ def lattice_topology(x_qbits, y_qbits):
 
     @staticmethod
     def heavy_hexagonal_topology(rows, cols):
-        """Build a finite heavy-hex coupling list (honeycomb with subdivided edges).
+        """
+        Finite heavy-hex patch.
 
-        Args:
-            rows: Number of rows in the brick-wall honeycomb patch.
-            cols: Number of columns in the patch.
+        rows, cols describe the underlying honeycomb 'brick-wall' patch.
+        The first rows*cols qubits are the original honeycomb vertices.
+        Every original edge gets one inserted degree-2 qubit.
 
         Returns:
-            List of undirected edges ``(u, v)``. The first ``rows * cols`` qubit
-            indices are honeycomb vertices; each original edge introduces one
-            additional degree-2 qubit on the subdivided link.
+            list[(u, v)]  undirected couplers
         """
 
         def vid(r, c):
@@ -2328,26 +2423,9 @@ def check_valid_routing(wide_circuit, topo):
         ), "Final circuit contains gates that do not respect the routing constraints."
 
     def check_compare_circuits(
-        self,
-        circ,
-        orig_parameters,
-        wide_circuit,
-        wide_parameters,
-        routing=False,
-        forced_test=False,
+        self, circ, orig_parameters, wide_circuit, wide_parameters, routing=False, forced_test=False,
     ):
-        """Optionally verify equivalence of ``circ`` and ``wide_circuit`` via ``CompareCircuits``.
-
-        Args:
-            circ: Original circuit.
-            orig_parameters: Parameters for ``circ``.
-            wide_circuit: Optimized or routed circuit.
-            wide_parameters: Parameters for ``wide_circuit``.
-            routing: If true and initial/final mappings exist in ``self.config``,
-                pass them to ``CompareCircuits`` for layout-aware comparison.
-            forced_test: If true, run the comparison even when ``test_final_circuit``
-                is false in config.
-        """
+        """If ``test_final_circuit``, numerically compare unitaries (optional initial/final layout for routing)."""
         if self.config["test_final_circuit"] or forced_test:
             if (
                 routing
@@ -2367,11 +2445,188 @@ def check_compare_circuits(
                 CompareCircuits(circ, orig_parameters, wide_circuit, wide_parameters)
 
     def route_circuit(self, circ: Circuit, orig_parameters: np.ndarray):
-        """Map ``circ`` onto ``self.config['topology']`` using the configured router.
+        """Map ``circ`` onto ``self.config['topology']`` using BQSKit SeQPAM, Qiskit SABRE, or Squander SABRE."""
+        strategy = self.config.get("routing-strategy", "seqpam-ilp")
+
+        if strategy in ("seqpam-ilp", "seqpam-quick", "bqskit-sabre"):
+            from squander import Qiskit_IO
+            from bqskit import Circuit as BQSKitCircuit, compile
+            from bqskit.compiler import Compiler
+            from bqskit.compiler.compile import (
+                build_seqpam_mapping_optimization_workflow,
+            )
+            from bqskit.compiler.basepass import BasePass
+
+            class SquanderPartitioner(BasePass):
+                """BQSKit pass: replace circuit body with Squander ILP partition blocks (QASM round-trip)."""
+
+                def __init__(self, max_partition_size):
+                    super().__init__()
+                    self.max_partition_size = max_partition_size
+
+                async def run(self, circuit: BQSKitCircuit, data=None):
+                    from squander import Qiskit_IO
+                    from squander.partitioning.partition import PartitionCircuit
+
+                    circ_qiskit = QuantumCircuit.from_qasm_str(
+                        OPENQASM2Language().encode(circuit)
+                    )
+                    circ, orig_parameters = Qiskit_IO.convert_Qiskit_to_Squander(
+                        circ_qiskit
+                    )
+                    partitioned_circuit, parameters, _ = PartitionCircuit(
+                        circ, orig_parameters, self.max_partition_size, strategy="ilp"
+                    )
+                    partitioned_circuit_qiskit = Qiskit_IO.get_Qiskit_Circuit(
+                        partitioned_circuit, parameters
+                    )
+                    partitioned_circuit_bqskit = OPENQASM2Language().decode(
+                        qasm2.dumps(partitioned_circuit_qiskit)
+                    )
+                    circuit.become(partitioned_circuit_bqskit, False)
+
+            from bqskit.passes import (
+                GeneralizedSabreLayoutPass,
+                GeneralizedSabreRoutingPass,
+                SetModelPass,
+                IfThenElsePass,
+                QuickPartitioner,
+            )
+            from bqskit.ir.gates import CNOTGate  # example; extend as needed
+            from bqskit.compiler.machine import MachineModel
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from qiskit import qasm2, QuantumCircuit
+
+            # Build BQSKit machine model from your topology
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+
+            # Convert squander circuit → qiskit → BQSKit
+            # (BQSKit has a from_qiskit helper if you go via Qiskit IR)
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
+
+            bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
+            # Customizable knobs
 
-        sabre = SABRE(circ, self.config["topology"])
-        Squander_remapped_circuit, parameters_remapped_circuit, pi, final_pi, swap_count = sabre.map_circuit(orig_parameters)
-        self.config.setdefault("initial_mapping",pi)
-        self.config.setdefault("final_mapping",sabre.get_inverse_pi(final_pi))
-        self.config["routed"] = True
+            # Routing-only pass pipeline — NO optimization passes
+            mainflow = build_seqpam_mapping_optimization_workflow(
+                block_size=self.config["max_partition_size"]
+            )
+            if strategy == "seqpam-ilp":
+                for curpass in mainflow._passes:
+                    if isinstance(curpass, IfThenElsePass):
+                        for i in range(len(curpass.on_true._passes)):
+                            if isinstance(curpass.on_true._passes[i], QuickPartitioner):
+                                curpass.on_true._passes[i] = SquanderPartitioner(
+                                    self.config["max_partition_size"]
+                                )
+
+            routing_workflow = [
+                SetModelPass(model),  # attach hardware model to circuit
+                *(
+                    (build_seqpam_mapping_optimization_workflow(),)
+                    if strategy != "bqskit-sabre"
+                    else (
+                        GeneralizedSabreLayoutPass(),  # SABRE-style layout
+                        GeneralizedSabreRoutingPass(),
+                    )
+                ),  # SABRE-style routing
+            ]
+
+            with Compiler() as compiler:
+                routed_bqskit_circ, pass_data = compiler.compile(
+                    bqskit_circ, routing_workflow, True
+                )
+
+            # Convert back: BQSKit → Qiskit → Squander
+            circuit_qiskit_routed = QuantumCircuit.from_qasm_str(
+                OPENQASM2Language().encode(routed_bqskit_circ)
+            )
+            Squander_remapped_circuit, parameters_remapped_circuit = (
+                Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_routed)
+            )
+            Squander_remapped_circuit = Squander_remapped_circuit.Remap_Qbits(
+                {i: j for i, j in enumerate(pass_data.placement)}
+            )
+            self.config["initial_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.initial_mapping
+            )
+            self.config["final_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.final_mapping
+            )
+
+        elif strategy == "light-sabre":
+            from squander import Qiskit_IO
+            from qiskit import transpile
+            from qiskit.transpiler.preset_passmanagers import (
+                generate_preset_pass_manager,
+            )
+            from qiskit.transpiler.passes import SabreLayout, SabreSwap
+            from qiskit.transpiler import PassManager, CouplingMap
+            from squander.gates import gates_Wrapper as gate
+
+            # SUPPORTED_GATES_NAMES = {n.lower().replace("cnot", "cx") for n in dir(gate) if not n.startswith("_") and issubclass(getattr(gate, n), gate.Gate) and n not in ("Gate", "CROT", "CR", "SYC", "CCX", "CSWAP")}
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
+            coupling_map = [[i, j] for i, j in self.config["topology"]]
+            # circuit_qiskit_sabre = transpile(circo, basis_gates=SUPPORTED_GATES_NAMES, coupling_map=coupling_map, optimization_level=0)
+            coupling_map = CouplingMap(coupling_map)
+            # Customizable SABRE parameters
+            sabre_seed = self.config.get("sabre_seed", 42)
+            sabre_trials = self.config.get("sabre_trials", 5)  # layout trials
+            swap_trials = self.config.get("sabre_swap_trials", sabre_trials)
+            heuristic = self.config.get(
+                "sabre_heuristic", "decay"
+            )  # "basic" | "lookahead" | "decay"
+
+            layout_pass = SabreLayout(
+                coupling_map,
+                seed=sabre_seed,
+                max_iterations=sabre_trials,
+                swap_trials=swap_trials,
+            )
+            swap_pass = SabreSwap(
+                coupling_map,
+                heuristic=heuristic,
+                seed=sabre_seed,
+                trials=swap_trials,
+            )
+
+            pm = PassManager(
+                [
+                    layout_pass,  # find initial qubit mapping via SABRE
+                    swap_pass,  # insert SWAP gates for routing
+                ]
+            )
+            circuit_qiskit_sabre = pm.run(circo)
+            Squander_remapped_circuit, parameters_remapped_circuit = (
+                Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_sabre)
+            )
+            self.config["initial_mapping"] = (
+                circuit_qiskit_sabre.layout.initial_index_layout()
+            )
+            self.config["final_mapping"] = (
+                circuit_qiskit_sabre.layout.final_index_layout()
+            )
+        elif strategy == "sabre":
+            sabre = SABRE(circ, self.config["topology"])
+            (
+                Squander_remapped_circuit,
+                parameters_remapped_circuit,
+                pi,
+                final_pi,
+                swap_count,
+            ) = sabre.map_circuit(orig_parameters)
+            self.config["initial_mapping"] = pi
+            self.config["final_mapping"] = final_pi
+        qgd_Wide_Circuit_Optimization.check_valid_routing(
+            Squander_remapped_circuit, self.config["topology"]
+        )
+
+        print("cheking circuit after routing")
+        self.check_compare_circuits(
+            circ,
+            orig_parameters,
+            Squander_remapped_circuit,
+            parameters_remapped_circuit,
+            routing=True,
+        )
         return Squander_remapped_circuit, parameters_remapped_circuit

From 640b13017ef0681757dccfd13af70c4f3c425e2e Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 17 Apr 2026 23:08:05 +0200
Subject: [PATCH 106/232] Fix gates wrapper

---
 squander/gates/gates_Wrapper.cpp | 35 ++------------------------------
 1 file changed, 2 insertions(+), 33 deletions(-)

diff --git a/squander/gates/gates_Wrapper.cpp b/squander/gates/gates_Wrapper.cpp
index 86526dff6..a854bc2e2 100644
--- a/squander/gates/gates_Wrapper.cpp
+++ b/squander/gates/gates_Wrapper.cpp
@@ -746,21 +746,6 @@ Gate_Wrapper_get_Matrix( Gate_Wrapper *self, PyObject *args, PyObject *kwds ) {
         }
 }
 
-        int parallel = 1;
-        try {
-            gate_mtx = gate->get_matrix( parallel );
-        }
-        catch (std::string err) {
-            PyErr_SetString(PyExc_Exception, err.c_str());
-            std::cout << err << std::endl;
-            return NULL;
-        }
-        catch(...) {
-            std::string err( "Invalid pointer to gate class or error in get_matrix");
-            PyErr_SetString(PyExc_Exception, err.c_str());
-            return NULL;
-        }
-
 /**
 @brief Call to apply the gate operation from the right side on an input state or matrix
 */
@@ -943,24 +928,8 @@ Gate_Wrapper_Wrapper_apply_from_right( Gate_Wrapper *self, PyObject *args, PyObj
         memcpy(PyArray_DATA(input), input_mtx.data, input_mtx.size() * sizeof(QGD_Complex16));
     }
 
-        // get the C++ wrapper around the input data
-        Matrix_real&& parameters_mtx = numpy2matrix_real( parameters_arr );
-        int parallel = 1;
-        try {
-            gate_mtx = self->gate->get_matrix( parameters_mtx, parallel );
-        }
-        catch (std::string err) {
-            Py_DECREF(parameters_arr);
-            PyErr_SetString(PyExc_Exception, err.c_str());
-            std::cout << err << std::endl;
-            return NULL;
-        }
-        catch(...) {
-            Py_DECREF(parameters_arr);
-            std::string err( "Invalid pointer to gate class or error in get_matrix");
-            PyErr_SetString(PyExc_Exception, err.c_str());
-            return NULL;
-        }
+    return Py_BuildValue("i", 0);
+}
 
 
 /**

From 28ab66033619904e116f6f7aa83d7ccb339d44a4 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 17 Apr 2026 23:38:10 +0200
Subject: [PATCH 107/232] Rework PartAM

---
 squander/synthesis/PartAM.py       | 128 +++++++++++------------------
 squander/synthesis/PartAM_utils.py |  51 +++---------
 2 files changed, 63 insertions(+), 116 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3dbe858a6..31b0b8b8b 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -282,12 +282,19 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         """
         Partition and synthesize a full circuit.
 
+        Flow:
+            1) Enumerate candidate partitions.
+            2) ILP-select the minimum-count non-overlapping cover (uniform weights).
+            3) Synthesize only the selected partitions via SeqPAM (two-stage P_i/P_o
+               sweep over mini_topologies, executed by _run_parallel_synthesis).
+
         Args:
             circ: The full quantum circuit (must be flat — no subcircuit blocks)
             orig_parameters: Parameters for circ
 
         Returns:
-            optimized_partitions: List of PartitionSynthesisResult / SingleQubitPartitionResult
+            optimized_partitions: List of PartitionSynthesisResult /
+                SingleQubitPartitionResult, in topological order.
         """
         working_circ = circ
         working_parameters = orig_parameters
@@ -305,39 +312,56 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
-        partitioned_circuit = Circuit( qbit_num_orig_circuit )
-        params = []
+        # ---- Phase 2: Minimum-count ILP partition selection ----
+        L_parts, _ = ilp_global_optimal(allparts, g)
+
+        # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----
+        selected_surrounded_starts = set()
+        selected_parts_gates = []
+        for i in L_parts:
+            part = allparts[i]
+            surrounded = {t for s in part for t in go[s]
+                          if t in single_qubit_chains_prepost
+                          and go[single_qubit_chains_prepost[t][-1]]
+                          and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
+            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded))
+            selected_parts_gates.append(gates)
+            selected_surrounded_starts.update(surrounded)
 
-        for part in allparts:
-            surrounded_chains = {t for s in part for t in go[s] if t in single_qubit_chains_prepost and go[single_qubit_chains_prepost[t][-1]] and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
-            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded_chains))
-            #topo sort part + surrounded chains
-            c = Circuit( qbit_num_orig_circuit )
+        standalone_chains = []
+        for chain in single_qubit_chains:
+            if chain[0] not in selected_surrounded_starts:
+                selected_parts_gates.append(frozenset(chain))
+                standalone_chains.append(chain)
+
+        n_multi = len(L_parts)
 
-            for gate_idx in _get_topo_order({x: go[x] & gates for x in gates}, {x: rgo[x] & gates for x in gates}):
-                c.add_Gate( gate_dict[gate_idx] )
+        # ---- Phase 4: Assemble partitioned circuit from selected partitions only ----
+        partitioned_circuit = Circuit(qbit_num_orig_circuit)
+        params = []
+
+        for gates in selected_parts_gates[:n_multi]:
+            c = Circuit(qbit_num_orig_circuit)
+            for gate_idx in _get_topo_order({x: go[x] & gates for x in gates},
+                                            {x: rgo[x] & gates for x in gates}):
+                c.add_Gate(gate_dict[gate_idx])
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
                 params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
-
             partitioned_circuit.add_Circuit(c)
-        # Only add single-qubit chains as separate partitions if minimum_partition_size allows it
-        for chain in single_qubit_chains:
-            c = Circuit( qbit_num_orig_circuit )
+
+        for chain in standalone_chains:
+            c = Circuit(qbit_num_orig_circuit)
             for gate_idx in chain:
-                c.add_Gate( gate_dict[gate_idx] )
+                c.add_Gate(gate_dict[gate_idx])
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
                 params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
             partitioned_circuit.add_Circuit(c)
+
         parameters = np.concatenate(params, axis=0)
 
-        # ---- Phase 2: Fine-grained parallel synthesis ----
-        # Instead of 1 coarse task per partition (each doing ~12 sequential
-        # C++ decompositions), submit individual (partition, topology, permutation)
-        # tasks for much better pool load balancing.
+        # ---- Phase 5: SeqPAM synthesis on selected partitions only ----
         subcircuits = partitioned_circuit.get_Gates()
         optimized_results = [None] * len(subcircuits)
-
-        # Pre-compute partition metadata and base unitaries
         partition_meta = []
         for partition_idx, subcircuit in enumerate(subcircuits):
             start_idx = subcircuit.get_Parameter_Start_Index()
@@ -345,7 +369,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             subcircuit_parameters = parameters[start_idx:end_idx]
             involved_qbits = subcircuit.get_Qbits()
             qbit_num_sub = len(involved_qbits)
-            mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
             qbit_map = {involved_qbits[idx]: idx for idx in range(len(involved_qbits))}
             remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num_sub)
 
@@ -355,6 +378,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 )
                 partition_meta.append(None)
             else:
+                mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub)
                 partition_meta.append({
                     'N': qbit_num_sub,
                     'circuit': remapped_subcircuit,
@@ -364,65 +388,13 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                     'qbit_map': qbit_map,
                 })
 
-        # ---- Phase 2: Fine-grained parallel synthesis ----
         results_map = self._run_parallel_synthesis(partition_meta)
         for partition_idx, result in results_map.items():
             optimized_results[partition_idx] = result
 
-        # ---- Phase 3: ILP partition selection with synthesis-cost weights ----
-        weights = []
-        for idx, result in enumerate(optimized_results[:len(allparts)]):
-            if isinstance(result, SingleQubitPartitionResult):
-                weights.append(0)
-            else:
-                weights.append(result.get_partition_synthesis_score())
-
-        L_parts, fusion_info = ilp_global_optimal(allparts, g, weights=weights)
-
-        # ---- Phase 4: Reuse Phase 2 results (no re-synthesis) ----
-        # Build non-overlapping parts from selected allparts + standalone chains.
-        # Phase 2 already synthesized each allpart (with surrounded chains included),
-        # so we reuse those results directly.
-        selected_surrounded_starts = set()
-        selected_parts_gates = []
-        for i in L_parts:
-            part = allparts[i]
-            surrounded = {t for s in part for t in go[s]
-                         if t in single_qubit_chains_prepost
-                         and go[single_qubit_chains_prepost[t][-1]]
-                         and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part}
-            gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded))
-            selected_parts_gates.append(gates)
-            selected_surrounded_starts.update(surrounded)
-
-        # Non-surrounded chains become standalone SingleQubitPartitionResult entries
-        standalone_chains = []
-        for chain in single_qubit_chains:
-            if chain[0] not in selected_surrounded_starts:
-                selected_parts_gates.append(frozenset(chain))
-                standalone_chains.append(chain)
-
-        L = topo_sort_partitions(working_circ, self.config["max_partition_size"], selected_parts_gates)
-
-        n_selected = len(L_parts)
-        optimized_partitions = []
-        for part_idx in L:
-            if part_idx < n_selected:
-                # Multi-qubit partition — reuse Phase 2 PartitionSynthesisResult
-                optimized_partitions.append(optimized_results[L_parts[part_idx]])
-            else:
-                # Standalone single-qubit chain
-                chain = standalone_chains[part_idx - n_selected]
-                c = Circuit(qbit_num_orig_circuit)
-                chain_params = []
-                for gate_idx in chain:
-                    c.add_Gate(gate_dict[gate_idx])
-                    start = gate_dict[gate_idx].get_Parameter_Start_Index()
-                    chain_params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])
-                chain_parameters = np.concatenate(chain_params) if chain_params else np.array([])
-                optimized_partitions.append(SingleQubitPartitionResult(c, chain_parameters))
-
-        return optimized_partitions
+        # ---- Phase 6: Topologically order selected partitions ----
+        L = topo_sort_partitions(working_circ, selected_parts_gates)
+        return [optimized_results[idx] for idx in L]
 
     def _run_parallel_synthesis(self, partition_meta):
         """Phase 2: Run parallel synthesis for all multi-qubit partitions.
@@ -448,7 +420,7 @@ def _run_parallel_synthesis(self, partition_meta):
                     continue
                 results_map[partition_idx] = PartitionSynthesisResult(
                     meta['N'], meta['mini_topologies'], meta['involved_qbits'],
-                    meta['qbit_map'], meta['circuit']
+                    meta['qbit_map'],
                 )
 
             # ---- Stage 1: fix random P_o, sweep all P_i ----
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 3dd768e8d..62969fd1b 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -374,30 +374,25 @@ def derive_result_from_automorphism(sigma, P_i, P_o, circuit, parameters, N):
 # ============================================================================
 
 class SingleQubitPartitionResult:
-    
-    def __init__(self,circuit_in,parameters_in):
+
+    def __init__(self, circuit_in, parameters_in):
         self.circuit = circuit_in
         self.parameters = parameters_in
         self.involved_qbits = circuit_in.get_Qbits()
-    
-    def get_partition_synthesis_score(self):
-        return 0
 
 # Virtual qubits q, reduced virtual qubits (the remapped circuit only up to partition_size) q*
-# Physical qubits Q, reduced physical qubits Q* 
+# Physical qubits Q, reduced physical qubits Q*
 class PartitionSynthesisResult:
-    
-    def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circuit, topology=None, topology_cache=None):
-        #The physical mini_topology of the partition q*
+
+    def __init__(self, N, mini_topologies, involved_qbits, qubit_map, topology=None, topology_cache=None):
+        # Physical mini_topology of the partition q*
         self.mini_topologies = mini_topologies
-        #number of topologies
-        self.topology_count = len(mini_topologies)
-        #Qubit num of the partition
+        # Qubit num of the partition
         self.N = N
-        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc 
+        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc
         # P_o in Q*->q* permutation pattern [Q*1 Q*0 Q*2] This means that the current output of Q*1 is equal to q*0
         self.permutations_pairs = [[] for _ in range(len(mini_topologies))]
-        # results of synthesis
+        # Synthesis results
         self.synthesised_circuits = [[] for _ in range(len(mini_topologies))]
         self.synthesised_parameters = [[] for _ in range(len(mini_topologies))]
         self.cnot_counts = [[] for _ in range(len(mini_topologies))]
@@ -406,12 +401,10 @@ def __init__(self, N , mini_topologies, involved_qbits, qubit_map, original_circ
         self.involved_qbits = involved_qbits
         # {q:q*}
         self.qubit_map = qubit_map
-        # the original circuit
-        self.original_circuit = original_circuit
-        # Pre-computed topology candidates for each mini_topology (lazy initialization)
+        # Lazy per-topology candidate cache
         self._topology_candidates = [None] * len(mini_topologies)
-        self._topology = topology  # Full topology for computing candidates
-        self._topology_cache = topology_cache  # Cache to use for lookups
+        self._topology = topology
+        self._topology_cache = topology_cache
 
     def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
         self.permutations_pairs[topology_idx].append(permutations_pair)
@@ -431,25 +424,7 @@ def extract_circuit_structure(self, circuit):
     def get_best_result(self, topology_idx):
         best_index = np.argmin(self.cnot_counts[topology_idx])
         return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
-    
-    #get the circuit structure in q 
-    def get_original_circuit_structure(self):
-        #q*->q
-        qbit_map_inverse = {v:k for k,v in self.qubit_map.items()}
-        circuit_structure = []
-        for gate in self.original_circuit.get_Gates():
-            involved_qbits = gate.get_Involved_Qbits()
-            if len(involved_qbits) != 1:
-                circuit_structure.append((qbit_map_inverse[involved_qbits[0]],qbit_map_inverse[involved_qbits[1]]))
-        return circuit_structure
-        
-    def get_partition_synthesis_score(self):
-        score = np.inf
-        for topology_idx in range(self.topology_count):
-            cnot_count_topology = np.min(self.cnot_counts[topology_idx])#np.mean(self.cnot_counts[topology_idx])*0.5 + np.min(self.cnot_counts[topology_idx])*0.5
-            score = min(cnot_count_topology,score)
-        return score
-    
+
     def get_topology_candidates(self, topology_idx):
         """
         Get topology candidates for a given topology index, using cache if available.

From 9846f731de46d18cbfb4ca67a9a05d8e625519c1 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 17 Apr 2026 23:43:52 +0200
Subject: [PATCH 108/232] Fix

---
 squander/synthesis/PartAM.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 31b0b8b8b..70853c630 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -343,7 +343,8 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         for gates in selected_parts_gates[:n_multi]:
             c = Circuit(qbit_num_orig_circuit)
             for gate_idx in _get_topo_order({x: go[x] & gates for x in gates},
-                                            {x: rgo[x] & gates for x in gates}):
+                                            {x: rgo[x] & gates for x in gates},
+                                            gate_to_qubit):
                 c.add_Gate(gate_dict[gate_idx])
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
                 params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()])

From bee6380ecea9f5077082204131b4461320336d29 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 17 Apr 2026 23:48:13 +0200
Subject: [PATCH 109/232] Fix new error

---
 squander/synthesis/PartAM.py | 58 ++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 70853c630..27bc6faaf 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -951,23 +951,28 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         partition_order = []
         step = 0
 
-        for partition_idx in list(F):
-            if isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
+        # Drain initial single-qubit partitions from F, recursively resolving
+        # any single-qubit descendants that become ready.  Children that are
+        # multi-qubit are pushed into F for the main search loop.
+        queue = [p for p in F if isinstance(optimized_partitions[p], SingleQubitPartitionResult)]
+        while queue:
+            partition_idx = queue.pop()
+            if resolved_partitions[partition_idx]:
+                continue
+            if partition_idx in F:
                 F.remove(partition_idx)
-                single_qubit_part = optimized_partitions[partition_idx]
-                qubit = single_qubit_part.circuit.get_Qbits()[0]
-                single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
-                partition_order.append(single_qubit_part)
-
-                resolved_partitions[partition_idx] = True
-                children = list(DAG[partition_idx])
-                while len(children) !=0:
-                    child = children.pop(0)
-                    parents_resolved = True
-                    for parent in IDAG[child]:
-                        parents_resolved *= resolved_partitions[parent]
-                    if parents_resolved:
-                        F.append(child)
+            single_qubit_part = optimized_partitions[partition_idx]
+            qubit = single_qubit_part.circuit.get_Qbits()[0]
+            single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])}, max(D.shape))
+            partition_order.append(single_qubit_part)
+            resolved_partitions[partition_idx] = True
+            for child in DAG[partition_idx]:
+                if not resolved_partitions[child] and child not in F:
+                    if all(resolved_partitions[p] for p in IDAG[child]):
+                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                            queue.append(child)
+                        else:
+                            F.append(child)
 
         # Initialize progress bar
         total_partitions = len(DAG)
@@ -1095,14 +1100,23 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
         resolved_partitions = [False] * len(DAG)
         total_swaps = 0
 
-        # Resolve initial single-qubit partitions
-        for partition_idx in list(F):
-            if isinstance(optimized_partitions[partition_idx], SingleQubitPartitionResult):
+        # Resolve initial single-qubit partitions, recursively draining any
+        # single-qubit descendants.  Multi-qubit descendants go into F.
+        queue = [p for p in F if isinstance(optimized_partitions[p], SingleQubitPartitionResult)]
+        while queue:
+            partition_idx = queue.pop()
+            if resolved_partitions[partition_idx]:
+                continue
+            if partition_idx in F:
                 F.remove(partition_idx)
-                resolved_partitions[partition_idx] = True
-                for child in DAG[partition_idx]:
+            resolved_partitions[partition_idx] = True
+            for child in DAG[partition_idx]:
+                if not resolved_partitions[child] and child not in F:
                     if all(resolved_partitions[p] for p in IDAG[child]):
-                        F.append(child)
+                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                            queue.append(child)
+                        else:
+                            F.append(child)
 
         max_E_size = self.config.get('max_E_size', 20)
         max_lookahead = self.config.get('max_lookahead', 4)

From b3a651583b724eb858205e2960f80726053e0d3c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 17 Apr 2026 23:53:01 +0200
Subject: [PATCH 110/232] Fix another error + update toml

---
 pyproject.toml               | 3 ++-
 squander/synthesis/PartAM.py | 6 ++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3ec14e77b..1721b5129 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,8 @@ requires = [
     "tbb-devel; platform_machine == 'x86' or platform_machine == 'x86_64'",
     "cmake>=3.10.2",
     "networkx",
-    "qiskit"
+    "qiskit",
+    "tqdm"
 ]
 build-backend = "setuptools.build_meta"
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 27bc6faaf..ddcebe111 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -583,6 +583,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
+                cleanup_config['global_min'] = False
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
                 # Save single-qubit partition circuits before trial loop
@@ -629,7 +630,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                     )
                     pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0)
                     trial_circuit, trial_params = wco.OptimizeWideCircuit(
-                        trial_circuit.get_Flat_Circuit(), trial_params, global_min = False
+                        trial_circuit.get_Flat_Circuit(), trial_params
                     )
 
                     cost = trial_circuit.get_Gate_Nums().get('CNOT', 0)
@@ -701,9 +702,10 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
+                cleanup_config['global_min'] = False
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
                 final_circuit, final_parameters = wco.OptimizeWideCircuit(
-                    final_circuit.get_Flat_Circuit(), final_parameters, global_min = False
+                    final_circuit.get_Flat_Circuit(), final_parameters
                 )
 
         return final_circuit, final_parameters, pi_initial, pi

From d8fb4e16c9169d7e356c04c8bf210b4991e73958 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 18 Apr 2026 00:00:14 +0200
Subject: [PATCH 111/232] add tqdm to cli

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 67b7e4b97..b8cb1abf0 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -31,7 +31,7 @@ jobs:
     - name: Install Python Dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest
+        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm
         python -m pip install -e . -q
 
     - name: Build
@@ -113,7 +113,7 @@ jobs:
     - name: Install Python Dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest
+        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm
         python -m pip install -e . -q
 
     - name: Build
@@ -159,7 +159,7 @@ jobs:
     - name: Install Python Dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest
+        python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm
         python -m pip install -e . -q
 
     - name: Build

From 8951ef0ba680c7f01626fa217208bea9049fb196 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 18 Apr 2026 00:54:51 +0200
Subject: [PATCH 112/232] Remove silly test

---
 .../test_group_into_two_qubit_blocks.py       | 217 ------------------
 1 file changed, 217 deletions(-)
 delete mode 100644 tests/decomposition/test_group_into_two_qubit_blocks.py

diff --git a/tests/decomposition/test_group_into_two_qubit_blocks.py b/tests/decomposition/test_group_into_two_qubit_blocks.py
deleted file mode 100644
index ec0b45ed0..000000000
--- a/tests/decomposition/test_group_into_two_qubit_blocks.py
+++ /dev/null
@@ -1,217 +0,0 @@
-import pytest
-import numpy as np
-from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
-from squander.synthesis.PartAM_utils import group_into_two_qubit_blocks
-
-
-def _count_gates_by_qubit_num(circuit, qubit_num):
-    """Count gates with exactly `qubit_num` involved qubits, recursing into blocks."""
-    count = 0
-    for gate in circuit.get_Gates():
-        if isinstance(gate, Circuit):
-            count += _count_gates_by_qubit_num(gate, qubit_num)
-        else:
-            if len(gate.get_Involved_Qbits()) == qubit_num:
-                count += 1
-    return count
-
-
-def _get_params(circuit, seed=42):
-    np.random.seed(seed)
-    return np.random.uniform(0, 2 * np.pi, circuit.get_Parameter_Num())
-
-
-# ============================================================================
-# Structure tests
-# ============================================================================
-
-def test_all_top_level_elements_are_circuits():
-    c = Circuit(3)
-    c.add_H(0)
-    c.add_CNOT(0, 1)
-    c.add_RZ(1)
-    c.add_CNOT(1, 2)
-
-    result = group_into_two_qubit_blocks(c)
-    for gate in result.get_Gates():
-        assert isinstance(gate, Circuit)
-
-
-def test_each_block_has_exactly_one_two_qubit_gate():
-    c = Circuit(3)
-    c.add_H(0)
-    c.add_CNOT(0, 1)
-    c.add_RZ(1)
-    c.add_CNOT(1, 2)
-    c.add_H(2)
-
-    result = group_into_two_qubit_blocks(c)
-    for block in result.get_Gates():
-        two_qubit_count = sum(
-            1 for g in block.get_Gates()
-            if len(g.get_Involved_Qbits()) == 2
-        )
-        assert two_qubit_count == 1
-
-
-def test_block_count_equals_two_qubit_gate_count():
-    c = Circuit(4)
-    c.add_CNOT(0, 1)
-    c.add_CNOT(1, 2)
-    c.add_CNOT(2, 3)
-
-    result = group_into_two_qubit_blocks(c)
-    assert len(result.get_Gates()) == 3
-
-
-def test_only_2qubit_gates_each_block_has_one_gate():
-    """With no single-qubit gates, each block contains exactly the 2-qubit gate."""
-    c = Circuit(3)
-    c.add_CNOT(0, 1)
-    c.add_CNOT(1, 2)
-
-    result = group_into_two_qubit_blocks(c)
-    for block in result.get_Gates():
-        assert len(block.get_Gates()) == 1
-
-
-# ============================================================================
-# Gate count preservation tests
-# ============================================================================
-
-def test_total_single_qubit_gate_count_preserved():
-    c = Circuit(3)
-    c.add_H(0)
-    c.add_RZ(1)
-    c.add_CNOT(0, 1)
-    c.add_RZ(0)
-    c.add_CNOT(1, 2)
-    c.add_H(2)
-
-    result = group_into_two_qubit_blocks(c)
-    assert _count_gates_by_qubit_num(result, 1) == _count_gates_by_qubit_num(c, 1)
-
-
-def test_total_two_qubit_gate_count_preserved():
-    c = Circuit(4)
-    c.add_H(0)
-    c.add_CNOT(0, 1)
-    c.add_H(2)
-    c.add_CNOT(1, 2)
-    c.add_CNOT(2, 3)
-
-    result = group_into_two_qubit_blocks(c)
-    assert _count_gates_by_qubit_num(result, 2) == _count_gates_by_qubit_num(c, 2)
-
-
-# ============================================================================
-# Block membership tests
-# ============================================================================
-
-def test_leading_single_qubit_gates_in_first_block():
-    """Single-qubit gates before the first 2-qubit gate go into the first block."""
-    c = Circuit(2)
-    c.add_H(0)
-    c.add_H(1)
-    c.add_CNOT(0, 1)
-
-    result = group_into_two_qubit_blocks(c)
-    blocks = result.get_Gates()
-    assert len(blocks) == 1
-    assert len(blocks[0].get_Gates()) == 3  # H(0) + H(1) + CNOT
-
-
-def test_trailing_single_qubit_gates_in_last_block():
-    """Single-qubit gates after the last 2-qubit gate on a qubit go into that last block."""
-    c = Circuit(2)
-    c.add_CNOT(0, 1)
-    c.add_H(0)
-    c.add_RZ(1)
-
-    result = group_into_two_qubit_blocks(c)
-    blocks = result.get_Gates()
-    assert len(blocks) == 1
-    assert len(blocks[0].get_Gates()) == 3  # CNOT + H(0) + RZ(1)
-
-
-def test_interleaved_single_qubit_gates_split_correctly():
-    """Single-qubit gates between two 2-qubit gates go to the next block."""
-    c = Circuit(3)
-    c.add_CNOT(0, 1)   # block 0
-    c.add_H(0)          # -> block 1 (next 2-qubit gate involving q0)
-    c.add_RZ(1)         # -> block 1 (next 2-qubit gate involving q1)
-    c.add_CNOT(0, 1)   # block 1
-
-    result = group_into_two_qubit_blocks(c)
-    blocks = result.get_Gates()
-    assert len(blocks) == 2
-    assert len(blocks[0].get_Gates()) == 1  # only CNOT
-    assert len(blocks[1].get_Gates()) == 3  # H + RZ + CNOT
-
-
-# ============================================================================
-# Correctness (unitary equivalence) tests
-# ============================================================================
-
-def test_unitary_equivalence_cnot_chain():
-    c = Circuit(3)
-    c.add_CNOT(0, 1)
-    c.add_CNOT(1, 2)
-    c.add_CNOT(0, 2)
-
-    result = group_into_two_qubit_blocks(c)
-    params = _get_params(c)
-    assert np.allclose(c.get_Matrix(params), result.get_Matrix(params), atol=1e-10)
-
-
-def test_unitary_equivalence_with_single_qubit_gates():
-    c = Circuit(3)
-    c.add_H(0)
-    c.add_RZ(1)
-    c.add_CNOT(0, 1)
-    c.add_H(1)
-    c.add_CNOT(1, 2)
-    c.add_H(2)
-
-    result = group_into_two_qubit_blocks(c)
-    params = _get_params(c)
-    assert np.allclose(c.get_Matrix(params), result.get_Matrix(params), atol=1e-10)
-
-
-@pytest.mark.parametrize("N", [2, 3, 4])
-def test_unitary_equivalence_parametric(N):
-    c = Circuit(N)
-    c.add_RZ(0)
-    c.add_CNOT(0, 1)
-    c.add_RY(1)
-    if N > 2:
-        c.add_RZ(2)
-        c.add_CNOT(1, 2)
-    if N > 3:
-        c.add_RZ(3)
-        c.add_CNOT(2, 3)
-
-    result = group_into_two_qubit_blocks(c)
-    params = _get_params(c)
-    assert np.allclose(c.get_Matrix(params), result.get_Matrix(params), atol=1e-10)
-
-
-# ============================================================================
-# Edge cases
-# ============================================================================
-
-def test_empty_circuit_returns_empty():
-    c = Circuit(3)
-    result = group_into_two_qubit_blocks(c)
-    assert len(result.get_Gates()) == 0
-
-
-def test_single_two_qubit_gate_no_singles():
-    c = Circuit(2)
-    c.add_CNOT(0, 1)
-
-    result = group_into_two_qubit_blocks(c)
-    blocks = result.get_Gates()
-    assert len(blocks) == 1
-    assert isinstance(blocks[0], Circuit)
-    assert len(blocks[0].get_Gates()) == 1

From c1e81be303ea0e1b1dc6705e4af2002bd8e43944 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 18 Apr 2026 01:04:44 +0200
Subject: [PATCH 113/232] Ease verbosity

---
 .../qgd_Wide_Circuit_Optimization.py          | 43 ++++++++++++-------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index d9648fa09..372004f2d 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1467,7 +1467,7 @@ def DecomposePartition(
                 parameters = cDecompose.get_Optimized_Parameters()
                 err = cDecompose.Optimization_Problem(parameters)
                 it += 1
-            if err > tolerance or it != 0:
+            if (err > tolerance or it != 0) and config.get("verbosity", 0) >= 1:
                 print("Decomposition error: ", err, it)
         else:
             err = cDecompose.get_Decomposition_Error()
@@ -1893,13 +1893,15 @@ def OptimizeWideCircuit(
             circ, self.config["topology"]
         ):
 
-            print("fixing topology in the circuit")
+            if self.config["verbosity"] >= 1:
+                print("fixing topology in the circuit")
             topo = self.config["topology"]
             self.config["topology"] = None
             strat = self.config["strategy"]
             self.config["strategy"] = self.config["pre-opt-strategy"]
 
-            print("Optimizing circuit with all-to-all (a2a) connectivity")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with all-to-all (a2a) connectivity")
             circ, parameters = self.OptimizeWideCircuit(circ, parameters)
             self.config["all_to_all_optimization_time"] = self.config[
                 "optimization_time"
@@ -1910,17 +1912,20 @@ def OptimizeWideCircuit(
             self.config["topology"] = topo
             start_time = time.time()
 
-            print("Routing circuit to fix the topology")
+            if self.config["verbosity"] >= 1:
+                print("Routing circuit to fix the topology")
             circ, parameters = self.route_circuit(circ, parameters)
             self.config["routing_time"] = time.time() - start_time
             self.config["routed_circuit"] = circ
             self.config["routed_parameters"] = parameters
         else:
-            print("No additional routing is needed on the circuit")
+            if self.config["verbosity"] >= 1:
+                print("No additional routing is needed on the circuit")
 
         start_time = time.time()
         if self.config["strategy"] == "bqskit":
-            print("Optimizing circuit with BQSkit")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with BQSkit")
             from squander import Qiskit_IO
             from bqskit import compile
 
@@ -1984,12 +1989,14 @@ def OptimizeWideCircuit(
             qgd_Wide_Circuit_Optimization.check_valid_routing(
                 newcirc, self.config["topology"]
             )
-            print("OptimizeWideCircuit::check_compare_circuits")
+            if self.config["verbosity"] >= 2:
+                print("OptimizeWideCircuit::check_compare_circuits")
             self.check_compare_circuits(circ, parameters, newcirc, newparameters)
             circ, parameters = newcirc, newparameters
 
         elif self.config["strategy"] == "qiskit":
-            print("Optimizing circuit with Qiskit")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with Qiskit")
             from squander import Qiskit_IO
             from qiskit import transpile
             from qiskit.transpiler import CouplingMap
@@ -2020,12 +2027,14 @@ def OptimizeWideCircuit(
             qgd_Wide_Circuit_Optimization.check_valid_routing(
                 newcirc, self.config["topology"]
             )
-            print("OptimizeWideCircuit::check_compare_circuits")
+            if self.config["verbosity"] >= 2:
+                print("OptimizeWideCircuit::check_compare_circuits")
             self.check_compare_circuits(circ, parameters, newcirc, newparameters)
             circ, parameters = newcirc, newparameters
         else:
 
-            print("Optimizing circuit with Squander")
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with Squander")
             part_size_start = self.max_partition_size
             part_size_end = self.max_partition_size
             if self.config.get("use_osr", False) or self.config.get(
@@ -2101,7 +2110,7 @@ def InnerOptimizeWideCircuit(
 
         in_parent = parent_process() is not None
 
-        if not in_parent:
+        if not in_parent and self.config["verbosity"] >= 1:
             print(len(subcircuits), "partitions found to optimize")
 
         # the list of optimized subcircuits
@@ -2148,7 +2157,7 @@ def process_result(partition_idx):
                     else async_results[partition_idx].get(timeout=None)
                 )
 
-                if subcircuit != new_subcircuit:
+                if subcircuit != new_subcircuit and self.config["verbosity"] >= 2:
                     print(
                         "original subcircuit:    ",
                         subcircuit.get_Gate_Nums(),
@@ -2172,7 +2181,7 @@ def process_result(partition_idx):
                             trim_subcirc, trim_parameters
                         )
                     ] = (trim_subcirc, trim_parameters)
-            if total_opt[0] % 100 == 99:
+            if total_opt[0] % 100 == 99 and self.config["verbosity"] >= 1:
                 print(total_opt[0] + 1, "partitions optimized")
             total_opt[0] += 1
             optimized_subcircuits[partition_idx] = new_subcircuit
@@ -2283,14 +2292,15 @@ def process_result(partition_idx):
             cast(List[List[np.ndarray]], optimized_parameter_list),
         )
 
-        if not in_parent:
+        if not in_parent and self.config["verbosity"] >= 1:
             print("original circuit:    ", circ.get_Gate_Nums())
             print("reoptimized circuit: ", wide_circuit.get_Gate_Nums())
 
         qgd_Wide_Circuit_Optimization.check_valid_routing(
             wide_circuit, self.config["topology"]
         )
-        print("InnerOptimizeWideCircuit: check_compare_circuits")
+        if self.config["verbosity"] >= 2:
+            print("InnerOptimizeWideCircuit: check_compare_circuits")
         self.check_compare_circuits(
             circ, orig_parameters, wide_circuit, wide_parameters
         )
@@ -2621,7 +2631,8 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             Squander_remapped_circuit, self.config["topology"]
         )
 
-        print("cheking circuit after routing")
+        if self.config["verbosity"] >= 2:
+            print("cheking circuit after routing")
         self.check_compare_circuits(
             circ,
             orig_parameters,

From ff244a4396c74111e9956f17f264d86f4bc20bd6 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 18 Apr 2026 11:02:30 +0200
Subject: [PATCH 114/232] Try to rework cost function

---
 squander/synthesis/PartAM.py | 55 +++++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 16 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ddcebe111..c737693b2 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -243,36 +243,53 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
     def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None, max_retries: int = 5) -> Circuit:
         """
         Call to decompose a partition. Retries up to max_retries times if the
-        decomposition error exceeds the configured tolerance.
+        decomposition error exceeds the configured tolerance.  Returns the
+        best-error attempt across all retries and logs a warning when no
+        attempt reaches ``config["tolerance"]``.
         """
         tolerance = config["tolerance"]
         strategy = config["strategy"]
 
+        best_err = float('inf')
+        best_circuit = None
+        best_params = None
+
         for attempt in range(max_retries):
             if strategy == "TreeSearch":
-                cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
+                cDecompose = N_Qubit_Decomposition_Tree_Search(Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
             elif strategy == "TabuSearch":
-                cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology )
+                cDecompose = N_Qubit_Decomposition_Tabu_Search(Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
             elif strategy == "Adaptive":
-                cDecompose = N_Qubit_Decomposition_adaptive( Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology )
+                cDecompose = N_Qubit_Decomposition_adaptive(Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology)
             else:
                 raise Exception(f"Unsupported decomposition type: {strategy}")
-            cDecompose.set_Verbose( config["verbosity"] )
-            cDecompose.set_Cost_Function_Variant( 3 )
-            cDecompose.set_Optimization_Tolerance( tolerance )
-            cDecompose.set_Optimizer( config["optimizer"] )
+            cDecompose.set_Verbose(config["verbosity"])
+            cDecompose.set_Cost_Function_Variant(3)
+            cDecompose.set_Optimization_Tolerance(tolerance)
+            cDecompose.set_Optimizer(config["optimizer"])
             cDecompose.Start_Decomposition()
 
             err = cDecompose.get_Decomposition_Error()
-            if err <= tolerance:
-                break
+            if err < best_err:
+                best_err = err
+                best_circuit = cDecompose.get_Circuit()
+                best_params = cDecompose.get_Optimized_Parameters()
 
-            if attempt >= max_retries - 1:
+            if best_err <= tolerance:
                 break
 
-        squander_circuit = cDecompose.get_Circuit()
-        parameters       = cDecompose.get_Optimized_Parameters()
-        return squander_circuit, parameters
+        if best_err > tolerance:
+            N = int(np.log2(Umtx.shape[0]))
+            logging.warning(
+                "DecomposePartition_and_Perm: %d-qubit partition on topology %s "
+                "did not reach tolerance %.2e after %d retries "
+                "(best error %.2e). Returning best attempt; final circuit error "
+                "may be elevated.",
+                N, list(mini_topology) if mini_topology else None,
+                tolerance, max_retries, best_err,
+            )
+
+        return best_circuit, best_params
 
     # ------------------------------------------------------------------------
     # Circuit Synthesis
@@ -312,8 +329,14 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
-        # ---- Phase 2: Minimum-count ILP partition selection ----
-        L_parts, _ = ilp_global_optimal(allparts, g)
+        # ---- Phase 2: ILP partition selection ----
+        # 2-qubit partitions are free (weight 0) since they are trivially
+        # synthesized as themselves; 3+ qubit partitions cost 1.
+        weights = [
+            0 if len({q for gate in part for q in gate_to_qubit[gate]}) == 2 else 1
+            for part in allparts
+        ]
+        L_parts, _ = ilp_global_optimal(allparts, g, weights=weights)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----
         selected_surrounded_starts = set()

From da0685350aaebf5415645824e7febbcbae85a915 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 18 Apr 2026 13:29:42 +0200
Subject: [PATCH 115/232] Fix single-qubit partition remapping to use original
 qubits

Track original qubits in SingleQubitPartitionResult so that remapping
uses the correct qubit index from the physical mapping instead of the
circuit-internal qubit index.
---
 squander/synthesis/PartAM.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index c737693b2..66b8ebfde 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -398,7 +398,8 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
             if qbit_num_sub == 1:
                 optimized_results[partition_idx] = SingleQubitPartitionResult(
-                    remapped_subcircuit, subcircuit_parameters
+                    remapped_subcircuit, subcircuit_parameters,
+                    original_qubits=list(involved_qbits)
                 )
                 partition_meta.append(None)
             else:
@@ -987,8 +988,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
             if partition_idx in F:
                 F.remove(partition_idx)
             single_qubit_part = optimized_partitions[partition_idx]
-            qubit = single_qubit_part.circuit.get_Qbits()[0]
-            single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits({int(qubit): int(pi[qubit])}, max(D.shape))
+            original_qubit = int(single_qubit_part.involved_qbits[0])
+            circuit_qubit = int(single_qubit_part.circuit.get_Qbits()[0])
+            single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits({circuit_qubit: int(pi[original_qubit])}, max(D.shape))
             partition_order.append(single_qubit_part)
             resolved_partitions[partition_idx] = True
             for child in DAG[partition_idx]:
@@ -1098,8 +1100,9 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                     if (not resolved_partitions[child] and child not in F) and parents_resolved:
                         if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
                             child_partition = optimized_partitions[child]
-                            qubit = child_partition.circuit.get_Qbits()[0]
-                            child_partition.circuit = child_partition.circuit.Remap_Qbits({int(qubit): int(pi[qubit])},max(D.shape))
+                            original_qubit = int(child_partition.involved_qbits[0])
+                            circuit_qubit = int(child_partition.circuit.get_Qbits()[0])
+                            child_partition.circuit = child_partition.circuit.Remap_Qbits({circuit_qubit: int(pi[original_qubit])},max(D.shape))
                             partition_order.append(child_partition)
                             resolved_partitions[child] = True
                             resolved_count = sum(resolved_partitions)

From 5ba551dcf948637eb3b74f580989b3136f9d45a2 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 18 Apr 2026 13:40:26 +0200
Subject: [PATCH 116/232] Add original_qubits parameter to
 SingleQubitPartitionResult

---
 squander/synthesis/PartAM_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 62969fd1b..12db1f65c 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -375,10 +375,10 @@ def derive_result_from_automorphism(sigma, P_i, P_o, circuit, parameters, N):
 
 class SingleQubitPartitionResult:
 
-    def __init__(self, circuit_in, parameters_in):
+    def __init__(self, circuit_in, parameters_in, original_qubits=None):
         self.circuit = circuit_in
         self.parameters = parameters_in
-        self.involved_qbits = circuit_in.get_Qbits()
+        self.involved_qbits = original_qubits if original_qubits is not None else circuit_in.get_Qbits()
 
 # Virtual qubits q, reduced virtual qubits (the remapped circuit only up to partition_size) q*
 # Physical qubits Q, reduced physical qubits Q*

From e6af0ba1600cca615bb9af933855c96fb96ad054 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 18 Apr 2026 18:31:06 +0200
Subject: [PATCH 117/232] Try to improve Routing

---
 squander/synthesis/PartAM.py       | 50 +++++++++++++++++-------------
 squander/synthesis/PartAM_utils.py | 48 +++++++++++++++++++++-------
 2 files changed, 65 insertions(+), 33 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 66b8ebfde..9ee790c21 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -506,7 +506,8 @@ def _run_parallel_synthesis(self, partition_meta):
                     N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                 )
 
-            # ---- Stage 2: fix best P_i from Stage 1, sweep all P_o ----
+            # ---- Stage 2: fix top-k P_i from Stage 1, sweep all P_o ----
+            top_k_pi = self.config.get('top_k_pi', 1)
             stage2_futures = []
             stage2_cached = []
 
@@ -517,38 +518,38 @@ def _run_parallel_synthesis(self, partition_meta):
                 perms_all = list(permutations(range(N)))
                 result = results_map[partition_idx]
                 for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
-                    P_i_best, _ = result.get_best_result(topology_idx)[0]
                     pair_key = (partition_idx, topology_idx)
                     kp = known_pairs.get(pair_key, set()) if use_auts else set()
-                    for P_o in perms_all:
-                        if use_auts and (tuple(P_i_best), P_o) in kp:
-                            continue
-                        Umtx = self._build_permuted_unitary(meta, P_i_best, P_o)
-                        ck = self._cache_key(Umtx, mini_topology)
-                        if ck in decomp_cache:
-                            stage2_cached.append((partition_idx, topology_idx, P_i_best, P_o, ck))
-                        else:
-                            future = pool.apply_async(
-                                _decompose_one, (Umtx, mini_topology)
-                            )
-                            stage2_futures.append((partition_idx, topology_idx, P_i_best, P_o, ck, future))
+                    for P_i_cand in result.get_top_k_results(topology_idx, top_k_pi):
+                        for P_o in perms_all:
+                            if use_auts and (tuple(P_i_cand), P_o) in kp:
+                                continue
+                            Umtx = self._build_permuted_unitary(meta, P_i_cand, P_o)
+                            ck = self._cache_key(Umtx, mini_topology)
+                            if ck in decomp_cache:
+                                stage2_cached.append((partition_idx, topology_idx, P_i_cand, P_o, ck))
+                            else:
+                                future = pool.apply_async(
+                                    _decompose_one, (Umtx, mini_topology)
+                                )
+                                stage2_futures.append((partition_idx, topology_idx, P_i_cand, P_o, ck, future))
 
             # Process Stage 2 cache hits
-            for partition_idx, topology_idx, P_i_best, P_o, ck in stage2_cached:
+            for partition_idx, topology_idx, P_i_cand, P_o, ck in stage2_cached:
                 meta = partition_meta[partition_idx]
                 N = meta['N']
                 mini_topology = meta['mini_topologies'][topology_idx]
                 synth_circuit, synth_params = decomp_cache[ck]
                 pair_key = (partition_idx, topology_idx)
                 self._add_result_with_auts(
-                    results_map[partition_idx], (tuple(P_i_best), P_o),
+                    results_map[partition_idx], (tuple(P_i_cand), P_o),
                     synth_circuit, synth_params, topology_idx,
                     N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                 )
 
             # Collect Stage 2 pool results
             cache_hits_s2 = len(stage2_cached)
-            for partition_idx, topology_idx, P_i_best, P_o, ck, future in tqdm(
+            for partition_idx, topology_idx, P_i_cand, P_o, ck, future in tqdm(
                 stage2_futures, desc=f"Stage 2 Synthesis ({cache_hits_s2} cached)",
                 disable=disable_pbar
             ):
@@ -559,7 +560,7 @@ def _run_parallel_synthesis(self, partition_meta):
                 mini_topology = meta['mini_topologies'][topology_idx]
                 pair_key = (partition_idx, topology_idx)
                 self._add_result_with_auts(
-                    results_map[partition_idx], (tuple(P_i_best), P_o),
+                    results_map[partition_idx], (tuple(P_i_cand), P_o),
                     synth_circuit, synth_params, topology_idx,
                     N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                 )
@@ -757,8 +758,9 @@ def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=Fals
         """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
         if len(partition_candidates) <= top_k:
             return partition_candidates
+        local_cost_weight = self.config.get('local_cost_weight', 0.1)
         estimates = np.array([
-            pc.estimate_swap_count(pi, D, reverse=reverse) * 3 + 0.1 * len(pc.circuit_structure)
+            pc.estimate_swap_count(pi, D, reverse=reverse) * 3 + local_cost_weight * pc.cnot_count
             for pc in partition_candidates
         ])
         top_k_indices = np.argpartition(estimates, top_k)[:top_k]
@@ -874,6 +876,7 @@ def _select_with_lookahead(self, partition_candidates, scores, pi, F,
                                 alpha=E_alpha, weight=neighbor_weight,
                             ),
                             E_overlap_floor=E_overlap_floor,
+                            local_cost_weight=self.config.get('local_cost_weight', 0.1),
                         )
                         for pc in next_candidates
                     ]
@@ -1057,6 +1060,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             congestion=congestion,
                             betweenness=betweenness,
                             congestion_weight=congestion_weight,
+                            local_cost_weight=self.config.get('local_cost_weight', 0.1),
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -1192,6 +1196,7 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
                     congestion=congestion,
                     betweenness=betweenness,
                     congestion_weight=congestion_weight,
+                    local_cost_weight=self.config.get('local_cost_weight', 0.1),
                 )
                 for pc in partition_candidates
             ]
@@ -1337,12 +1342,13 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
                                   neighbor_data=None, adj=None, neighbor_info=None,
                                   E_overlap_floor=0.2,
-                                  congestion=None, betweenness=None, congestion_weight=0.0):
+                                  congestion=None, betweenness=None, congestion_weight=0.0,
+                                  local_cost_weight=0.1):
         score = 0
         swap_weight = 1
         swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, reverse=reverse, adj=adj, neighbor_info=neighbor_info)
         score += swap_weight * len(swaps) * 3
-        score += 0.1*len(partition_candidate.circuit_structure)
+        score += local_cost_weight * partition_candidate.cnot_count
 
         # Congestion penalty: penalize SWAP paths through congested bottleneck nodes
         if congestion is not None and betweenness is not None and congestion_weight > 0 and swaps:
@@ -1532,7 +1538,7 @@ def obtain_partition_candidates(self, F, optimized_partitions):
                     topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
-                        partition_candidates.append(PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits))
+                        partition_candidates.append(PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits,cnot_count=partition.cnot_counts[tdx][pdx]))
         return partition_candidates
 
     # ------------------------------------------------------------------------
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 12db1f65c..dc856851b 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -154,15 +154,17 @@ def neighbor_heuristic(n_pos):
                 if visited.get(new_positions, float('inf')) <= new_g:
                     continue
 
-                # Update neighbor qubit positions: partition qubit at p
-                # moves to nb, displacing whatever was at nb to p.
+                # Bug B fix: update neighbor positions for BOTH sides of the swap.
+                # A neighbor qubit at nb gets displaced to p, AND a neighbor qubit
+                # at p (if it's also tracked, e.g. overlaps with a partition qubit)
+                # moves to nb.
                 if use_neighbor:
+                    new_n_pos = list(n_pos)
                     if nb in n_phys_to_idx:
-                        new_n_pos = list(n_pos)
                         new_n_pos[n_phys_to_idx[nb]] = p
-                        new_n_pos = tuple(new_n_pos)
-                    else:
-                        new_n_pos = n_pos
+                    if p in n_phys_to_idx:
+                        new_n_pos[n_phys_to_idx[p]] = nb
+                    new_n_pos = tuple(new_n_pos)
                     new_nh = n_weight * neighbor_heuristic(new_n_pos)
                 else:
                     new_n_pos = n_pos
@@ -425,6 +427,23 @@ def get_best_result(self, topology_idx):
         best_index = np.argmin(self.cnot_counts[topology_idx])
         return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index]
 
+    def get_top_k_results(self, topology_idx, k):
+        counts = self.cnot_counts[topology_idx]
+        pairs = self.permutations_pairs[topology_idx]
+        if not counts:
+            return []
+        indices = np.argsort(counts)
+        seen_pi = set()
+        result = []
+        for i in indices:
+            pi_key = tuple(pairs[i][0])
+            if pi_key not in seen_pi:
+                seen_pi.add(pi_key)
+                result.append(pairs[i][0])
+                if len(result) >= k:
+                    break
+        return result
+
     def get_topology_candidates(self, topology_idx):
         """
         Get topology candidates for a given topology index, using cache if available.
@@ -463,7 +482,7 @@ def get_topology_candidates(self, topology_idx):
 
 class PartitionCandidate:
     
-    def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits):
+    def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits, cnot_count=0):
         #Which partition does this belong to
         self.partition_idx = partition_idx
         #the index of the Q* topology
@@ -484,6 +503,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         self.qbit_map = qbit_map
         # q belonging to the original circuit
         self.involved_qbits = involved_qbits
+        self.cnot_count = cnot_count
         # {Q*:Q}
         self.node_mapping = get_node_mapping(mini_topology, topology)
 
@@ -509,18 +529,24 @@ def transform_pi(self, pi, D, swap_cache=None, reverse=False, adj=None, neighbor
         pi_list = [int(x) for x in pi]
         n = len(pi_list)
 
-        # Check cache if provided
-        if swap_cache is not None:
+        # Check cache if provided (Bug A: skip cache when neighbor heuristic is active,
+        # since cached paths were computed with different future context)
+        use_cache = (neighbor_info is None or
+                      neighbor_info.get('weight', 0) == 0 or
+                      not neighbor_info.get('edges', []))
+        if swap_cache is not None and use_cache:
             pi_tuple = tuple(pi_list)
             qbit_map_frozen = frozenset(qbit_map_input.items())
             cache_key = (pi_tuple, qbit_map_frozen)
             if cache_key in swap_cache:
                 swaps, pi_init = swap_cache[cache_key]
             else:
-                swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
+                swaps, pi_init = find_constrained_swaps_partial(
+                    pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
                 swap_cache[cache_key] = (swaps, pi_init)
         else:
-            swaps, pi_init = find_constrained_swaps_partial(pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
+            swaps, pi_init = find_constrained_swaps_partial(
+                pi_list, qbit_map_input, D, adj=adj, neighbor_info=neighbor_info)
 
         pi_output = pi_init.copy()
         qbit_map_inverse = {v: k for k, v in self.qbit_map.items()}

From 68ff536163245e216e6386f2d1099e75b6e27847 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 19 Apr 2026 00:36:18 +0200
Subject: [PATCH 118/232] Streamline routing: remove valve from layout-only
 mode, drop dead config keys

Disable release valve in Heuristic_Search_routed (layout-only trials
don't need forced progress). Remove pointless _swap_cache.clear() calls
that discard valid pi-keyed entries. Strip dead config defaults
(neighbor_weight, E_overlap_floor, branch_budget, branch_threshold,
congestion_weight, congestion_decay, hs_score_workers) and unused os
import.
---
 squander/synthesis/PartAM.py | 650 ++++++++++-------------------------
 1 file changed, 180 insertions(+), 470 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 9ee790c21..f60778448 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -3,7 +3,6 @@
 """
 import logging
 import multiprocessing as mp
-import os
 import time
 from collections import deque, defaultdict
 from itertools import permutations
@@ -86,19 +85,12 @@ def __init__(self, config):
         self.config.setdefault('bh_interval', 50)
         self.config.setdefault('bh_target_accept_rate', 0.5)
         self.config.setdefault('bh_stepwise_factor', 0.9)
-        self.config.setdefault('hs_score_workers', os.cpu_count() or 1)
         self.config.setdefault('use_osr', 0)
         self.config.setdefault('n_layout_trials', 1)
         self.config.setdefault('score_tolerance', 0.05)
         self.config.setdefault('random_seed', 42)
         self.config.setdefault('cleanup', True)
         self.config.setdefault('prefilter_top_k', 50)
-        self.config.setdefault('neighbor_weight', 0.5)
-        self.config.setdefault('E_overlap_floor', 0.2)
-        self.config.setdefault('branch_budget', 3)
-        self.config.setdefault('branch_threshold', 0.1)
-        self.config.setdefault('congestion_weight', 0.1)
-        self.config.setdefault('congestion_decay', 0.9)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -766,212 +758,89 @@ def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=Fals
         top_k_indices = np.argpartition(estimates, top_k)[:top_k]
         return [partition_candidates[i] for i in top_k_indices]
 
-    def _select_with_lookahead(self, partition_candidates, scores, pi, F,
-                               DAG, IDAG, resolved_partitions,
-                               optimized_partitions, scoring_partitions, D,
-                               E_W, E_alpha, E_overlap_floor,
-                               neighbor_data, neighbor_weight,
-                               reverse=False, rng=None):
-        """1-step lookahead branching: when top candidates are close in score,
-        tentatively commit each, score one step ahead, and pick the best 2-step total.
-
-        Falls back to _select_best_candidate when there is a clear winner or
-        branching is disabled (branch_budget <= 1).
-        """
-        branch_budget = self.config.get('branch_budget', 3)
-        branch_threshold = self.config.get('branch_threshold', 0.1)
-
-        if branch_budget <= 1:
-            return self._select_best_candidate(partition_candidates, scores, rng=rng)
-
-        scores_array = np.array(scores)
-        min_score = np.min(scores_array)
-
-        # Find candidates within threshold of best
-        if min_score > 0:
-            threshold = min_score * (1 + branch_threshold)
-        else:
-            threshold = branch_threshold
-        close_indices = np.where(scores_array <= threshold)[0]
-
-        if len(close_indices) <= 1:
-            return self._select_best_candidate(partition_candidates, scores, rng=rng)
-
-        # Limit to branch_budget
-        if len(close_indices) > branch_budget:
-            # Keep the top branch_budget by score
-            sorted_close = close_indices[np.argsort(scores_array[close_indices])]
-            close_indices = sorted_close[:branch_budget]
-
-        # Evaluate each branch one step ahead
-        best_branch_score = float('inf')
-        best_candidate = None
-        top_k = self.config.get('prefilter_top_k', 50)
-
-        for idx in close_indices:
-            candidate = partition_candidates[idx]
-            candidate_score = scores_array[idx]
-
-            # Tentatively apply this candidate's routing
-            temp_swap_cache = {}
-            neighbor_info = self._compute_neighbor_info(
-                candidate, tuple(F), None, neighbor_data, pi,
-                alpha=E_alpha, weight=neighbor_weight,
-            )
-            swaps, pi_next = candidate.transform_pi(
-                pi, D, temp_swap_cache, reverse=reverse,
-                adj=self._adj, neighbor_info=neighbor_info,
-            )
-
-            # Compute tentative front layer after committing this candidate
-            F_next = [p for p in F if p != candidate.partition_idx]
-            temp_resolved = list(resolved_partitions)
-            temp_resolved[candidate.partition_idx] = True
-
-            # Promote children (skip single-qubit partitions)
-            for child in DAG[candidate.partition_idx]:
-                if not temp_resolved[child] and child not in F_next:
-                    if all(temp_resolved[p] for p in IDAG[child]):
-                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
-                            temp_resolved[child] = True
-                            stack = list(DAG[child])
-                            while stack:
-                                gc = stack.pop()
-                                if not temp_resolved[gc] and gc not in F_next:
-                                    if all(temp_resolved[p] for p in IDAG[gc]):
-                                        if isinstance(optimized_partitions[gc], SingleQubitPartitionResult):
-                                            temp_resolved[gc] = True
-                                            stack.extend(DAG[gc])
-                                        else:
-                                            F_next.append(gc)
-                        else:
-                            F_next.append(child)
-
-            if not F_next:
-                # No next step — use candidate score alone
-                branch_score = candidate_score
-            else:
-                # Generate and score next-step candidates
-                next_candidates = self.obtain_partition_candidates(F_next, optimized_partitions)
-                if next_candidates:
-                    next_candidates = self._prefilter_candidates(
-                        next_candidates, pi_next, D, top_k, reverse=reverse
-                    )
-                    F_next_snapshot = tuple(F_next)
-                    E_next = self.generate_extended_set(
-                        F_next, DAG, IDAG, temp_resolved, optimized_partitions,
-                        max_E_size=self.config.get('max_E_size', 20),
-                        max_lookahead=self.config.get('max_lookahead', 4),
-                    )
-                    next_scores = [
-                        self.score_partition_candidate(
-                            pc, F_next_snapshot, pi_next, scoring_partitions, D,
-                            temp_swap_cache,
-                            E=E_next, W=E_W, alpha=E_alpha,
-                            reverse=reverse,
-                            neighbor_data=neighbor_data,
-                            adj=self._adj,
-                            neighbor_info=self._compute_neighbor_info(
-                                pc, F_next_snapshot, E_next, neighbor_data, pi_next,
-                                alpha=E_alpha, weight=neighbor_weight,
-                            ),
-                            E_overlap_floor=E_overlap_floor,
-                            local_cost_weight=self.config.get('local_cost_weight', 0.1),
-                        )
-                        for pc in next_candidates
-                    ]
-                    branch_score = candidate_score + min(next_scores)
-                else:
-                    branch_score = candidate_score
-
-            if branch_score < best_branch_score:
-                best_branch_score = branch_score
-                best_candidate = candidate
-
-        return best_candidate
+    def _bfs_shortest_path(self, src, dst):
+        """BFS shortest path on self._adj. Returns list of physical nodes
+        from src to dst (inclusive); empty list if unreachable."""
+        if src == dst:
+            return [src]
+        parent = {src: None}
+        q = deque([src])
+        while q:
+            node = q.popleft()
+            for nb in self._adj[node]:
+                if nb in parent:
+                    continue
+                parent[nb] = node
+                if nb == dst:
+                    path = [dst]
+                    while parent[path[-1]] is not None:
+                        path.append(parent[path[-1]])
+                    path.reverse()
+                    return path
+                q.append(nb)
+        return []
 
     @staticmethod
-    def _compute_neighbor_info(partition_candidate, F, E, neighbor_data, pi,
-                               alpha=0.9, weight=0.01):
-        """Build neighbor_info dict for SABRE-aware A* tiebreaker.
-
-        Collects virtual qubit edges from front-layer and extended-set partitions
-        (excluding the current partition) so the A* can prefer SWAP paths that
-        leave future-partition qubits closer together.
+    def _apply_swaps_to_pi(pi, swaps):
+        """Return a new pi after applying a list of (phys_a, phys_b) swaps."""
+        pi_new = [int(x) for x in pi]
+        n = len(pi_new)
+        p2v = [0] * n
+        for q in range(n):
+            p2v[pi_new[q]] = q
+        for P1, P2 in swaps:
+            q1, q2 = p2v[P1], p2v[P2]
+            p2v[P1], p2v[P2] = q2, q1
+            pi_new[q1], pi_new[q2] = P2, P1
+        return pi_new
+
+    def _release_valve(self, F, pi, D, canonical_data):
+        """Force progress on the easiest F partition's hardest pair.
+
+        Picks the F partition whose worst-pair distance under pi is smallest
+        (cheapest to bridge). BFS-routes that pair along the shortest path,
+        applying swaps from both ends toward the middle — LightSABRE §II.7.
+
+        Returns (swap_list, pi_new). Empty swap list if everything is already
+        adjacent or no eligible partition exists.
         """
-        if weight == 0 or neighbor_data is None:
-            return None
-
-        own_qubits = set(partition_candidate.involved_qbits)
-        # Collect weighted edges: (virtual_q_u, virtual_q_v, edge_weight)
-        raw_edges = []
-
-        # Front layer partitions (weight 1.0)
-        for part_idx in F:
-            if part_idx == partition_candidate.partition_idx:
-                continue
-            entry = neighbor_data.get(part_idx)
-            if entry is None:
+        best = None
+        for p_idx in F:
+            entry = canonical_data.get(p_idx)
+            if entry is None or entry['edges_u'] is None:
                 continue
-            cnot_arr, q_u_arr, q_v_arr = entry
-            if q_u_arr is None:
+            eu, ev = entry['edges_u'], entry['edges_v']
+            worst_d = 0
+            worst_pair = None
+            for i in range(len(eu)):
+                u, v = int(eu[i]), int(ev[i])
+                d = D[int(pi[u])][int(pi[v])]
+                if d > worst_d:
+                    worst_d = d
+                    worst_pair = (u, v)
+            if worst_d <= 1 or worst_pair is None:
                 continue
-            # Use the best (min-CNOT) permutation's edges
-            best_pdx = int(np.argmin(cnot_arr))
-            for e in range(q_u_arr.shape[1]):
-                qu, qv = int(q_u_arr[best_pdx, e]), int(q_v_arr[best_pdx, e])
-                if qu == qv:  # padding
-                    continue
-                if qu not in own_qubits or qv not in own_qubits:
-                    raw_edges.append((qu, qv, 1.0))
+            if best is None or worst_d < best[0] or (worst_d == best[0] and p_idx < best[1]):
+                best = (worst_d, p_idx, worst_pair[0], worst_pair[1])
 
-        # Extended set partitions (weight alpha^depth)
-        if E:
-            for part_idx, depth in E:
-                if part_idx == partition_candidate.partition_idx:
-                    continue
-                entry = neighbor_data.get(part_idx)
-                if entry is None:
-                    continue
-                cnot_arr, q_u_arr, q_v_arr = entry
-                if q_u_arr is None:
-                    continue
-                best_pdx = int(np.argmin(cnot_arr))
-                ew = alpha ** depth
-                for e in range(q_u_arr.shape[1]):
-                    qu, qv = int(q_u_arr[best_pdx, e]), int(q_v_arr[best_pdx, e])
-                    if qu == qv:
-                        continue
-                    if qu not in own_qubits or qv not in own_qubits:
-                        raw_edges.append((qu, qv, ew))
-
-        if not raw_edges:
-            return None
-
-        # Build ordered list of unique neighbor virtual qubits
-        vq_set = set()
-        for qu, qv, _ in raw_edges:
-            vq_set.add(qu)
-            vq_set.add(qv)
-        neighbor_vqs = sorted(vq_set)
-        vq_to_idx = {vq: i for i, vq in enumerate(neighbor_vqs)}
-
-        # Convert edges to index-based, dedup by summing weights
-        edge_map = {}
-        for qu, qv, ew in raw_edges:
-            iu, iv = vq_to_idx[qu], vq_to_idx[qv]
-            key = (min(iu, iv), max(iu, iv))
-            edge_map[key] = edge_map.get(key, 0.0) + ew
-
-        edges = [(iu, iv, w) for (iu, iv), w in edge_map.items()]
-        initial_pos = tuple(int(pi[vq]) for vq in neighbor_vqs)
-
-        return {
-            'neighbor_vqs': neighbor_vqs,
-            'initial_pos': initial_pos,
-            'edges': edges,
-            'weight': weight,
-        }
+        if best is None:
+            return [], list(pi)
+
+        _, _, u, v = best
+        path = self._bfs_shortest_path(int(pi[u]), int(pi[v]))
+        if len(path) < 2:
+            return [], list(pi)
+
+        k = len(path) - 1
+        m = k // 2
+        swaps = []
+        for i in range(m):
+            swaps.append((path[i], path[i + 1]))
+        for i in range(k, m + 1, -1):
+            swaps.append((path[i], path[i - 1]))
+
+        pi_new = self._apply_swaps_to_pi(pi, swaps)
+        return swaps, pi_new
 
     def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D):
         pi_initial = pi.copy()
@@ -1014,17 +883,22 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
         max_lookahead = self.config.get('max_lookahead', 4)
         E_W = self.config.get('E_weight', 0.5)
         E_alpha = self.config.get('E_alpha', 0.9)
-        E_overlap_floor = self.config.get('E_overlap_floor', 0.2)
 
-        neighbor_data = self._precompute_neighbor_data(scoring_partitions, reverse=False)
-        neighbor_weight = self.config.get('neighbor_weight', 0.5)
+        canonical_data = self._build_canonical_neighbor_data(scoring_partitions, reverse=False)
 
-        congestion_weight = self.config.get('congestion_weight', 0.1)
-        congestion_decay = self.config.get('congestion_decay', 0.9)
-        congestion = np.zeros(len(pi))
-        betweenness = getattr(self, '_betweenness', None)
+        valve_enabled = self.config.get('release_valve_enabled', True)
+        valve_threshold = self.config.get('release_valve_threshold', 20)
+        swaps_since_clean = 0
 
         while len(F) != 0:
+                if valve_enabled and swaps_since_clean > valve_threshold:
+                    valve_swaps, pi_bridged = self._release_valve(F, pi, D, canonical_data)
+                    if valve_swaps:
+                        partition_order.append(construct_swap_circuit(valve_swaps, len(pi)))
+                        pi = np.asarray(pi_bridged)
+                    swaps_since_clean = 0
+                    continue
+
                 partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
                 if len(partition_candidates) == 0:
                     break
@@ -1050,27 +924,13 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             E=E,
                             W=E_W,
                             alpha=E_alpha,
-                            neighbor_data=neighbor_data,
+                            canonical_data=canonical_data,
                             adj=self._adj,
-                            neighbor_info=self._compute_neighbor_info(
-                                partition_candidate, F_snapshot, E, neighbor_data, pi,
-                                alpha=E_alpha, weight=neighbor_weight,
-                            ),
-                            E_overlap_floor=E_overlap_floor,
-                            congestion=congestion,
-                            betweenness=betweenness,
-                            congestion_weight=congestion_weight,
                             local_cost_weight=self.config.get('local_cost_weight', 0.1),
                         )
                         for partition_candidate in partition_candidates
                     ]
-                min_partition_candidate = self._select_with_lookahead(
-                    partition_candidates, scores, pi, F,
-                    DAG, IDAG, resolved_partitions,
-                    optimized_partitions, scoring_partitions, D,
-                    E_W, E_alpha, E_overlap_floor,
-                    neighbor_data, neighbor_weight,
-                )
+                min_partition_candidate = self._select_best_candidate(partition_candidates, scores)
 
                 F.remove(min_partition_candidate.partition_idx)
                 resolved_partitions[min_partition_candidate.partition_idx] = True
@@ -1078,20 +938,14 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                 pbar.n = resolved_count
                 pbar.refresh()
 
-                neighbor_info = self._compute_neighbor_info(
-                    min_partition_candidate, F_snapshot, E, neighbor_data, pi,
-                    alpha=E_alpha, weight=neighbor_weight,
+                swap_order, pi = min_partition_candidate.transform_pi(
+                    pi, D, self._swap_cache, adj=self._adj,
                 )
-                swap_order, pi = min_partition_candidate.transform_pi(pi, D, self._swap_cache, adj=self._adj, neighbor_info=neighbor_info)
-                if len(swap_order)!=0:
+                if len(swap_order) != 0:
                     partition_order.append(construct_swap_circuit(swap_order, len(pi)))
-                    # Update congestion: increment for nodes used in SWAPs
-                    for p1, p2 in swap_order:
-                        congestion[p1] += 1.0
-                        congestion[p2] += 1.0
-
-                # Decay congestion each step
-                congestion *= congestion_decay
+                    swaps_since_clean += len(swap_order)
+                else:
+                    swaps_since_clean = 0
 
                 partition_order.append(min_partition_candidate)
                 children = list(DAG[min_partition_candidate.partition_idx])
@@ -1154,16 +1008,8 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
         max_lookahead = self.config.get('max_lookahead', 4)
         E_W = self.config.get('E_weight', 0.5)
         E_alpha = self.config.get('E_alpha', 0.9)
-        E_overlap_floor = self.config.get('E_overlap_floor', 0.2)
-
-        neighbor_data = self._precompute_neighbor_data(scoring_partitions, reverse=reverse)
-        neighbor_weight = self.config.get('neighbor_weight', 0.5)
 
-        congestion_weight = self.config.get('congestion_weight', 0.1)
-        congestion_decay = self.config.get('congestion_decay', 0.9)
-        N_layout = len(pi)
-        congestion = np.zeros(N_layout)
-        betweenness = getattr(self, '_betweenness', None)
+        canonical_data = self._build_canonical_neighbor_data(scoring_partitions, reverse=reverse)
 
         while F:
             partition_candidates = self.obtain_partition_candidates(F, optimized_partitions)
@@ -1186,45 +1032,22 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
                     self._swap_cache,
                     E=E, W=E_W, alpha=E_alpha,
                     reverse=reverse,
-                    neighbor_data=neighbor_data,
+                    canonical_data=canonical_data,
                     adj=self._adj,
-                    neighbor_info=self._compute_neighbor_info(
-                        pc, F_snapshot, E, neighbor_data, pi,
-                        alpha=E_alpha, weight=neighbor_weight,
-                    ),
-                    E_overlap_floor=E_overlap_floor,
-                    congestion=congestion,
-                    betweenness=betweenness,
-                    congestion_weight=congestion_weight,
                     local_cost_weight=self.config.get('local_cost_weight', 0.1),
                 )
                 for pc in partition_candidates
             ]
 
-            best = self._select_with_lookahead(
-                partition_candidates, scores, pi, F,
-                DAG, IDAG, resolved_partitions,
-                optimized_partitions, scoring_partitions, D,
-                E_W, E_alpha, E_overlap_floor,
-                neighbor_data, neighbor_weight,
-                reverse=reverse, rng=rng,
-            )
+            best = self._select_best_candidate(partition_candidates, scores, rng=rng)
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
-            neighbor_info = self._compute_neighbor_info(
-                best, F_snapshot, E, neighbor_data, pi,
-                alpha=E_alpha, weight=neighbor_weight,
+            swaps, pi = best.transform_pi(
+                pi, D, self._swap_cache, reverse=reverse, adj=self._adj,
             )
-            swaps, pi = best.transform_pi(pi, D, self._swap_cache, reverse=reverse, adj=self._adj, neighbor_info=neighbor_info)
             total_swaps += len(swaps)
 
-            # Update and decay congestion
-            for p1, p2 in swaps:
-                congestion[p1] += 1.0
-                congestion[p2] += 1.0
-            congestion *= congestion_decay
-
             # Promote children
             for child in DAG[best.partition_idx]:
                 if not resolved_partitions[child] and child not in F:
@@ -1283,185 +1106,107 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def _precompute_neighbor_data(scoring_partitions, reverse=False):
-        """Precompute resolved virtual qubit edges for all scoring partitions.
-
-        Returns a dict mapping partition_idx to (cnot_arr, q_u_arr, q_v_arr)
-        where arrays are padded numpy arrays for vectorized scoring.
-        Partitions that are None are skipped.
+    def _build_canonical_neighbor_data(scoring_partitions, reverse=False):
+        """Per partition, keep only the virtual-qubit edges of the lowest-CNOT
+        (mini_topology, P_i, P_o) combo — LightSABRE-style: assume each F/E
+        partition will be scheduled with its best combo.
+
+        Returns dict {partition_idx: {'edges_u': np.intp[n_edges],
+                                       'edges_v': np.intp[n_edges],
+                                       'cnot': int}}.
+        Partitions with no mini-topology edges have edges_u = edges_v = None.
         """
-        neighbor_data = {}
+        data = {}
         for idx, partition in enumerate(scoring_partitions):
             if partition is None:
                 continue
             qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
-            cnot_list = []
-            q_u_list = []
-            q_v_list = []
-            edge_counts = []
-
+            best_cnot = None
+            best_edges = None
             for tdx, mini_topology in enumerate(partition.mini_topologies):
                 for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
-                    cnot_list.append(len(partition.circuit_structures[tdx][pdx]))
+                    cnot = len(partition.circuit_structures[tdx][pdx])
+                    if best_cnot is not None and cnot >= best_cnot:
+                        continue
                     P_route = P_o if reverse else P_i
-                    eu = []
-                    ev = []
                     if mini_topology:
-                        for u, v in mini_topology:
-                            eu.append(qbit_map_inv[P_route[u]])
-                            ev.append(qbit_map_inv[P_route[v]])
-                    q_u_list.append(eu)
-                    q_v_list.append(ev)
-                    edge_counts.append(len(eu))
-
-            if not cnot_list:
+                        edges = [(qbit_map_inv[P_route[u]], qbit_map_inv[P_route[v]])
+                                 for u, v in mini_topology]
+                    else:
+                        edges = []
+                    best_cnot = cnot
+                    best_edges = edges
+            if best_cnot is None:
                 continue
-
-            n_combos = len(cnot_list)
-            max_edges = max(edge_counts)
-            cnot_arr = np.array(cnot_list, dtype=np.float64)
-
-            if max_edges > 0:
-                # Pad with 0: output_perm[0] maps to some physical qubit p,
-                # D[p][p] = 0, so max(0, 0-1) = 0 — padding contributes nothing.
-                q_u_arr = np.zeros((n_combos, max_edges), dtype=np.intp)
-                q_v_arr = np.zeros((n_combos, max_edges), dtype=np.intp)
-                for i in range(n_combos):
-                    ne = edge_counts[i]
-                    if ne > 0:
-                        q_u_arr[i, :ne] = q_u_list[i]
-                        q_v_arr[i, :ne] = q_v_list[i]
-                neighbor_data[idx] = (cnot_arr, q_u_arr, q_v_arr)
+            if best_edges:
+                eu = np.array([e[0] for e in best_edges], dtype=np.intp)
+                ev = np.array([e[1] for e in best_edges], dtype=np.intp)
             else:
-                neighbor_data[idx] = (cnot_arr, None, None)
-
-        return neighbor_data
+                eu = ev = None
+            data[idx] = {'edges_u': eu, 'edges_v': ev, 'cnot': best_cnot}
+        return data
 
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
-                                  neighbor_data=None, adj=None, neighbor_info=None,
-                                  E_overlap_floor=0.2,
-                                  congestion=None, betweenness=None, congestion_weight=0.0,
+                                  canonical_data=None, adj=None,
                                   local_cost_weight=0.1):
-        score = 0
-        swap_weight = 1
-        swaps, output_perm = partition_candidate.transform_pi(pi, D, swap_cache, reverse=reverse, adj=adj, neighbor_info=neighbor_info)
-        score += swap_weight * len(swaps) * 3
+        """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
+
+        H = 3 * |swaps|
+          + local_cost_weight * cand.cnot_count
+          + (1/|F'|) * average routing cost over F \\ {cand}
+          + (W/|E|)  * alpha^d-decayed routing cost over E
+        """
+        swaps, output_perm = partition_candidate.transform_pi(
+            pi, D, swap_cache, reverse=reverse, adj=adj, neighbor_info=None,
+        )
+        score = 3.0 * len(swaps)
         score += local_cost_weight * partition_candidate.cnot_count
 
-        # Congestion penalty: penalize SWAP paths through congested bottleneck nodes
-        if congestion is not None and betweenness is not None and congestion_weight > 0 and swaps:
-            cong_penalty = 0.0
-            for p1, p2 in swaps:
-                cong_penalty += congestion[p1] * betweenness[p1]
-                cong_penalty += congestion[p2] * betweenness[p2]
-            score += congestion_weight * cong_penalty
+        if canonical_data is None:
+            return score
 
-        if neighbor_data is not None:
-            output_perm_arr = np.asarray(output_perm, dtype=np.intp)
-            D_arr = np.asarray(D)
+        output_perm_arr = np.asarray(output_perm, dtype=np.intp)
+        D_arr = np.asarray(D)
+        cand_idx = partition_candidate.partition_idx
 
-            for partition_idx in F:
-                if partition_idx == partition_candidate.partition_idx:
+        # Basic component: average dist over F \ {cand}
+        f_sum = 0.0
+        n_other = 0
+        for partition_idx in F:
+            if partition_idx == cand_idx:
+                continue
+            entry = canonical_data.get(partition_idx)
+            if entry is None:
+                continue
+            n_other += 1
+            eu = entry['edges_u']
+            if eu is None:
+                continue
+            phys_u = output_perm_arr[eu]
+            phys_v = output_perm_arr[entry['edges_v']]
+            f_sum += 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+        if n_other > 0:
+            score += f_sum / n_other
+
+        # Lookahead component: alpha^depth-decayed average over E
+        if E:
+            e_sum = 0.0
+            for partition_idx, depth in E:
+                if partition_idx == cand_idx:
                     continue
-                entry = neighbor_data.get(partition_idx)
+                entry = canonical_data.get(partition_idx)
                 if entry is None:
                     continue
-                cnot_arr, q_u_arr, q_v_arr = entry
-                if q_u_arr is not None:
-                    phys_u = output_perm_arr[q_u_arr]
-                    phys_v = output_perm_arr[q_v_arr]
-                    routing = 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum(axis=1)
-                    score += float((routing + cnot_arr).min())
-                else:
-                    score += float(cnot_arr.min())
-
-            if E:
-                e_score = 0.0
-                cand_qubits = set(partition_candidate.involved_qbits)
-                for partition_idx, depth in E:
-                    if partition_idx == partition_candidate.partition_idx:
-                        continue
-                    entry = neighbor_data.get(partition_idx)
-                    if entry is None:
-                        continue
-                    # Overlap-aware decay: partitions sharing qubits with
-                    # the candidate are weighted more heavily.
-                    e_part = scoring_partitions[partition_idx]
-                    if e_part is not None and e_part.involved_qbits:
-                        e_qubits = set(e_part.involved_qbits)
-                        overlap = len(cand_qubits & e_qubits)
-                        relevance = overlap / len(e_qubits)
-                    else:
-                        relevance = 0.0
-                    decay = (alpha ** depth) * (E_overlap_floor + (1 - E_overlap_floor) * relevance)
-                    cnot_arr, q_u_arr, q_v_arr = entry
-                    if q_u_arr is not None:
-                        phys_u = output_perm_arr[q_u_arr]
-                        phys_v = output_perm_arr[q_v_arr]
-                        routing = 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum(axis=1)
-                        e_score += float((routing + cnot_arr).min()) * decay
-                    else:
-                        e_score += float(cnot_arr.min()) * decay
-                if len(E) > 0:
-                    score += W * e_score / len(E)
-        else:
-            # Fallback: original Python loop (no precomputed data)
-            for partition_idx in F:
-                partition = scoring_partitions[partition_idx]
-                if partition is None or partition_idx == partition_candidate.partition_idx:
+                eu = entry['edges_u']
+                if eu is None:
                     continue
-                qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
-                mini_scores = []
-                for tdx, mini_topology in enumerate(partition.mini_topologies):
-                    for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
-                        cnot_count = len(partition.circuit_structures[tdx][pdx])
-                        P_route = P_o if reverse else P_i
-                        if mini_topology:
-                            routing_cost = swap_weight * 3 * sum(
-                                max(0, D[int(output_perm[qbit_map_inv[P_route[u]]])][int(output_perm[qbit_map_inv[P_route[v]]])] - 1)
-                                for u, v in mini_topology
-                            )
-                        else:
-                            routing_cost = 0
-                        mini_scores.append(routing_cost + cnot_count)
-                if mini_scores:
-                    score += min(mini_scores)
-
-            if E:
-                e_score = 0
-                cand_qubits = set(partition_candidate.involved_qbits)
-                for partition_idx, depth in E:
-                    partition = scoring_partitions[partition_idx]
-                    if partition is None or partition_idx == partition_candidate.partition_idx:
-                        continue
-                    # Overlap-aware decay
-                    if partition.involved_qbits:
-                        e_qubits = set(partition.involved_qbits)
-                        overlap = len(cand_qubits & e_qubits)
-                        relevance = overlap / len(e_qubits)
-                    else:
-                        relevance = 0.0
-                    decay = (alpha ** depth) * (E_overlap_floor + (1 - E_overlap_floor) * relevance)
-                    qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
-                    mini_scores = []
-                    for tdx, mini_topology in enumerate(partition.mini_topologies):
-                        for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
-                            cnot_count = len(partition.circuit_structures[tdx][pdx])
-                            P_route = P_o if reverse else P_i
-                            if mini_topology:
-                                routing_cost = swap_weight * 3 * sum(
-                                    max(0, D[int(output_perm[qbit_map_inv[P_route[u]]])][int(output_perm[qbit_map_inv[P_route[v]]])] - 1)
-                                    for u, v in mini_topology
-                                )
-                            else:
-                                routing_cost = 0
-                            mini_scores.append(routing_cost + cnot_count)
-                    if mini_scores:
-                        e_score += min(mini_scores) * decay
-                if len(E) > 0:
-                    score += W * e_score / len(E)
+                phys_u = output_perm_arr[eu]
+                phys_v = output_perm_arr[entry['edges_v']]
+                d_cost = 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+                e_sum += (alpha ** depth) * d_cost
+            score += W * e_sum / len(E)
 
         return score
 
@@ -1633,42 +1378,7 @@ def compute_distances_bfs(self, N):
         # Store adjacency list for reuse by A* routing
         self._adj = [list(adj[i]) for i in range(N)]
 
-        # Compute betweenness centrality for congestion-aware scoring.
-        # Brandes' algorithm adapted for unweighted BFS graphs: O(N * E).
-        bc = np.zeros(N)
-        for s in range(N):
-            # BFS from s
-            S = []  # stack of nodes in order of non-decreasing distance
-            P = [[] for _ in range(N)]  # predecessors on shortest paths
-            sigma = np.zeros(N)  # number of shortest paths from s
-            sigma[s] = 1.0
-            d = np.full(N, -1)
-            d[s] = 0
-            Q = deque([s])
-            while Q:
-                v = Q.popleft()
-                S.append(v)
-                for w in adj[v]:
-                    if d[w] < 0:
-                        Q.append(w)
-                        d[w] = d[v] + 1
-                    if d[w] == d[v] + 1:
-                        sigma[w] += sigma[v]
-                        P[w].append(v)
-            delta = np.zeros(N)
-            while S:
-                w = S.pop()
-                for v in P[w]:
-                    delta[v] += (sigma[v] / sigma[w]) * (1.0 + delta[w])
-                if w != s:
-                    bc[w] += delta[w]
-        # Normalize to [0, 1]
-        max_bc = bc.max()
-        if max_bc > 0:
-            bc /= max_bc
-        self._betweenness = bc
-
-        return D #multiply by 3 to make it CNOT cost instead of SWAP cost
+        return D
 
 
     def generate_DAG_levels(self, circuit):

From d2f24c2911e7536132a6c5b3916b8b312c645b25 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 19 Apr 2026 03:10:34 +0200
Subject: [PATCH 119/232] Add VF2Layout/SabrePreLayout seeded layouts and
 configurable swap cost
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement LightSABRE §II.3 initial layout seeding using rustworkx VF2
subgraph isomorphism: VF2Layout for exact embeddings, SabrePreLayout
with distance-2 augmented topology for near-perfect ones, and greedy
weighted-distance fallback when VF2 fails. Seed trial 0 with the
computed layout; remaining trials stay random.

Make swap_cost configurable (default 15.0, up from hardcoded 3.0) to
more aggressively penalize SWAPs in the scoring function, reducing
post-cleanup CNOT counts on constrained topologies.
---
 squander/synthesis/PartAM.py | 214 +++++++++++++++++++++++++++++++++--
 1 file changed, 204 insertions(+), 10 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index f60778448..4dd0d9aa5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -578,6 +578,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
 
         D = self.compute_distances_bfs(N)
         scoring_partitions = self._build_scoring_partitions(optimized_partitions)
+        seeded_pi = self._compute_seeded_layout(optimized_partitions, D, N, circ)
 
         n_iterations = self.config.get('sabre_iterations', 1)
         n_trials = self.config.get('n_layout_trials', 1)
@@ -585,10 +586,10 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
         do_cleanup = self.config.get('cleanup', True)
         routing_start = time.time()
         if n_iterations == 0:
-            # Single forward pass from identity layout
+            # Single forward pass from seeded layout
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
             partition_order, pi, pi_initial = self.Heuristic_Search(
-                F, pi=np.arange(N), DAG=DAG, IDAG=IDAG,
+                F, pi=seeded_pi.copy(), DAG=DAG, IDAG=IDAG,
                 optimized_partitions=optimized_partitions,
                 scoring_partitions=scoring_partitions, D=D,
             )
@@ -612,7 +613,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
 
                 for trial in range(max(1, n_trials)):
                     rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
-                    pi = np.arange(N)
+                    pi = seeded_pi.copy() if trial == 0 else np.arange(N)
 
                     for iteration in range(n_iterations):
                         # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
@@ -667,7 +668,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
 
                 for trial in range(max(1, n_trials)):
                     rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
-                    pi = np.arange(N)
+                    pi = seeded_pi.copy() if trial == 0 else np.arange(N)
 
                     for iteration in range(n_iterations):
                         # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
@@ -751,8 +752,9 @@ def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=Fals
         if len(partition_candidates) <= top_k:
             return partition_candidates
         local_cost_weight = self.config.get('local_cost_weight', 0.1)
+        swap_cost = self.config.get('swap_cost', 15.0)
         estimates = np.array([
-            pc.estimate_swap_count(pi, D, reverse=reverse) * 3 + local_cost_weight * pc.cnot_count
+            pc.estimate_swap_count(pi, D, reverse=reverse) * swap_cost + local_cost_weight * pc.cnot_count
             for pc in partition_candidates
         ])
         top_k_indices = np.argpartition(estimates, top_k)[:top_k]
@@ -927,6 +929,7 @@ def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_parti
                             canonical_data=canonical_data,
                             adj=self._adj,
                             local_cost_weight=self.config.get('local_cost_weight', 0.1),
+                            swap_cost=self.config.get('swap_cost', 15.0),
                         )
                         for partition_candidate in partition_candidates
                     ]
@@ -1035,6 +1038,7 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
                     canonical_data=canonical_data,
                     adj=self._adj,
                     local_cost_weight=self.config.get('local_cost_weight', 0.1),
+                    swap_cost=self.config.get('swap_cost', 15.0),
                 )
                 for pc in partition_candidates
             ]
@@ -1150,10 +1154,10 @@ def _build_canonical_neighbor_data(scoring_partitions, reverse=False):
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
                                   canonical_data=None, adj=None,
-                                  local_cost_weight=0.1):
+                                  local_cost_weight=0.1, swap_cost=15.0):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
-        H = 3 * |swaps|
+        H = swap_cost * |swaps|
           + local_cost_weight * cand.cnot_count
           + (1/|F'|) * average routing cost over F \\ {cand}
           + (W/|E|)  * alpha^d-decayed routing cost over E
@@ -1161,7 +1165,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
         swaps, output_perm = partition_candidate.transform_pi(
             pi, D, swap_cache, reverse=reverse, adj=adj, neighbor_info=None,
         )
-        score = 3.0 * len(swaps)
+        score = swap_cost * len(swaps)
         score += local_cost_weight * partition_candidate.cnot_count
 
         if canonical_data is None:
@@ -1186,7 +1190,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 continue
             phys_u = output_perm_arr[eu]
             phys_v = output_perm_arr[entry['edges_v']]
-            f_sum += 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+            f_sum += swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
         if n_other > 0:
             score += f_sum / n_other
 
@@ -1204,7 +1208,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                     continue
                 phys_u = output_perm_arr[eu]
                 phys_v = output_perm_arr[entry['edges_v']]
-                d_cost = 3.0 * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+                d_cost = swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
                 e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
 
@@ -1380,6 +1384,196 @@ def compute_distances_bfs(self, N):
 
         return D
 
+    def _compute_seeded_layout(self, optimized_partitions, D, N, circ):
+        """VF2Layout + SabrePreLayout seeded initial layout (LightSABRE §II.3).
+
+        The interaction graph is built from the circuit's two-qubit gate pairs
+        (matching the paper's gate-level approach), not from partition cliques.
+        Partition-level weights are used only for the greedy fallback.
+
+        Steps:
+        1. VF2Layout: subgraph isomorphism of gate interaction graph into
+           hardware topology.  If a mapping exists, every gate qubit pair
+           lands on adjacent physical qubits (zero SWAPs).
+        2. SabrePreLayout: augment topology with distance-d edges (d=2),
+           retry VF2 — handles "almost perfect" embeddings.
+        3. Fallback: greedy weighted-distance placement from partition weights.
+        """
+        from collections import defaultdict
+        from squander.synthesis.PartAM_utils import PartitionSynthesisResult, SingleQubitPartitionResult
+
+        if not self.topology:
+            return np.arange(N)
+
+        # --- build gate-level interaction graph from circuit CNOT pairs ---
+        gate_edges = set()
+        for g in circ.get_Gates():
+            gname = str(type(g).__name__)
+            if 'CNOT' in gname or 'CX' in gname:
+                ctrl = g.get_Control_Qbit()
+                tgt = g.get_Target_Qbit()
+                gate_edges.add((min(ctrl, tgt), max(ctrl, tgt)))
+
+        if not gate_edges:
+            return np.arange(N)
+
+        # --- try rustworkx VF2 approaches ---
+        try:
+            import rustworkx as rx
+        except ImportError:
+            return self._greedy_seeded_layout(optimized_partitions, D, N)
+
+        G_int = rx.PyGraph()
+        G_int.add_nodes_from(range(N))
+        for u, v in gate_edges:
+            G_int.add_edge(u, v, None)
+
+        G_hw = rx.PyGraph()
+        G_hw.add_nodes_from(range(N))
+        for u, v in self.topology:
+            G_hw.add_edge(u, v, None)
+
+        # Step 1: VF2Layout — exact subgraph isomorphism
+        pi = self._try_vf2_layout(G_int, G_hw, N)
+        if pi is not None:
+            return pi
+
+        # Step 2: SabrePreLayout — augment topology with distance-2 edges
+        G_aug = rx.PyGraph()
+        G_aug.add_nodes_from(range(N))
+        seen = set()
+        for u, v in self.topology:
+            G_aug.add_edge(u, v, None)
+            seen.add((min(u, v), max(u, v)))
+        for i in range(N):
+            for j in range(i + 1, N):
+                if (i, j) not in seen and D[i][j] <= 2:
+                    G_aug.add_edge(i, j, None)
+                    seen.add((i, j))
+
+        pi = self._try_vf2_layout(G_int, G_aug, N)
+        if pi is not None:
+            return pi
+
+        # Step 3: greedy fallback using partition-level weights
+        return self._greedy_seeded_layout(optimized_partitions, D, N)
+
+    def _try_vf2_layout(self, G_int, G_hw, N):
+        """Try VF2 subgraph isomorphism of G_int into G_hw.
+
+        Returns pi (logical->physical mapping) or None if no embedding exists.
+        Uses induced=False to allow non-edges in the interaction graph to
+        correspond to edges in the hardware graph (monotone subgraph iso).
+        """
+        import rustworkx as rx
+
+        try:
+            vf2_iter = rx.vf2_mapping(G_hw, G_int, subgraph=True, induced=False)
+            mapping = next(vf2_iter)  # {hw_node: int_node}
+        except StopIteration:
+            return None
+
+        # Invert: pi[logical_q] = physical_q
+        pi = np.zeros(N, dtype=int)
+        inv = {v: k for k, v in mapping.items()}
+        used = set(inv.values())
+        free = [p for p in range(N) if p not in used]
+        fi = 0
+        for q in range(N):
+            if q in inv:
+                pi[q] = inv[q]
+            else:
+                pi[q] = free[fi]
+                fi += 1
+        return pi
+
+    def _greedy_seeded_layout(self, optimized_partitions, D, N):
+        """Greedy weighted-distance placement (fallback when VF2 fails)."""
+        from collections import defaultdict
+        from squander.synthesis.PartAM_utils import PartitionSynthesisResult, SingleQubitPartitionResult
+
+        # Build interaction weights from partitions
+        interaction_weight = defaultdict(float)
+        for partition in optimized_partitions:
+            if isinstance(partition, SingleQubitPartitionResult):
+                continue
+            if not isinstance(partition, PartitionSynthesisResult):
+                continue
+            involved = list(partition.involved_qbits)
+            if len(involved) < 2:
+                continue
+            best_cnot = float('inf')
+            for tdx in range(len(partition.cnot_counts)):
+                if not partition.cnot_counts[tdx]:
+                    continue
+                cnot_min = min(partition.cnot_counts[tdx])
+                if cnot_min < best_cnot:
+                    best_cnot = cnot_min
+            if best_cnot == float('inf'):
+                continue
+            for i in range(len(involved)):
+                for j in range(i + 1, len(involved)):
+                    key = (min(involved[i], involved[j]),
+                           max(involved[i], involved[j]))
+                    interaction_weight[key] += best_cnot
+
+        if not interaction_weight:
+            return np.arange(N)
+
+        pi = np.arange(N)
+        placed_logical = set()
+        placed_physical = set()
+
+        (q1, q2), _ = max(interaction_weight.items(), key=lambda x: x[1])
+        p1, p2 = self.topology[0]
+
+        holder1 = np.where(pi == p1)[0][0]
+        pi[q1], pi[holder1] = p1, pi[q1]
+        holder2 = np.where(pi == p2)[0][0]
+        pi[q2], pi[holder2] = p2, pi[q2]
+        placed_logical.update([q1, q2])
+        placed_physical.update([p1, p2])
+
+        remaining = [q for q in range(N) if q not in placed_logical]
+
+        def _score(q):
+            return sum(
+                interaction_weight.get((min(q, pq), max(q, pq)), 0.0)
+                for pq in placed_logical
+            )
+
+        remaining.sort(key=_score, reverse=True)
+
+        for logical_q in remaining:
+            best_physical = None
+            best_dist = float('inf')
+
+            for physical_q in range(N):
+                if physical_q in placed_physical:
+                    continue
+
+                total_dist = 0.0
+                total_w = 0.0
+                for other_q in placed_logical:
+                    key = (min(logical_q, other_q), max(logical_q, other_q))
+                    w = interaction_weight.get(key, 0.0)
+                    if w > 0:
+                        total_dist += D[physical_q][pi[other_q]] * w
+                        total_w += w
+
+                avg = total_dist / total_w if total_w > 0 else 0.0
+                if avg < best_dist:
+                    best_dist = avg
+                    best_physical = physical_q
+
+            if best_physical is not None:
+                holder = np.where(pi == best_physical)[0][0]
+                pi[logical_q], pi[holder] = best_physical, pi[logical_q]
+                placed_logical.add(logical_q)
+                placed_physical.add(best_physical)
+
+        return pi
+
 
     def generate_DAG_levels(self, circuit):
         """

From d25c85b2f713bea8b7d2e2dc331d2c3e45f494cb Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 19 Apr 2026 13:22:53 +0200
Subject: [PATCH 120/232] Only clean up top k trials during routing

---
 squander/synthesis/PartAM.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 4dd0d9aa5..c4405bfa4 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -91,6 +91,7 @@ def __init__(self, config):
         self.config.setdefault('random_seed', 42)
         self.config.setdefault('cleanup', True)
         self.config.setdefault('prefilter_top_k', 50)
+        self.config.setdefault('cleanup_top_k', 3)
         strategy = self.config['strategy']
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
@@ -608,8 +609,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 saved_sq_circuits = {i: p.circuit for i, p in enumerate(optimized_partitions)
                                      if isinstance(p, SingleQubitPartitionResult)}
 
-                best_circuit = best_params = best_pi_init = best_pi = None
-                best_cost = float('inf')
+                trial_results = []  # (pre_cleanup_cnots, circuit, params, pi_init, pi_out)
 
                 for trial in range(max(1, n_trials)):
                     rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
@@ -642,21 +642,30 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                         F_trial, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
                     )
 
-                    # Build circuit + cleanup
                     trial_circuit, trial_params = self.Construct_circuit_from_HS(
                         partition_order, optimized_partitions, N,
                     )
                     pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0)
-                    trial_circuit, trial_params = wco.OptimizeWideCircuit(
-                        trial_circuit.get_Flat_Circuit(), trial_params
-                    )
+                    trial_results.append((pre_cleanup_cnots, trial_circuit, trial_params, pi_init, pi_out))
 
-                    cost = trial_circuit.get_Gate_Nums().get('CNOT', 0)
+                # Apply cleanup only to the top-k candidates by pre-cleanup CNOT count
+                cleanup_top_k = self.config.get('cleanup_top_k', 3)
+                trial_results.sort(key=lambda x: x[0])
+                top_k_results = trial_results[:cleanup_top_k]
 
+                best_circuit = best_params = best_pi_init = best_pi = None
+                best_cost = float('inf')
+                best_pre_cleanup = None
+
+                for pre_cleanup_cnots, trial_circuit, trial_params, pi_init, pi_out in top_k_results:
+                    cleaned_circuit, cleaned_params = wco.OptimizeWideCircuit(
+                        trial_circuit.get_Flat_Circuit(), trial_params
+                    )
+                    cost = cleaned_circuit.get_Gate_Nums().get('CNOT', 0)
                     if cost < best_cost:
                         best_cost = cost
                         best_pre_cleanup = pre_cleanup_cnots
-                        best_circuit, best_params = trial_circuit, trial_params
+                        best_circuit, best_params = cleaned_circuit, cleaned_params
                         best_pi_init, best_pi = pi_init, pi_out
 
                 final_circuit, final_parameters = best_circuit, best_params

From be457dc38495daedaff41b56fc8f0000489fb433 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 19 Apr 2026 13:31:22 +0200
Subject: [PATCH 121/232] Fall back on qiskit routing if synthesis fails

---
 squander/synthesis/PartAM.py | 129 ++++++++++++++++++++++++++---------
 1 file changed, 97 insertions(+), 32 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index c4405bfa4..6df859d72 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -180,6 +180,40 @@ def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                     result.add_result((new_P_i, new_P_o), new_circ, new_params, topology_idx)
                     known_pairs[pair_key].add((new_P_i, new_P_o))
 
+    @staticmethod
+    def _qiskit_routing_fallback(meta, mini_topology):
+        """Route original partition circuit on mini_topology using Qiskit transpiler.
+
+        Called when unitary synthesis fails to reach tolerance.  Routes the
+        original (un-permuted) circuit and returns it with identity P_i/P_o.
+        Returns (circuit, params) or (None, None) if Qiskit is unavailable or
+        routing fails.
+        """
+        try:
+            from squander.IO_interfaces.Qiskit_IO import get_Qiskit_Circuit, convert_Qiskit_to_Squander
+            from qiskit.compiler import transpile
+            from qiskit.transpiler import CouplingMap
+        except ImportError:
+            return None, None
+
+        try:
+            qk_circ = get_Qiskit_Circuit(meta['circuit'], meta['params'])
+            edges = []
+            for u, v in mini_topology:
+                edges.append([u, v])
+                edges.append([v, u])
+            coupling_map = CouplingMap(couplinglist=edges)
+            qk_routed = transpile(
+                qk_circ,
+                coupling_map=coupling_map,
+                optimization_level=1,
+                basis_gates=['cx', 'u3'],
+            )
+            return convert_Qiskit_to_Squander(qk_routed)
+        except Exception as exc:
+            logging.warning("Qiskit routing fallback failed: %s", exc)
+            return None, None
+
     def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[PartitionScoreData]]:
         """
         Create lightweight, picklable views of partitions that contain only the
@@ -276,13 +310,12 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
             logging.warning(
                 "DecomposePartition_and_Perm: %d-qubit partition on topology %s "
                 "did not reach tolerance %.2e after %d retries "
-                "(best error %.2e). Returning best attempt; final circuit error "
-                "may be elevated.",
+                "(best error %.2e). Will attempt Qiskit routing fallback.",
                 N, list(mini_topology) if mini_topology else None,
                 tolerance, max_retries, best_err,
             )
 
-        return best_circuit, best_params
+        return best_circuit, best_params, best_err
 
     # ------------------------------------------------------------------------
     # Circuit Synthesis
@@ -472,13 +505,14 @@ def _run_parallel_synthesis(self, partition_meta):
                 N = meta['N']
                 P_o_initial = stage1_P_o[(partition_idx, topology_idx)]
                 mini_topology = meta['mini_topologies'][topology_idx]
-                synth_circuit, synth_params = decomp_cache[ck]
-                pair_key = (partition_idx, topology_idx)
-                self._add_result_with_auts(
-                    results_map[partition_idx], (P_i, P_o_initial),
-                    synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
-                )
+                synth_circuit, synth_params, synth_err = decomp_cache[ck]
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (P_i, P_o_initial),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
 
             # Collect Stage 1 pool results
             cache_hits_s1 = len(stage1_cached)
@@ -486,18 +520,19 @@ def _run_parallel_synthesis(self, partition_meta):
                 stage1_futures, desc=f"Stage 1 Synthesis ({cache_hits_s1} cached)",
                 disable=disable_pbar
             ):
-                synth_circuit, synth_params = future.get()
-                decomp_cache[ck] = (synth_circuit, synth_params)
+                synth_circuit, synth_params, synth_err = future.get()
+                decomp_cache[ck] = (synth_circuit, synth_params, synth_err)
                 meta = partition_meta[partition_idx]
                 N = meta['N']
                 P_o_initial = stage1_P_o[(partition_idx, topology_idx)]
                 mini_topology = meta['mini_topologies'][topology_idx]
-                pair_key = (partition_idx, topology_idx)
-                self._add_result_with_auts(
-                    results_map[partition_idx], (P_i, P_o_initial),
-                    synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
-                )
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (P_i, P_o_initial),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
 
             # ---- Stage 2: fix top-k P_i from Stage 1, sweep all P_o ----
             top_k_pi = self.config.get('top_k_pi', 1)
@@ -532,13 +567,14 @@ def _run_parallel_synthesis(self, partition_meta):
                 meta = partition_meta[partition_idx]
                 N = meta['N']
                 mini_topology = meta['mini_topologies'][topology_idx]
-                synth_circuit, synth_params = decomp_cache[ck]
-                pair_key = (partition_idx, topology_idx)
-                self._add_result_with_auts(
-                    results_map[partition_idx], (tuple(P_i_cand), P_o),
-                    synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
-                )
+                synth_circuit, synth_params, synth_err = decomp_cache[ck]
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (tuple(P_i_cand), P_o),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
 
             # Collect Stage 2 pool results
             cache_hits_s2 = len(stage2_cached)
@@ -546,16 +582,45 @@ def _run_parallel_synthesis(self, partition_meta):
                 stage2_futures, desc=f"Stage 2 Synthesis ({cache_hits_s2} cached)",
                 disable=disable_pbar
             ):
-                synth_circuit, synth_params = future.get()
-                decomp_cache[ck] = (synth_circuit, synth_params)
+                synth_circuit, synth_params, synth_err = future.get()
+                decomp_cache[ck] = (synth_circuit, synth_params, synth_err)
                 meta = partition_meta[partition_idx]
                 N = meta['N']
                 mini_topology = meta['mini_topologies'][topology_idx]
-                pair_key = (partition_idx, topology_idx)
-                self._add_result_with_auts(
-                    results_map[partition_idx], (tuple(P_i_cand), P_o),
-                    synth_circuit, synth_params, topology_idx,
-                    N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                if synth_err <= self.config['tolerance']:
+                    pair_key = (partition_idx, topology_idx)
+                    self._add_result_with_auts(
+                        results_map[partition_idx], (tuple(P_i_cand), P_o),
+                        synth_circuit, synth_params, topology_idx,
+                        N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
+                    )
+
+        # Qiskit routing fallback: for any (partition, topology) pair where all
+        # synthesis attempts failed (no results stored), route the original circuit
+        # with Qiskit and add the result with identity P_i/P_o permutations.
+        qiskit_fallback_cache = {}
+        for partition_idx, meta in enumerate(partition_meta):
+            if meta is None:
+                continue
+            N = meta['N']
+            for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
+                if results_map[partition_idx].permutations_pairs[topology_idx]:
+                    continue
+                fkey = (partition_idx, topology_idx)
+                if fkey not in qiskit_fallback_cache:
+                    fb_circuit, fb_params = self._qiskit_routing_fallback(meta, mini_topology)
+                    qiskit_fallback_cache[fkey] = (fb_circuit, fb_params)
+                fb_circuit, fb_params = qiskit_fallback_cache[fkey]
+                if fb_circuit is None:
+                    logging.warning(
+                        "Partition %d topology_idx %d: synthesis failed and Qiskit "
+                        "fallback unavailable; no result for this combination.",
+                        partition_idx, topology_idx,
+                    )
+                    continue
+                identity = tuple(range(N))
+                results_map[partition_idx].add_result(
+                    (identity, identity), fb_circuit, fb_params, topology_idx
                 )
 
         return results_map

From 732d1e567c18a765736e1c3a8aef73784624fb1a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 19 Apr 2026 13:33:06 +0200
Subject: [PATCH 122/232] Remove priting

---
 squander/synthesis/PartAM.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6df859d72..cdddfee18 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -305,16 +305,6 @@ def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology =
             if best_err <= tolerance:
                 break
 
-        if best_err > tolerance:
-            N = int(np.log2(Umtx.shape[0]))
-            logging.warning(
-                "DecomposePartition_and_Perm: %d-qubit partition on topology %s "
-                "did not reach tolerance %.2e after %d retries "
-                "(best error %.2e). Will attempt Qiskit routing fallback.",
-                N, list(mini_topology) if mini_topology else None,
-                tolerance, max_retries, best_err,
-            )
-
         return best_circuit, best_params, best_err
 
     # ------------------------------------------------------------------------

From b3aaf5ef0e949d3bd377bf2c52c9e28f0a9ac383 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 21 Apr 2026 15:17:32 +0200
Subject: [PATCH 123/232] speed up routing

---
 squander/synthesis/PartAM.py | 881 +++++++++++++++++++++++++----------
 1 file changed, 633 insertions(+), 248 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index cdddfee18..8456b7951 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -54,7 +54,52 @@ def _decompose_one(Umtx, mini_topology):
     construct_swap_circuit,
 )
 
+_routing_worker_state = None
 
+
+def _init_layout_trial_worker(state):
+    global _routing_worker_state
+    from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping
+
+    worker_config = dict(state["config"])
+    worker_config["progressbar"] = False
+
+    mapper = qgd_Partition_Aware_Mapping(worker_config)
+    mapper._adj = [list(neighbors) for neighbors in state["adj"]]
+    mapper._swap_cache = {}
+
+    _routing_worker_state = {
+        "mapper": mapper,
+        "seeded_pi": np.asarray(state["seeded_pi"]),
+        "DAG": state["DAG"],
+        "IDAG": state["IDAG"],
+        "layout_partitions": state["layout_partitions"],
+        "scoring_partitions": state["scoring_partitions"],
+        "D": np.asarray(state["D"]),
+        "candidate_cache": state["candidate_cache"],
+        "n_iterations": state["n_iterations"],
+        "n_trials": state["n_trials"],
+        "random_seed": state["random_seed"],
+    }
+
+
+def _run_layout_trial_worker(trial_idx):
+    state = _routing_worker_state
+    mapper = state["mapper"]
+
+    return mapper._run_single_layout_trial(
+        trial_idx=trial_idx,
+        seeded_pi=state["seeded_pi"],
+        DAG=state["DAG"],
+        IDAG=state["IDAG"],
+        layout_partitions=state["layout_partitions"],
+        scoring_partitions=state["scoring_partitions"],
+        D=state["D"],
+        candidate_cache=state["candidate_cache"],
+        n_iterations=state["n_iterations"],
+        n_trials=state["n_trials"],
+        random_seed=state["random_seed"],
+    )
 # ============================================================================
 # Main Class: qgd_Partition_Aware_Mapping
 # ============================================================================
@@ -93,6 +138,8 @@ def __init__(self, config):
         self.config.setdefault('prefilter_top_k', 50)
         self.config.setdefault('cleanup_top_k', 3)
         strategy = self.config['strategy']
+        self.config.setdefault('parallel_layout_trials', False)
+        self.config.setdefault('layout_trial_workers', 0)
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
             raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
@@ -261,7 +308,75 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
                 )
             )
         return scoring_partitions
+    @staticmethod
+    def _partition_is_single(partition):
+        if isinstance(partition, dict):
+            return partition.get("is_single", False)
+        return isinstance(partition, SingleQubitPartitionResult)
+
+
+    @staticmethod
+    def _partition_involved_qbits(partition):
+        if isinstance(partition, dict):
+            return partition["involved_qbits"]
+        return partition.involved_qbits
+
+
+    @staticmethod
+    def _build_layout_partition_info(optimized_partitions):
+        return [
+            {
+                "is_single": isinstance(
+                    partition, SingleQubitPartitionResult
+                ),
+                "involved_qbits": tuple(partition.involved_qbits),
+            }
+            for partition in optimized_partitions
+        ]
+    def _build_partition_candidate_cache(self, scoring_partitions):
+        """
+        Precompute all PartitionCandidate objects once, grouped by partition_idx.
+
+        Returns:
+            tuple where candidate_cache[partition_idx] is a tuple of
+            PartitionCandidate objects for that partition. Single-qubit
+            partitions get an empty tuple.
+        """
+        candidate_cache = []
+
+        for partition_idx, partition in enumerate(scoring_partitions):
+            if partition is None:
+                candidate_cache.append(())
+                continue
+
+            cached_candidates = []
+            for tdx, mini_topology in enumerate(partition.mini_topologies):
+                topology_candidates = partition.topology_candidates[tdx]
+                permutation_pairs = partition.permutations_pairs[tdx]
+                circuit_structures = partition.circuit_structures[tdx]
+
+                for topology_candidate in topology_candidates:
+                    for pdx, permutation_pair in enumerate(permutation_pairs):
+                        circuit_structure = circuit_structures[pdx]
+                        cached_candidates.append(
+                            PartitionCandidate(
+                                partition_idx,
+                                tdx,
+                                pdx,
+                                circuit_structure,
+                                permutation_pair[0],
+                                permutation_pair[1],
+                                topology_candidate,
+                                mini_topology,
+                                partition.qubit_map,
+                                partition.involved_qbits,
+                                cnot_count=len(circuit_structure),
+                            )
+                        )
+
+            candidate_cache.append(tuple(cached_candidates))
 
+        return tuple(candidate_cache)
     # ------------------------------------------------------------------------
     # Partition Decomposition Methods
     # ------------------------------------------------------------------------
@@ -618,10 +733,139 @@ def _run_parallel_synthesis(self, partition_meta):
     # ------------------------------------------------------------------------
     # Main Public API
     # ------------------------------------------------------------------------
+    def _run_single_layout_trial(
+        self,
+        trial_idx,
+        seeded_pi,
+        DAG,
+        IDAG,
+        layout_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        n_iterations,
+        n_trials,
+        random_seed,
+    ):
+        N = len(seeded_pi)
+        rng = (
+            np.random.RandomState(random_seed + trial_idx)
+            if n_trials > 1
+            else None
+        )
+        pi = seeded_pi.copy() if trial_idx == 0 else np.arange(N)
+
+        for iteration in range(n_iterations):
+            F_rev = self.get_final_layer(DAG, N, layout_partitions)
+            pi, _ = self._heuristic_search_layout_only(
+                F_rev,
+                pi,
+                IDAG,
+                DAG,
+                layout_partitions,
+                scoring_partitions,
+                D,
+                rng=rng,
+                reverse=True,
+                candidate_cache=candidate_cache,
+            )
 
-    def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
-        N = circ.get_Qbit_Num()
+            if iteration < n_iterations - 1:
+                F_fwd = self.get_initial_layer(IDAG, N, layout_partitions)
+                pi, _ = self._heuristic_search_layout_only(
+                    F_fwd,
+                    pi,
+                    DAG,
+                    IDAG,
+                    layout_partitions,
+                    scoring_partitions,
+                    D,
+                    rng=rng,
+                    candidate_cache=candidate_cache,
+                )
 
+        F_eval = self.get_initial_layer(IDAG, N, layout_partitions)
+        _, cost = self._heuristic_search_layout_only(
+            F_eval,
+            pi.copy(),
+            DAG,
+            IDAG,
+            layout_partitions,
+            scoring_partitions,
+            D,
+            rng=None,
+            candidate_cache=candidate_cache,
+        )
+        return cost, pi
+
+
+    def _run_layout_trials(
+        self,
+        seeded_pi,
+        DAG,
+        IDAG,
+        layout_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        n_iterations,
+        n_trials,
+        random_seed,
+    ):
+        trial_indices = list(range(max(1, n_trials)))
+        use_parallel = (
+            self.config.get("parallel_layout_trials", False)
+            and len(trial_indices) > 1
+        )
+
+        if not use_parallel:
+            return [
+                self._run_single_layout_trial(
+                    trial_idx=trial_idx,
+                    seeded_pi=seeded_pi,
+                    DAG=DAG,
+                    IDAG=IDAG,
+                    layout_partitions=layout_partitions,
+                    scoring_partitions=scoring_partitions,
+                    D=D,
+                    candidate_cache=candidate_cache,
+                    n_iterations=n_iterations,
+                    n_trials=n_trials,
+                    random_seed=random_seed,
+                )
+                for trial_idx in trial_indices
+            ]
+
+        workers = self.config.get("layout_trial_workers", 0)
+        if workers <= 0:
+            workers = min(len(trial_indices), mp.cpu_count())
+
+        worker_state = {
+            "config": dict(self.config),
+            "adj": tuple(tuple(neighbors) for neighbors in self._adj),
+            "seeded_pi": np.asarray(seeded_pi),
+            "DAG": DAG,
+            "IDAG": IDAG,
+            "layout_partitions": layout_partitions,
+            "scoring_partitions": scoring_partitions,
+            "D": np.asarray(D),
+            "candidate_cache": candidate_cache,
+            "n_iterations": n_iterations,
+            "n_trials": n_trials,
+            "random_seed": random_seed,
+        }
+
+        with Pool(
+            processes=workers,
+            initializer=_init_layout_trial_worker,
+            initargs=(worker_state,),
+        ) as pool:
+            return pool.map(_run_layout_trial_worker, trial_indices)
+        
+    def Partition_Aware_Mapping(
+        self, circ: Circuit, orig_parameters: np.ndarray
+    ):
+        N = circ.get_Qbit_Num()
 
         optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters)
 
@@ -634,24 +878,59 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
 
         D = self.compute_distances_bfs(N)
         scoring_partitions = self._build_scoring_partitions(optimized_partitions)
-        seeded_pi = self._compute_seeded_layout(optimized_partitions, D, N, circ)
+        candidate_cache = self._build_partition_candidate_cache(
+            scoring_partitions
+        )
+        layout_partitions = self._build_layout_partition_info(
+            optimized_partitions
+        )
+        seeded_pi = self._compute_seeded_layout(
+            optimized_partitions, D, N, circ
+        )
 
         n_iterations = self.config.get('sabre_iterations', 1)
         n_trials = self.config.get('n_layout_trials', 1)
         random_seed = self.config.get('random_seed', 42)
         do_cleanup = self.config.get('cleanup', True)
+
         routing_start = time.time()
+
         if n_iterations == 0:
-            # Single forward pass from seeded layout
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
             partition_order, pi, pi_initial = self.Heuristic_Search(
-                F, pi=seeded_pi.copy(), DAG=DAG, IDAG=IDAG,
+                F,
+                pi=seeded_pi.copy(),
+                DAG=DAG,
+                IDAG=IDAG,
                 optimized_partitions=optimized_partitions,
-                scoring_partitions=scoring_partitions, D=D,
+                scoring_partitions=scoring_partitions,
+                D=D,
+                candidate_cache=candidate_cache,
             )
+            final_circuit, final_parameters = self.Construct_circuit_from_HS(
+                partition_order, optimized_partitions, N
+            )
+
         else:
+            trial_results = self._run_layout_trials(
+                seeded_pi=seeded_pi,
+                DAG=DAG,
+                IDAG=IDAG,
+                layout_partitions=layout_partitions,
+                scoring_partitions=scoring_partitions,
+                D=D,
+                candidate_cache=candidate_cache,
+                n_iterations=n_iterations,
+                n_trials=max(1, n_trials),
+                random_seed=random_seed,
+            )
+            trial_results.sort(key=lambda x: x[0])
+
             if do_cleanup:
-                from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+                    qgd_Wide_Circuit_Optimization,
+                )
+
                 cleanup_config = dict(self.config)
                 cleanup_config['topology'] = self.topology
                 cleanup_config['routed'] = True
@@ -660,125 +939,100 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 cleanup_config['global_min'] = False
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
-                # Save single-qubit partition circuits before trial loop
-                saved_sq_circuits = {i: p.circuit for i, p in enumerate(optimized_partitions)
-                                     if isinstance(p, SingleQubitPartitionResult)}
-
-                trial_results = []  # (pre_cleanup_cnots, circuit, params, pi_init, pi_out)
-
-                for trial in range(max(1, n_trials)):
-                    rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
-                    pi = seeded_pi.copy() if trial == 0 else np.arange(N)
+                saved_sq_circuits = {
+                    i: p.circuit.copy()
+                    for i, p in enumerate(optimized_partitions)
+                    if isinstance(p, SingleQubitPartitionResult)
+                }
 
-                    for iteration in range(n_iterations):
-                        # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
-                        F_rev = self.get_final_layer(DAG, N, optimized_partitions)
-                        pi, _ = self._heuristic_search_layout_only(
-                            F_rev, pi, IDAG, DAG, optimized_partitions, scoring_partitions, D,
-                            rng=rng,
-                            reverse=True,
-                        )
+                cleanup_top_k = self.config.get('cleanup_top_k', 3)
+                top_layouts = trial_results[:cleanup_top_k]
 
-                        # Forward layout-only pass (skip on last iteration)
-                        if iteration < n_iterations - 1:
-                            F_fwd = self.get_initial_layer(IDAG, N, optimized_partitions)
-                            pi, _ = self._heuristic_search_layout_only(
-                                F_fwd, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
-                                rng=rng,
-                            )
+                best_circuit = None
+                best_params = None
+                best_pi_init = None
+                best_pi = None
+                best_cost = float('inf')
+                best_pre_cleanup = None
 
-                    # Restore single-qubit partition circuits before full forward pass
-                    for i, orig in saved_sq_circuits.items():
-                        optimized_partitions[i].circuit = orig.copy()
+                for _, trial_pi in top_layouts:
+                    for idx, orig in saved_sq_circuits.items():
+                        optimized_partitions[idx].circuit = orig.copy()
 
-                    # Full forward pass
-                    F_trial = self.get_initial_layer(IDAG, N, optimized_partitions)
+                    F_trial = self.get_initial_layer(
+                        IDAG, N, optimized_partitions
+                    )
                     partition_order, pi_out, pi_init = self.Heuristic_Search(
-                        F_trial, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                        F_trial,
+                        trial_pi.copy(),
+                        DAG,
+                        IDAG,
+                        optimized_partitions,
+                        scoring_partitions,
+                        D,
+                        candidate_cache=candidate_cache,
                     )
 
                     trial_circuit, trial_params = self.Construct_circuit_from_HS(
-                        partition_order, optimized_partitions, N,
+                        partition_order, optimized_partitions, N
+                    )
+                    pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get(
+                        'CNOT', 0
                     )
-                    pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0)
-                    trial_results.append((pre_cleanup_cnots, trial_circuit, trial_params, pi_init, pi_out))
-
-                # Apply cleanup only to the top-k candidates by pre-cleanup CNOT count
-                cleanup_top_k = self.config.get('cleanup_top_k', 3)
-                trial_results.sort(key=lambda x: x[0])
-                top_k_results = trial_results[:cleanup_top_k]
-
-                best_circuit = best_params = best_pi_init = best_pi = None
-                best_cost = float('inf')
-                best_pre_cleanup = None
 
-                for pre_cleanup_cnots, trial_circuit, trial_params, pi_init, pi_out in top_k_results:
                     cleaned_circuit, cleaned_params = wco.OptimizeWideCircuit(
-                        trial_circuit.get_Flat_Circuit(), trial_params
+                        trial_circuit.get_Flat_Circuit(),
+                        trial_params,
+                    )
+                    cleaned_cost = cleaned_circuit.get_Gate_Nums().get(
+                        'CNOT', 0
                     )
-                    cost = cleaned_circuit.get_Gate_Nums().get('CNOT', 0)
-                    if cost < best_cost:
-                        best_cost = cost
+
+                    if cleaned_cost < best_cost:
+                        best_cost = cleaned_cost
                         best_pre_cleanup = pre_cleanup_cnots
-                        best_circuit, best_params = cleaned_circuit, cleaned_params
-                        best_pi_init, best_pi = pi_init, pi_out
+                        best_circuit = cleaned_circuit
+                        best_params = cleaned_params
+                        best_pi_init = pi_init
+                        best_pi = pi_out
 
-                final_circuit, final_parameters = best_circuit, best_params
-                pi_initial, pi = best_pi_init, best_pi
+                final_circuit = best_circuit
+                final_parameters = best_params
+                pi_initial = best_pi_init
+                pi = best_pi
 
             else:
-                best_pi = None
-                best_cost = float('inf')
-
-                for trial in range(max(1, n_trials)):
-                    rng = np.random.RandomState(random_seed + trial) if n_trials > 1 else None
-                    pi = seeded_pi.copy() if trial == 0 else np.arange(N)
-
-                    for iteration in range(n_iterations):
-                        # Reverse pass: walk DAG backwards (swap DAG↔IDAG)
-                        F_rev = self.get_final_layer(DAG, N, optimized_partitions)
-                        pi, _ = self._heuristic_search_layout_only(
-                            F_rev, pi, IDAG, DAG, optimized_partitions, scoring_partitions, D,
-                            rng=rng,
-                            reverse=True,
-                        )
-
-                        # Forward layout-only pass (skip on last iteration — real pass follows)
-                        if iteration < n_iterations - 1:
-                            F_fwd = self.get_initial_layer(IDAG, N, optimized_partitions)
-                            pi, _ = self._heuristic_search_layout_only(
-                                F_fwd, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
-                                rng=rng,
-                            )
+                _, best_pi = trial_results[0]
 
-                    # Score this trial: deterministic forward layout-only pass
-                    F_eval = self.get_initial_layer(IDAG, N, optimized_partitions)
-                    _, cost = self._heuristic_search_layout_only(
-                        F_eval, pi.copy(), DAG, IDAG, optimized_partitions, scoring_partitions, D,
-                        rng=None,
-                    )
-
-                    if cost < best_cost:
-                        best_cost = cost
-                        best_pi = pi.copy()
-
-                # Final forward pass — builds actual circuits
                 F = self.get_initial_layer(IDAG, N, optimized_partitions)
                 partition_order, pi, pi_initial = self.Heuristic_Search(
-                    F, best_pi, DAG, IDAG, optimized_partitions, scoring_partitions, D,
+                    F,
+                    best_pi.copy(),
+                    DAG,
+                    IDAG,
+                    optimized_partitions,
+                    scoring_partitions,
+                    D,
+                    candidate_cache=candidate_cache,
+                )
+                final_circuit, final_parameters = self.Construct_circuit_from_HS(
+                    partition_order, optimized_partitions, N
                 )
 
         if do_cleanup and n_iterations > 0:
-            # Cleanup already done per-trial
             self._routing_time = time.time() - routing_start
             self._cnot_pre_cleanup = best_pre_cleanup
         else:
-            final_circuit, final_parameters = self.Construct_circuit_from_HS(partition_order, optimized_partitions, N)
             self._routing_time = time.time() - routing_start
-            self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get('CNOT', 0)
+            self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get(
+                'CNOT', 0
+            )
 
             if self.config.get('cleanup', True):
-                from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+                    qgd_Wide_Circuit_Optimization,
+                )
+
                 cleanup_config = dict(self.config)
                 cleanup_config['topology'] = self.topology
                 cleanup_config['routed'] = True
@@ -786,6 +1040,7 @@ def Partition_Aware_Mapping(self, circ: Circuit, orig_parameters: np.ndarray):
                 cleanup_config['test_final_circuit'] = False
                 cleanup_config['global_min'] = False
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+
                 final_circuit, final_parameters = wco.OptimizeWideCircuit(
                     final_circuit.get_Flat_Circuit(), final_parameters
                 )
@@ -908,154 +1163,219 @@ def _release_valve(self, F, pi, D, canonical_data):
         pi_new = self._apply_swaps_to_pi(pi, swaps)
         return swaps, pi_new
 
-    def Heuristic_Search(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D):
+    def Heuristic_Search(
+        self,
+        F,
+        pi,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache=None,
+    ):
         pi_initial = pi.copy()
+        F = list(F)
 
         resolved_partitions = [False] * len(DAG)
         partition_order = []
-        step = 0
+        resolved_count = 0
 
-        # Drain initial single-qubit partitions from F, recursively resolving
-        # any single-qubit descendants that become ready.  Children that are
-        # multi-qubit are pushed into F for the main search loop.
-        queue = [p for p in F if isinstance(optimized_partitions[p], SingleQubitPartitionResult)]
+        queue = deque(
+            p
+            for p in F
+            if isinstance(optimized_partitions[p], SingleQubitPartitionResult)
+        )
         while queue:
             partition_idx = queue.pop()
             if resolved_partitions[partition_idx]:
                 continue
             if partition_idx in F:
                 F.remove(partition_idx)
+
             single_qubit_part = optimized_partitions[partition_idx]
             original_qubit = int(single_qubit_part.involved_qbits[0])
             circuit_qubit = int(single_qubit_part.circuit.get_Qbits()[0])
-            single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits({circuit_qubit: int(pi[original_qubit])}, max(D.shape))
+            single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits(
+                {circuit_qubit: int(pi[original_qubit])},
+                max(D.shape),
+            )
             partition_order.append(single_qubit_part)
             resolved_partitions[partition_idx] = True
+            resolved_count += 1
+
             for child in DAG[partition_idx]:
                 if not resolved_partitions[child] and child not in F:
                     if all(resolved_partitions[p] for p in IDAG[child]):
-                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                        if isinstance(
+                            optimized_partitions[child],
+                            SingleQubitPartitionResult,
+                        ):
                             queue.append(child)
                         else:
                             F.append(child)
 
-        # Initialize progress bar
         total_partitions = len(DAG)
-        pbar = tqdm(total=total_partitions, desc="Heuristic Search",
-                   bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved',
-                   disable=self.config.get('progressbar', 0) == False)
+        pbar = tqdm(
+            total=total_partitions,
+            desc="Heuristic Search",
+            bar_format=(
+                "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved"
+            ),
+            disable=self.config.get("progressbar", 0) is False,
+            mininterval=0.2,
+        )
+        if resolved_count:
+            pbar.update(resolved_count)
 
-        max_E_size = self.config.get('max_E_size', 20)
-        max_lookahead = self.config.get('max_lookahead', 4)
-        E_W = self.config.get('E_weight', 0.5)
-        E_alpha = self.config.get('E_alpha', 0.9)
+        max_E_size = self.config.get("max_E_size", 20)
+        max_lookahead = self.config.get("max_lookahead", 4)
+        E_W = self.config.get("E_weight", 0.5)
+        E_alpha = self.config.get("E_alpha", 0.9)
 
-        canonical_data = self._build_canonical_neighbor_data(scoring_partitions, reverse=False)
+        canonical_data = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=False
+        )
 
-        valve_enabled = self.config.get('release_valve_enabled', True)
-        valve_threshold = self.config.get('release_valve_threshold', 20)
+        valve_enabled = self.config.get("release_valve_enabled", True)
+        valve_threshold = self.config.get("release_valve_threshold", 20)
         swaps_since_clean = 0
 
-        while len(F) != 0:
-                if valve_enabled and swaps_since_clean > valve_threshold:
-                    valve_swaps, pi_bridged = self._release_valve(F, pi, D, canonical_data)
-                    if valve_swaps:
-                        partition_order.append(construct_swap_circuit(valve_swaps, len(pi)))
-                        pi = np.asarray(pi_bridged)
-                    swaps_since_clean = 0
-                    continue
+        while F:
+            if valve_enabled and swaps_since_clean > valve_threshold:
+                valve_swaps, pi_bridged = self._release_valve(
+                    F, pi, D, canonical_data
+                )
+                if valve_swaps:
+                    partition_order.append(
+                        construct_swap_circuit(valve_swaps, len(pi))
+                    )
+                    pi = np.asarray(pi_bridged)
+                swaps_since_clean = 0
+                continue
 
-                partition_candidates = self.obtain_partition_candidates(F,optimized_partitions)
-                if len(partition_candidates) == 0:
-                    break
+            partition_candidates = self.obtain_partition_candidates(
+            F,
+            optimized_partitions,
+            candidate_cache=candidate_cache,
+            )
 
-                top_k = self.config.get('prefilter_top_k', 50)
-                partition_candidates = self._prefilter_candidates(partition_candidates, pi, D, top_k)
+            if not partition_candidates:
+                break
+
+            top_k = self.config.get("prefilter_top_k", 50)
+            partition_candidates = self._prefilter_candidates(
+                partition_candidates, pi, D, top_k
+            )
 
-                F_snapshot = tuple(F)
+            F_snapshot = tuple(F)
+            E = self.generate_extended_set(
+                F,
+                DAG,
+                IDAG,
+                resolved_partitions,
+                optimized_partitions,
+                max_E_size=max_E_size,
+                max_lookahead=max_lookahead,
+            )
 
-                E = self.generate_extended_set(
-                    F, DAG, IDAG, resolved_partitions, optimized_partitions,
-                    max_E_size=max_E_size, max_lookahead=max_lookahead
+            scores = [
+                self.score_partition_candidate(
+                    partition_candidate,
+                    F_snapshot,
+                    pi,
+                    scoring_partitions,
+                    D,
+                    self._swap_cache,
+                    E=E,
+                    W=E_W,
+                    alpha=E_alpha,
+                    canonical_data=canonical_data,
+                    adj=self._adj,
+                    local_cost_weight=self.config.get("local_cost_weight", 0.1),
+                    swap_cost=self.config.get("swap_cost", 15.0),
                 )
+                for partition_candidate in partition_candidates
+            ]
+            min_partition_candidate = self._select_best_candidate(
+                partition_candidates, scores
+            )
 
-                scores = [
-                        self.score_partition_candidate(
-                            partition_candidate,
-                            F_snapshot,
-                            pi,
-                            scoring_partitions,
-                            D,
-                            self._swap_cache,
-                            E=E,
-                            W=E_W,
-                            alpha=E_alpha,
-                            canonical_data=canonical_data,
-                            adj=self._adj,
-                            local_cost_weight=self.config.get('local_cost_weight', 0.1),
-                            swap_cost=self.config.get('swap_cost', 15.0),
-                        )
-                        for partition_candidate in partition_candidates
-                    ]
-                min_partition_candidate = self._select_best_candidate(partition_candidates, scores)
-
-                F.remove(min_partition_candidate.partition_idx)
-                resolved_partitions[min_partition_candidate.partition_idx] = True
-                resolved_count = sum(resolved_partitions)
-                pbar.n = resolved_count
-                pbar.refresh()
-
-                swap_order, pi = min_partition_candidate.transform_pi(
-                    pi, D, self._swap_cache, adj=self._adj,
+            F.remove(min_partition_candidate.partition_idx)
+            resolved_partitions[min_partition_candidate.partition_idx] = True
+            resolved_count += 1
+            pbar.update(1)
+
+            swap_order, pi = min_partition_candidate.transform_pi(
+                pi, D, self._swap_cache, adj=self._adj
+            )
+            if swap_order:
+                partition_order.append(construct_swap_circuit(swap_order, len(pi)))
+                swaps_since_clean += len(swap_order)
+            else:
+                swaps_since_clean = 0
+
+            partition_order.append(min_partition_candidate)
+
+            children = deque(DAG[min_partition_candidate.partition_idx])
+            while children:
+                child = children.popleft()
+                parents_resolved = all(
+                    resolved_partitions[parent] for parent in IDAG[child]
                 )
-                if len(swap_order) != 0:
-                    partition_order.append(construct_swap_circuit(swap_order, len(pi)))
-                    swaps_since_clean += len(swap_order)
-                else:
-                    swaps_since_clean = 0
-
-                partition_order.append(min_partition_candidate)
-                children = list(DAG[min_partition_candidate.partition_idx])
-                step += 1
-                while len(children) != 0:
-                    child = children.pop(0)
-                    parents_resolved = True
-                    for parent in IDAG[child]:
-                        parents_resolved *= resolved_partitions[parent]
-                    if (not resolved_partitions[child] and child not in F) and parents_resolved:
-                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
-                            child_partition = optimized_partitions[child]
-                            original_qubit = int(child_partition.involved_qbits[0])
-                            circuit_qubit = int(child_partition.circuit.get_Qbits()[0])
-                            child_partition.circuit = child_partition.circuit.Remap_Qbits({circuit_qubit: int(pi[original_qubit])},max(D.shape))
-                            partition_order.append(child_partition)
-                            resolved_partitions[child] = True
-                            resolved_count = sum(resolved_partitions)
-                            pbar.n = resolved_count
-                            pbar.refresh()
-                            children.extend(DAG[child])
-                        else:
-                            F.append(child)
+                if (not resolved_partitions[child] and child not in F) and (
+                    parents_resolved
+                ):
+                    if isinstance(
+                        optimized_partitions[child], SingleQubitPartitionResult
+                    ):
+                        child_partition = optimized_partitions[child]
+                        original_qubit = int(child_partition.involved_qbits[0])
+                        circuit_qubit = int(child_partition.circuit.get_Qbits()[0])
+                        child_partition.circuit = child_partition.circuit.Remap_Qbits(
+                            {circuit_qubit: int(pi[original_qubit])},
+                            max(D.shape),
+                        )
+                        partition_order.append(child_partition)
+                        resolved_partitions[child] = True
+                        resolved_count += 1
+                        pbar.update(1)
+                        children.extend(DAG[child])
+                    else:
+                        F.append(child)
 
         pbar.close()
         return partition_order, pi, pi_initial
 
-    def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions, scoring_partitions, D, rng=None, reverse=False):
+    def _heuristic_search_layout_only(
+        self,
+        F,
+        pi,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        rng=None,
+        reverse=False,
+        candidate_cache=None,
+    ):
         """Run heuristic search but only track layout (pi). No circuit modification.
 
         Args:
             reverse: When True, swap P_i/P_o roles in scoring and layout
-                     updates (used for backward passes in SABRE iterations).
+                    updates (used for backward passes in SABRE iterations).
 
         Returns:
             (pi, total_swaps): final layout and total number of SWAPs accumulated.
         """
+        F = list(F)
         resolved_partitions = [False] * len(DAG)
         total_swaps = 0
 
-        # Resolve initial single-qubit partitions, recursively draining any
-        # single-qubit descendants.  Multi-qubit descendants go into F.
-        queue = [p for p in F if isinstance(optimized_partitions[p], SingleQubitPartitionResult)]
+        queue = deque(
+            p for p in F if self._partition_is_single(optimized_partitions[p])
+        )
         while queue:
             partition_idx = queue.pop()
             if resolved_partitions[partition_idx]:
@@ -1063,71 +1383,100 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
             if partition_idx in F:
                 F.remove(partition_idx)
             resolved_partitions[partition_idx] = True
+
             for child in DAG[partition_idx]:
                 if not resolved_partitions[child] and child not in F:
                     if all(resolved_partitions[p] for p in IDAG[child]):
-                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                        if self._partition_is_single(optimized_partitions[child]):
                             queue.append(child)
                         else:
                             F.append(child)
 
-        max_E_size = self.config.get('max_E_size', 20)
-        max_lookahead = self.config.get('max_lookahead', 4)
-        E_W = self.config.get('E_weight', 0.5)
-        E_alpha = self.config.get('E_alpha', 0.9)
+        max_E_size = self.config.get("max_E_size", 20)
+        max_lookahead = self.config.get("max_lookahead", 4)
+        E_W = self.config.get("E_weight", 0.5)
+        E_alpha = self.config.get("E_alpha", 0.9)
 
-        canonical_data = self._build_canonical_neighbor_data(scoring_partitions, reverse=reverse)
+        canonical_data = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=reverse
+        )
 
         while F:
-            partition_candidates = self.obtain_partition_candidates(F, optimized_partitions)
+            partition_candidates = self.obtain_partition_candidates(
+                F,
+                optimized_partitions,
+                candidate_cache=candidate_cache,
+            )
             if not partition_candidates:
                 break
 
-            top_k = self.config.get('prefilter_top_k', 50)
-            partition_candidates = self._prefilter_candidates(partition_candidates, pi, D, top_k, reverse=reverse)
+            top_k = self.config.get("prefilter_top_k", 50)
+            partition_candidates = self._prefilter_candidates(
+                partition_candidates, pi, D, top_k, reverse=reverse
+            )
 
             F_snapshot = tuple(F)
-
             E = self.generate_extended_set(
-                F, DAG, IDAG, resolved_partitions, optimized_partitions,
-                max_E_size=max_E_size, max_lookahead=max_lookahead
+                F,
+                DAG,
+                IDAG,
+                resolved_partitions,
+                optimized_partitions,
+                max_E_size=max_E_size,
+                max_lookahead=max_lookahead,
             )
 
             scores = [
                 self.score_partition_candidate(
-                    pc, F_snapshot, pi, scoring_partitions, D,
+                    pc,
+                    F_snapshot,
+                    pi,
+                    scoring_partitions,
+                    D,
                     self._swap_cache,
-                    E=E, W=E_W, alpha=E_alpha,
+                    E=E,
+                    W=E_W,
+                    alpha=E_alpha,
                     reverse=reverse,
                     canonical_data=canonical_data,
                     adj=self._adj,
-                    local_cost_weight=self.config.get('local_cost_weight', 0.1),
-                    swap_cost=self.config.get('swap_cost', 15.0),
+                    local_cost_weight=self.config.get("local_cost_weight", 0.1),
+                    swap_cost=self.config.get("swap_cost", 15.0),
                 )
                 for pc in partition_candidates
             ]
 
-            best = self._select_best_candidate(partition_candidates, scores, rng=rng)
+            best = self._select_best_candidate(
+                partition_candidates, scores, rng=rng
+            )
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
             swaps, pi = best.transform_pi(
-                pi, D, self._swap_cache, reverse=reverse, adj=self._adj,
+                pi,
+                D,
+                self._swap_cache,
+                reverse=reverse,
+                adj=self._adj,
             )
             total_swaps += len(swaps)
 
-            # Promote children
             for child in DAG[best.partition_idx]:
                 if not resolved_partitions[child] and child not in F:
                     if all(resolved_partitions[p] for p in IDAG[child]):
-                        if isinstance(optimized_partitions[child], SingleQubitPartitionResult):
+                        if self._partition_is_single(optimized_partitions[child]):
                             resolved_partitions[child] = True
-                            stack = list(DAG[child])
+                            stack = deque(DAG[child])
                             while stack:
                                 gc = stack.pop()
                                 if not resolved_partitions[gc] and gc not in F:
-                                    if all(resolved_partitions[p] for p in IDAG[gc]):
-                                        if isinstance(optimized_partitions[gc], SingleQubitPartitionResult):
+                                    if all(
+                                        resolved_partitions[p]
+                                        for p in IDAG[gc]
+                                    ):
+                                        if self._partition_is_single(
+                                            optimized_partitions[gc]
+                                        ):
                                             resolved_partitions[gc] = True
                                             stack.extend(DAG[gc])
                                         else:
@@ -1135,8 +1484,7 @@ def _heuristic_search_layout_only(self, F, pi, DAG, IDAG, optimized_partitions,
                         else:
                             F.append(child)
 
-        return pi, total_swaps
-
+        return pi, total_swaps    
     # ------------------------------------------------------------------------
     # Circuit Construction
     # ------------------------------------------------------------------------
@@ -1283,12 +1631,19 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def generate_extended_set(F, DAG, IDAG, resolved_partitions, optimized_partitions,
-                              max_E_size=20, max_lookahead=4):
+    def generate_extended_set(
+        F,
+        DAG,
+        IDAG,
+        resolved_partitions,
+        optimized_partitions,
+        max_E_size=20,
+        max_lookahead=4,
+    ):
         """
         Generate SABRE-style extended set: multi-qubit partitions near the
         front layer, up to ``max_lookahead`` levels deep and ``max_E_size``
-        entries.  Returns list of (partition_idx, depth) tuples.
+        entries. Returns list of (partition_idx, depth) tuples.
         """
         E = []
         E_set = set()
@@ -1298,13 +1653,10 @@ def generate_extended_set(F, DAG, IDAG, resolved_partitions, optimized_partition
             if len(E) >= max_E_size:
                 break
 
-            # BFS from front_idx through DAG children
-            queue = []  # (child_idx, depth)
-            for child in DAG[front_idx]:
-                queue.append((child, 1))
+            queue = deque((child, 1) for child in DAG[front_idx])
 
             while queue and len(E) < max_E_size:
-                child_idx, depth = queue.pop(0)
+                child_idx, depth = queue.popleft()
                 if depth > max_lookahead:
                     continue
                 if child_idx in E_set or child_idx in F_set:
@@ -1312,16 +1664,15 @@ def generate_extended_set(F, DAG, IDAG, resolved_partitions, optimized_partition
                 if resolved_partitions[child_idx]:
                     continue
 
-                # Check all parents resolved (except those still in F)
                 parents_resolved = all(
-                    resolved_partitions[p] or p in F_set
-                    for p in IDAG[child_idx]
+                    resolved_partitions[p] or p in F_set for p in IDAG[child_idx]
                 )
                 if not parents_resolved:
                     continue
 
-                # Skip single-qubit partitions — follow through them
-                if isinstance(optimized_partitions[child_idx], SingleQubitPartitionResult):
+                if qgd_Partition_Aware_Mapping._partition_is_single(
+                    optimized_partitions[child_idx]
+                ):
                     for grandchild in DAG[child_idx]:
                         queue.append((grandchild, depth))
                     continue
@@ -1339,19 +1690,49 @@ def generate_extended_set(F, DAG, IDAG, resolved_partitions, optimized_partition
     # Candidate Generation
     # ------------------------------------------------------------------------
 
-    def obtain_partition_candidates(self, F, optimized_partitions):
+    def obtain_partition_candidates(
+        self,
+        F,
+        optimized_partitions=None,
+        candidate_cache=None,
+    ):
+        if candidate_cache is not None:
+            partition_candidates = []
+            for partition_idx in F:
+                cached = candidate_cache[partition_idx]
+                if cached:
+                    partition_candidates.extend(cached)
+            return partition_candidates
+
         partition_candidates = []
         for partition_idx in F:
             partition = optimized_partitions[partition_idx]
             for tdx, mini_topology in enumerate(partition.mini_topologies):
-                # Use pre-computed topology candidates if available, otherwise compute and cache
                 if hasattr(partition, 'get_topology_candidates'):
                     topology_candidates = partition.get_topology_candidates(tdx)
                 else:
-                    topology_candidates = self._get_subtopologies_of_type_cached(mini_topology)
+                    topology_candidates = self._get_subtopologies_of_type_cached(
+                        mini_topology
+                    )
                 for topology_candidate in topology_candidates:
-                    for pdx, permutation_pair in enumerate(partition.permutations_pairs[tdx]):
-                        partition_candidates.append(PartitionCandidate(partition_idx,tdx,pdx,partition.circuit_structures[tdx][pdx],permutation_pair[0],permutation_pair[1],topology_candidate,mini_topology,partition.qubit_map,partition.involved_qbits,cnot_count=partition.cnot_counts[tdx][pdx]))
+                    for pdx, permutation_pair in enumerate(
+                        partition.permutations_pairs[tdx]
+                    ):
+                        partition_candidates.append(
+                            PartitionCandidate(
+                                partition_idx,
+                                tdx,
+                                pdx,
+                                partition.circuit_structures[tdx][pdx],
+                                permutation_pair[0],
+                                permutation_pair[1],
+                                topology_candidate,
+                                mini_topology,
+                                partition.qubit_map,
+                                partition.involved_qbits,
+                                cnot_count=partition.cnot_counts[tdx][pdx],
+                            )
+                        )
         return partition_candidates
 
     # ------------------------------------------------------------------------
@@ -1360,29 +1741,33 @@ def obtain_partition_candidates(self, F, optimized_partitions):
         
     def get_initial_layer(self, IDAG, N, optimized_partitions):
         initial_layer = []
-        active_qbits = list(range(N))
+        active_qbits = set(range(N))
         for idx in range(len(IDAG)):
             if len(IDAG[idx]) == 0:
                 initial_layer.append(idx)
-                for qbit in optimized_partitions[idx].involved_qbits:
-                    active_qbits.remove(qbit)
-            if len(active_qbits) == 0:
+                for qbit in self._partition_involved_qbits(
+                    optimized_partitions[idx]
+                ):
+                    active_qbits.discard(qbit)
+            if not active_qbits:
                 break
         return initial_layer
 
+
     def get_final_layer(self, DAG, N, optimized_partitions):
         final_layer = []
-        active_qbits = list(range(N))
+        active_qbits = set(range(N))
         for idx in range(len(DAG) - 1, -1, -1):
             if len(DAG[idx]) == 0:
                 final_layer.append(idx)
-                for qbit in optimized_partitions[idx].involved_qbits:
-                    if qbit in active_qbits:
-                        active_qbits.remove(qbit)
-            if len(active_qbits) == 0:
+                for qbit in self._partition_involved_qbits(
+                    optimized_partitions[idx]
+                ):
+                    active_qbits.discard(qbit)
+            if not active_qbits:
                 break
         return final_layer
-            
+                
     def construct_DAG_and_IDAG(self, optimized_partitions):
         DAG = []
         IDAG = []

From 4a1790e222f90cc22c5ac32c4474fd4360d37e58 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Tue, 21 Apr 2026 19:34:30 +0200
Subject: [PATCH 124/232] set global min to true for cleanup

---
 squander/synthesis/PartAM.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 8456b7951..941b546b1 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -936,7 +936,7 @@ def Partition_Aware_Mapping(
                 cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
-                cleanup_config['global_min'] = False
+                cleanup_config['global_min'] = True
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
                 saved_sq_circuits = {
@@ -1038,7 +1038,7 @@ def Partition_Aware_Mapping(
                 cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
-                cleanup_config['global_min'] = False
+                cleanup_config['global_min'] = True
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
                 final_circuit, final_parameters = wco.OptimizeWideCircuit(

From 0bfd35f0a8c97b053f92f0f5d1b4193463b7ee68 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 21 Apr 2026 21:17:40 +0200
Subject: [PATCH 125/232] dont include cleanup in routing time

---
 squander/synthesis/PartAM.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 941b546b1..21beaf1a9 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -954,6 +954,7 @@ def Partition_Aware_Mapping(
                 best_pi = None
                 best_cost = float('inf')
                 best_pre_cleanup = None
+                cleanup_total = 0.0
 
                 for _, trial_pi in top_layouts:
                     for idx, orig in saved_sq_circuits.items():
@@ -980,10 +981,12 @@ def Partition_Aware_Mapping(
                         'CNOT', 0
                     )
 
+                    cleanup_t0 = time.time()
                     cleaned_circuit, cleaned_params = wco.OptimizeWideCircuit(
                         trial_circuit.get_Flat_Circuit(),
                         trial_params,
                     )
+                    cleanup_total += time.time() - cleanup_t0
                     cleaned_cost = cleaned_circuit.get_Gate_Nums().get(
                         'CNOT', 0
                     )
@@ -1020,7 +1023,7 @@ def Partition_Aware_Mapping(
                 )
 
         if do_cleanup and n_iterations > 0:
-            self._routing_time = time.time() - routing_start
+            self._routing_time = time.time() - routing_start - cleanup_total
             self._cnot_pre_cleanup = best_pre_cleanup
         else:
             self._routing_time = time.time() - routing_start

From 021b870495fa0126ffe24fe60ce553b1a9394f6a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 21 Apr 2026 22:06:16 +0200
Subject: [PATCH 126/232] Fix tests

---
 tests/gates/test_gates.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/tests/gates/test_gates.py b/tests/gates/test_gates.py
index ba38e7e85..546ed32e9 100644
--- a/tests/gates/test_gates.py
+++ b/tests/gates/test_gates.py
@@ -49,7 +49,8 @@ def _discover_gate_names():
 
 
 ALL_GATE_NAMES = _discover_gate_names()
-QISKIT_EXCLUDED_GATES = {"SYC", "CR", "CROT"}
+QISKIT_EXCLUDED_GATES = {"SYC", "CR", "CROT", "Permutation"}
+CIRCUIT_UNSUPPORTED_GATES = {"Gate", "Permutation"}
 QISKIT_MATRIX_UNSUPPORTED = {"Gate"} | QISKIT_EXCLUDED_GATES
 NATIVE_UNSAFE_MATRIX_GATES = {"Gate"}
 NATIVE_UNSAFE_APPLY_GATES = {"Gate"}
@@ -70,7 +71,7 @@ def _discover_parameterized_gate_names():
 def _discover_multi_qubit_gate_names():
     names = []
     for gate_name in ALL_GATE_NAMES:
-        if gate_name == "Gate":
+        if gate_name in CIRCUIT_UNSUPPORTED_GATES:
             continue
         gate_obj = _instantiate_gate(gate_name)
         if len(gate_obj.get_Involved_Qbits()) >= 2:
@@ -93,6 +94,8 @@ def _instantiate_gate(gate_name, qbit_num=4):
         return gate_cls(qbit_num, 0, qbit_num - 1)
     if gate_name.startswith("C"):
         return gate_cls(qbit_num, 0, qbit_num - 1)
+    if gate_name == "Permutation":
+        return gate_cls(qbit_num, list(range(qbit_num)))
     return gate_cls(qbit_num, 0)
 
 
@@ -608,7 +611,7 @@ def test_qiskit_io_roundtrip_per_gate(self, gate_name):
 
     @pytest.mark.parametrize(
         "gate_name",
-        [name for name in ALL_GATE_NAMES if name != "Gate"],
+        [name for name in ALL_GATE_NAMES if name not in CIRCUIT_UNSUPPORTED_GATES],
     )
     def test_squander_invert_circuit(self, gate_name):
         script = f"""
@@ -679,16 +682,4 @@ def test_circuit_to_cnot_basis_removes_non_cnot_multi_qubit_gates(self, gate_nam
 
         original_matrix = np.asarray(circuit.get_Matrix(params))
         transpiled_matrix = np.asarray(cnot_circuit.get_Matrix(cnot_params))
-        _assert_matrices_close(original_matrix, transpiled_matrix, tol=1e-6)
-
-            if inspect.isclass(obj):
-                
-                if name == "SYC" or name == "Gate" or name=="CR" or name=="CROT" or name=="Permutation":
-                    continue
-
-                print(f"testing gate: {name}")
-
-                self.perform_gate_matrix_testing( obj )
-                if name == "SWAP" or name == "RXX" or name == "RYY" or name == "RZZ":
-                    continue
-                self.perform_gate_apply_to_testing( obj )
\ No newline at end of file
+        _assert_matrices_close(original_matrix, transpiled_matrix, tol=1e-6)
\ No newline at end of file

From 4ba0ebae5b31f333ed0d158f987f921fbe4048c5 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 21 Apr 2026 22:36:41 +0200
Subject: [PATCH 127/232] Make PartAM cover larger initial layouts

---
 squander/synthesis/PartAM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 21beaf1a9..02aaa5fcf 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -753,7 +753,7 @@ def _run_single_layout_trial(
             if n_trials > 1
             else None
         )
-        pi = seeded_pi.copy() if trial_idx == 0 else np.arange(N)
+        pi = seeded_pi.copy() if trial_idx == 0 else rng.permutation(N)
 
         for iteration in range(n_iterations):
             F_rev = self.get_final_layer(DAG, N, layout_partitions)

From 18a1764ec4c48afcedf7663fd0cce4ad1125ce21 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 21 Apr 2026 22:39:37 +0200
Subject: [PATCH 128/232] add vf2 cutoff

---
 squander/synthesis/PartAM.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 02aaa5fcf..d926345c3 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -753,7 +753,11 @@ def _run_single_layout_trial(
             if n_trials > 1
             else None
         )
-        pi = seeded_pi.copy() if trial_idx == 0 else rng.permutation(N)
+        vf2_cutoff = max(1, int(n_trials * 0.05))
+        if trial_idx < vf2_cutoff:
+            pi = seeded_pi.copy()
+        else:
+            pi = rng.permutation(N)
 
         for iteration in range(n_iterations):
             F_rev = self.get_final_layer(DAG, N, layout_partitions)

From 290f5462dcca53221e8976527531cf926ebf53c5 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 22 Apr 2026 20:33:31 +0200
Subject: [PATCH 129/232] Add C++ SABRE routing engine (31x speedup)

Port the hot-path heuristic search from Python to C++17 with pybind11:
- A* constrained swap search (find_constrained_swaps_partial)
- LightSABRE scoring (score_partition_candidate)
- BFS extended set generation (generate_extended_set)
- Full heuristic search loop (_heuristic_search_layout_only)
- Forward-backward-forward trial runner (_run_single_layout_trial)

C++ router runs with GIL released, enabling true multithreading via
ThreadPoolExecutor instead of multiprocessing. On bv_n14: routing drops
from 3.5s to 0.07s (128 parallel trials), same CNOT count as Python.

Key correctness details:
- DAG/IDAG are swapped for backward passes (matching Python convention)
- Single-qubit partitions are NOT auto-resolved in heuristic_search,
  matching Python behavior when layout_partitions (dicts) are passed
- get_initial/final_layer stop adding once all qubits are covered

Python fallback preserved behind use_cpp_router config flag.
---
 CMakeLists.txt                                |   6 +
 squander/src-cpp/sabre_router/CMakeLists.txt  | 118 +++
 .../sabre_router/include/sabre_router.hpp     | 320 ++++++
 .../src-cpp/sabre_router/sabre_router.cpp     | 909 ++++++++++++++++++
 squander/synthesis/PartAM.py                  |  85 ++
 squander/synthesis/bindings.cpp               | 203 ++++
 6 files changed, 1641 insertions(+)
 create mode 100644 squander/src-cpp/sabre_router/CMakeLists.txt
 create mode 100644 squander/src-cpp/sabre_router/include/sabre_router.hpp
 create mode 100644 squander/src-cpp/sabre_router/sabre_router.cpp
 create mode 100644 squander/synthesis/bindings.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d9595be5..839eeea81 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1059,6 +1059,12 @@ add_subdirectory (squander/VQA)
 
 add_subdirectory (squander/src-cpp/density_matrix)
 
+# ===================================================================
+# SABRE Router Module
+# ===================================================================
+
+add_subdirectory (squander/src-cpp/sabre_router)
+
 if(DEFINED ENV{QGD_CTEST})
     # adding CMAKE files for executables
     add_subdirectory (test_standalone)
diff --git a/squander/src-cpp/sabre_router/CMakeLists.txt b/squander/src-cpp/sabre_router/CMakeLists.txt
new file mode 100644
index 000000000..f4cfa9604
--- /dev/null
+++ b/squander/src-cpp/sabre_router/CMakeLists.txt
@@ -0,0 +1,118 @@
+# ===================================================================
+# SQUANDER SABRE Router Module - C++ Routing Engine + pybind11 Bindings
+# ===================================================================
+
+message(STATUS "")
+message(STATUS "=== Configuring SABRE Router Module ===")
+
+# ===================================================================
+# Find pybind11
+# ===================================================================
+
+find_package(pybind11 CONFIG QUIET)
+
+if(NOT pybind11_FOUND)
+    message(STATUS "pybind11 not found via find_package, trying Python import...")
+    execute_process(
+        COMMAND ${PYTHON_EXECUTABLE} -c "import pybind11; print(pybind11.get_cmake_dir())"
+        OUTPUT_VARIABLE pybind11_DIR
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_QUIET
+    )
+    if(pybind11_DIR)
+        message(STATUS "Found pybind11 via Python at: ${pybind11_DIR}")
+        find_package(pybind11 CONFIG PATHS ${pybind11_DIR})
+    endif()
+endif()
+
+if(NOT pybind11_FOUND)
+    message(WARNING "")
+    message(WARNING "pybind11 not found - SABRE router module will be skipped")
+    message(WARNING "Install with: pip install pybind11")
+    message(WARNING "")
+    return()
+endif()
+
+message(STATUS "pybind11 version: ${pybind11_VERSION}")
+
+# ===================================================================
+# Source Files
+# ===================================================================
+
+set(SABRE_SOURCES
+    sabre_router.cpp
+)
+
+set(SABRE_HEADERS
+    include/sabre_router.hpp
+)
+
+# ===================================================================
+# Static C++ library
+# ===================================================================
+
+add_library(sabre_router_core STATIC
+    ${SABRE_SOURCES}
+    ${SABRE_HEADERS}
+)
+
+target_include_directories(sabre_router_core
+    PUBLIC
+        $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+)
+
+# C++17 for this module only (does not affect global C++11)
+target_compile_features(sabre_router_core PUBLIC cxx_std_17)
+
+target_compile_options(sabre_router_core PRIVATE
+    $<$<OR:$<CXX_COMPILER_ID:GNU>,$<CXX_COMPILER_ID:Clang>>:
+        -Wall -Wextra -fPIC
+        $<$<CONFIG:Release>:-O3 -march=native>
+        $<$<CONFIG:Debug>:-g -O0>
+    >
+    $<$<CXX_COMPILER_ID:MSVC>:
+        $<$<CONFIG:Release>:/O2>
+        $<$<CONFIG:Debug>:/Od /Zi>
+    >
+)
+
+set_target_properties(sabre_router_core PROPERTIES
+    POSITION_INDEPENDENT_CODE ON
+)
+
+# ===================================================================
+# pybind11 module
+# ===================================================================
+
+pybind11_add_module(_sabre_router MODULE
+    ../../synthesis/bindings.cpp
+)
+
+target_link_libraries(_sabre_router PRIVATE
+    sabre_router_core
+)
+
+set_target_properties(_sabre_router PROPERTIES
+    LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/squander/synthesis
+    OUTPUT_NAME "_sabre_router"
+)
+
+# Set BUILD_RPATH to prioritize conda libraries
+if(DEFINED ENV{CONDA_PREFIX})
+    set_target_properties(_sabre_router PROPERTIES
+        BUILD_RPATH "${CONDA_PREFIX}/lib"
+        BUILD_RPATH_USE_ORIGIN TRUE
+    )
+endif()
+
+# ===================================================================
+# Installation
+# ===================================================================
+
+install(TARGETS _sabre_router
+        LIBRARY DESTINATION squander/synthesis
+        RUNTIME DESTINATION squander/synthesis
+        COMPONENT python)
+
+message(STATUS "=== SABRE Router Module Configured ===")
+message(STATUS "")
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
new file mode 100644
index 000000000..d1c55cf3f
--- /dev/null
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -0,0 +1,320 @@
+#pragma once
+/*
+Copyright 2025 SQUANDER Contributors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+C++ backend for the SABRE-style partition-aware routing engine.
+Ported from squander/synthesis/PartAM.py and PartAM_utils.py.
+*/
+
+#include <cstdint>
+#include <limits>
+#include <optional>
+#include <queue>
+#include <random>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace squander::routing {
+
+// ---------------------------------------------------------------------------
+// Data structures (flattened from Python objects)
+// ---------------------------------------------------------------------------
+
+struct Edge {
+    int u, v;
+};
+
+struct CandidateData {
+    int partition_idx;
+    int topology_idx;
+    int permutation_idx;
+    int cnot_count;
+
+    // Permutations within the reduced (q*) space
+    // P_i[v] = position in Q* space for input routing
+    // P_o[v] = position in Q* space for output placement
+    std::vector<int> P_i;
+    std::vector<int> P_o;
+
+    // node_mapping_flat[Q*_idx] = Q (physical qubit)
+    // Dense array indexed by Q* index
+    std::vector<int> node_mapping_flat;
+
+    // qbit_map: original circuit qubit q -> reduced qubit q*
+    std::vector<int> qbit_map_keys;
+    std::vector<int> qbit_map_vals;
+
+    // Original circuit qubits involved in this partition
+    std::vector<int> involved_qbits;
+};
+
+struct CanonicalEntry {
+    std::vector<int> edges_u; // virtual qubit indices
+    std::vector<int> edges_v;
+    int cnot;
+};
+
+struct LayoutPartInfo {
+    bool is_single;
+    std::vector<int> involved_qbits;
+};
+
+struct SabreConfig {
+    int prefilter_top_k = 50;
+    int max_E_size = 20;
+    int max_lookahead = 4;
+    double E_weight = 0.5;
+    double E_alpha = 0.9;
+    double local_cost_weight = 0.1;
+    double swap_cost = 15.0;
+    double score_tolerance = 0.05;
+    int sabre_iterations = 1;
+    int n_layout_trials = 1;
+    int random_seed = 42;
+};
+
+struct TrialResult {
+    std::vector<int> pi;
+    int total_cost;
+};
+
+// ---------------------------------------------------------------------------
+// Swap cache key for deduplication within a single heuristic_search call
+// ---------------------------------------------------------------------------
+
+struct SwapCacheKey {
+    // Snapshot of pi[] at the involved qubit positions + target positions
+    std::vector<int> pi_snapshot;
+    std::vector<int> targets;
+
+    bool operator==(const SwapCacheKey& o) const {
+        return pi_snapshot == o.pi_snapshot && targets == o.targets;
+    }
+};
+
+struct SwapCacheKeyHash {
+    size_t operator()(const SwapCacheKey& k) const {
+        size_t h = 0;
+        for (int v : k.pi_snapshot) h = h * 31 + static_cast<size_t>(v);
+        for (int v : k.targets) h = h * 31 + static_cast<size_t>(v);
+        return h;
+    }
+};
+
+// ---------------------------------------------------------------------------
+// A* state packing helpers
+// ---------------------------------------------------------------------------
+
+// For k <= 4 partition qubits on N <= 64 physical qubits, pack state into int64_t
+// State = sum(positions[i] * N^i), fits in 64 bits when N <= 64 and k <= 4
+inline int64_t pack_state(const std::vector<int>& positions, int N) {
+    int64_t s = 0;
+    int64_t stride = 1;
+    for (size_t i = 0; i < positions.size(); i++) {
+        s += static_cast<int64_t>(positions[i]) * stride;
+        stride *= N;
+    }
+    return s;
+}
+
+inline std::vector<int> unpack_state(int64_t packed, int k, int N) {
+    std::vector<int> positions(k);
+    for (int i = 0; i < k; i++) {
+        positions[i] = static_cast<int>(packed % N);
+        packed /= N;
+    }
+    return positions;
+}
+
+// ---------------------------------------------------------------------------
+// SabreRouter class
+// ---------------------------------------------------------------------------
+
+class SabreRouter {
+public:
+    SabreRouter(
+        const SabreConfig& config,
+        int N,
+        const std::vector<double>& D,
+        const std::vector<std::vector<int>>& adj,
+        const std::vector<std::vector<int>>& DAG,
+        const std::vector<std::vector<int>>& IDAG,
+        const std::vector<std::vector<CandidateData>>& candidate_cache,
+        const std::vector<LayoutPartInfo>& layout_partitions,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data_fwd,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data_rev
+    );
+
+    // Thread-safe: all mutable state is stack-local
+    TrialResult run_trial(
+        int trial_idx,
+        const std::vector<int>& seeded_pi,
+        int n_iterations,
+        int n_trials
+    ) const;
+
+private:
+    // Distance lookup (flat row-major)
+    inline double dist(int phys_u, int phys_v) const {
+        return D_[phys_u * N_ + phys_v];
+    }
+
+    // Heuristic search (port of _heuristic_search_layout_only)
+    // children_graph/parents_graph: swapped for backward passes
+    std::pair<std::vector<int>, int> heuristic_search(
+        const std::vector<int>& F_init,
+        std::vector<int> pi,
+        bool reverse,
+        std::mt19937* rng,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph
+    ) const;
+
+    // A* constrained swap search (port of find_constrained_swaps_partial)
+    std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+    find_constrained_swaps(
+        const std::vector<int>& pi,
+        const std::vector<int>& qbit_map_keys,
+        const std::vector<int>& qbit_map_vals,
+        const std::vector<int>& node_mapping_flat,
+        const std::vector<int>& P_route_inv,
+        std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+    ) const;
+
+    // Lower-bound swap estimate (port of estimate_swap_count)
+    int estimate_swap_count(
+        const CandidateData& cand,
+        const std::vector<int>& pi,
+        bool reverse
+    ) const;
+
+    // BFS lookahead (port of generate_extended_set)
+    std::vector<std::pair<int,int>> generate_extended_set(
+        const std::vector<int>& F,
+        const std::vector<uint8_t>& resolved,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph
+    ) const;
+
+    // LightSABRE scoring (port of score_partition_candidate)
+    double score_candidate(
+        const CandidateData& cand,
+        const std::vector<int>& F_snapshot,
+        const std::vector<int>& pi,
+        const std::vector<std::pair<int,int>>& E,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data,
+        std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+    ) const;
+
+    // Route and update layout for a candidate (port of transform_pi)
+    std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+    transform_pi(
+        const CandidateData& cand,
+        const std::vector<int>& pi,
+        bool reverse,
+        std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+    ) const;
+
+    // Release valve for stuck front layers
+    std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+    release_valve(
+        const std::vector<int>& F,
+        const std::vector<int>& pi,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    // BFS shortest path on adjacency graph
+    std::vector<int> bfs_shortest_path(int src, int dst) const;
+
+    // Apply a list of SWAPs to pi
+    std::vector<int> apply_swaps_to_pi(
+        const std::vector<int>& pi,
+        const std::vector<std::pair<int,int>>& swaps
+    ) const;
+
+    // Get initial layer (partitions with no unresolved parents)
+    std::vector<int> get_initial_layer() const;
+
+    // Get final layer (partitions with no children)
+    std::vector<int> get_final_layer() const;
+
+    // Prefilter candidates by cheap swap estimate
+    std::vector<const CandidateData*> prefilter_candidates(
+        const std::vector<const CandidateData*>& candidates,
+        const std::vector<int>& pi,
+        int top_k,
+        bool reverse
+    ) const;
+
+    // Select best candidate with optional stochastic tie-breaking
+    const CandidateData& select_best_candidate(
+        const std::vector<const CandidateData*>& candidates,
+        const std::vector<double>& scores,
+        std::mt19937* rng
+    ) const;
+
+    // Check if partition is single-qubit
+    inline bool partition_is_single(int partition_idx) const {
+        return layout_partitions_[partition_idx].is_single;
+    }
+
+    // Gather all candidates for partitions in F
+    std::vector<const CandidateData*> obtain_partition_candidates(
+        const std::vector<int>& F
+    ) const;
+
+    // Random permutation of [0..N-1]
+    std::vector<int> random_permutation(int n, std::mt19937& rng) const;
+
+    // Build P_route_inv: the inverse permutation used for routing
+    std::vector<int> build_route_inv(const std::vector<int>& P, bool reverse) const;
+
+    // Build target dict for A*: {qbit_map_key -> node_mapping[P_route_inv[qbit_map_val]]}
+    void build_target_positions(
+        const CandidateData& cand,
+        bool reverse,
+        std::vector<int>& out_keys,
+        std::vector<int>& out_targets
+    ) const;
+
+    // Compute routing cost for canonical edges under a given pi
+    double compute_routing_cost(
+        const std::vector<int>& pi,
+        int exclude_partition_idx,
+        const std::vector<int>& partition_indices,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    // Compute lookahead cost with alpha^depth decay
+    double compute_lookahead_cost(
+        const std::vector<int>& pi,
+        int exclude_partition_idx,
+        const std::vector<std::pair<int,int>>& E,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    // Immutable data members
+    SabreConfig config_;
+    int N_; // number of physical qubits
+    int num_partitions_;
+    std::vector<double> D_; // flat N*N distance matrix (owned copy)
+    std::vector<std::vector<int>> adj_;
+    std::vector<std::vector<int>> DAG_;
+    std::vector<std::vector<int>> IDAG_;
+    std::vector<std::vector<CandidateData>> candidate_cache_;
+    std::vector<LayoutPartInfo> layout_partitions_;
+    std::unordered_map<int, CanonicalEntry> canonical_data_fwd_;
+    std::unordered_map<int, CanonicalEntry> canonical_data_rev_;
+};
+
+} // namespace squander::routing
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
new file mode 100644
index 000000000..ba9ffa389
--- /dev/null
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -0,0 +1,909 @@
+/*
+Copyright 2025 SQUANDER Contributors
+
+C++ backend for the SABRE-style partition-aware routing engine.
+*/
+
+#include "sabre_router.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <deque>
+#include <numeric>
+#include <queue>
+#include <random>
+#include <stdexcept>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+namespace squander::routing {
+
+// ---------------------------------------------------------------------------
+// Constructor
+// ---------------------------------------------------------------------------
+
+SabreRouter::SabreRouter(
+    const SabreConfig& config,
+    int N,
+    const std::vector<double>& D,
+    const std::vector<std::vector<int>>& adj,
+    const std::vector<std::vector<int>>& DAG,
+    const std::vector<std::vector<int>>& IDAG,
+    const std::vector<std::vector<CandidateData>>& candidate_cache,
+    const std::vector<LayoutPartInfo>& layout_partitions,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data_fwd,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data_rev
+)
+    : config_(config)
+    , N_(N)
+    , num_partitions_(static_cast<int>(DAG.size()))
+    , D_(D)
+    , adj_(adj)
+    , DAG_(DAG)
+    , IDAG_(IDAG)
+    , candidate_cache_(candidate_cache)
+    , layout_partitions_(layout_partitions)
+    , canonical_data_fwd_(canonical_data_fwd)
+    , canonical_data_rev_(canonical_data_rev)
+{
+    if (static_cast<int>(D_.size()) != N_ * N_) {
+        throw std::invalid_argument("Distance matrix D must be N x N");
+    }
+}
+
+// ---------------------------------------------------------------------------
+// run_trial (stub for Phase A)
+// ---------------------------------------------------------------------------
+
+// run_trial implemented below (after all private methods)
+
+// ---------------------------------------------------------------------------
+// Helper: random permutation
+// ---------------------------------------------------------------------------
+
+std::vector<int> SabreRouter::random_permutation(int n, std::mt19937& rng) const {
+    std::vector<int> perm(n);
+    std::iota(perm.begin(), perm.end(), 0);
+    std::shuffle(perm.begin(), perm.end(), rng);
+    return perm;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: build P_route_inv
+// ---------------------------------------------------------------------------
+
+std::vector<int> SabreRouter::build_route_inv(const std::vector<int>& P, bool /*reverse*/) const {
+    // P_route_inv[i] = index of i in P (inverse permutation)
+    int k = static_cast<int>(P.size());
+    std::vector<int> inv(k);
+    for (int i = 0; i < k; i++) {
+        inv[P[i]] = i;
+    }
+    return inv;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: build target positions for A*
+// ---------------------------------------------------------------------------
+
+void SabreRouter::build_target_positions(
+    const CandidateData& cand,
+    bool reverse,
+    std::vector<int>& out_keys,
+    std::vector<int>& out_targets
+) const {
+    const std::vector<int>& P_route = reverse ? cand.P_o : cand.P_i;
+    std::vector<int> P_route_inv(P_route.size());
+    for (size_t i = 0; i < P_route.size(); i++) {
+        P_route_inv[P_route[i]] = static_cast<int>(i);
+    }
+
+    out_keys = cand.qbit_map_keys;
+    out_targets.resize(cand.qbit_map_keys.size());
+    for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) {
+        int v = cand.qbit_map_vals[i];
+        out_targets[i] = cand.node_mapping_flat[P_route_inv[v]];
+    }
+}
+
+// ---------------------------------------------------------------------------
+// BFS shortest path
+// ---------------------------------------------------------------------------
+
+std::vector<int> SabreRouter::bfs_shortest_path(int src, int dst) const {
+    if (src == dst) return {src};
+
+    std::vector<int> parent(N_, -1);
+    std::vector<uint8_t> visited(N_, 0);
+    std::deque<int> queue;
+    queue.push_back(src);
+    visited[src] = 1;
+
+    while (!queue.empty()) {
+        int node = queue.front();
+        queue.pop_front();
+        for (int nb : adj_[node]) {
+            if (!visited[nb]) {
+                visited[nb] = 1;
+                parent[nb] = node;
+                if (nb == dst) {
+                    // Reconstruct path
+                    std::vector<int> path;
+                    int cur = dst;
+                    while (cur != src) {
+                        path.push_back(cur);
+                        cur = parent[cur];
+                    }
+                    path.push_back(src);
+                    std::reverse(path.begin(), path.end());
+                    return path;
+                }
+                queue.push_back(nb);
+            }
+        }
+    }
+    return {}; // unreachable
+}
+
+// ---------------------------------------------------------------------------
+// apply_swaps_to_pi
+// ---------------------------------------------------------------------------
+
+std::vector<int> SabreRouter::apply_swaps_to_pi(
+    const std::vector<int>& pi,
+    const std::vector<std::pair<int,int>>& swaps
+) const {
+    std::vector<int> result(pi);
+    std::vector<int> p2v(N_);
+    for (int q = 0; q < N_; q++) p2v[result[q]] = q;
+
+    for (auto [P1, P2] : swaps) {
+        int q1 = p2v[P1];
+        int q2 = p2v[P2];
+        p2v[P1] = q2;
+        p2v[P2] = q1;
+        result[q1] = P2;
+        result[q2] = P1;
+    }
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// get_initial_layer / get_final_layer
+// ---------------------------------------------------------------------------
+
+std::vector<int> SabreRouter::get_initial_layer() const {
+    std::vector<int> layer;
+    std::vector<uint8_t> covered(N_, 0);
+    int uncovered = N_;
+    for (int p = 0; p < num_partitions_ && uncovered > 0; p++) {
+        if (IDAG_[p].empty()) {
+            layer.push_back(p);
+            for (int q : layout_partitions_[p].involved_qbits) {
+                if (q < N_ && !covered[q]) {
+                    covered[q] = 1;
+                    uncovered--;
+                }
+            }
+        }
+    }
+    return layer;
+}
+
+std::vector<int> SabreRouter::get_final_layer() const {
+    std::vector<int> layer;
+    std::vector<uint8_t> covered(N_, 0);
+    int uncovered = N_;
+    for (int p = num_partitions_ - 1; p >= 0 && uncovered > 0; p--) {
+        if (DAG_[p].empty()) {
+            layer.push_back(p);
+            for (int q : layout_partitions_[p].involved_qbits) {
+                if (q < N_ && !covered[q]) {
+                    covered[q] = 1;
+                    uncovered--;
+                }
+            }
+        }
+    }
+    return layer;
+}
+
+// ---------------------------------------------------------------------------
+// estimate_swap_count
+// ---------------------------------------------------------------------------
+
+int SabreRouter::estimate_swap_count(
+    const CandidateData& cand,
+    const std::vector<int>& pi,
+    bool reverse
+) const {
+    const std::vector<int>& P_route = reverse ? cand.P_o : cand.P_i;
+    std::vector<int> P_route_inv(P_route.size());
+    for (size_t i = 0; i < P_route.size(); i++) {
+        P_route_inv[P_route[i]] = static_cast<int>(i);
+    }
+
+    double total = 0.0;
+    for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) {
+        int k = cand.qbit_map_keys[i];
+        int v = cand.qbit_map_vals[i];
+        int target_P = cand.node_mapping_flat[P_route_inv[v]];
+        int current_P = pi[k];
+        double d = dist(current_P, target_P);
+        if (d < std::numeric_limits<double>::infinity()) {
+            total += d;
+        }
+    }
+    return static_cast<int>(total / 2.0);
+}
+
+// ---------------------------------------------------------------------------
+// find_constrained_swaps (A* over k-dimensional state space)
+// Port of find_constrained_swaps_partial from PartAM_utils.py
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+SabreRouter::find_constrained_swaps(
+    const std::vector<int>& pi,
+    const std::vector<int>& qbit_map_keys,
+    const std::vector<int>& qbit_map_vals,
+    const std::vector<int>& node_mapping_flat,
+    const std::vector<int>& P_route_inv,
+    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+) const {
+    // Build target dict: {q -> target_physical}
+    int k = static_cast<int>(qbit_map_keys.size());
+    std::vector<int> partition_qubits(k);
+    std::vector<int> target_positions(k);
+    std::vector<int> initial_positions(k);
+
+    for (int i = 0; i < k; i++) {
+        int q = qbit_map_keys[i];
+        int v = qbit_map_vals[i];
+        partition_qubits[i] = q;
+        target_positions[i] = node_mapping_flat[P_route_inv[v]];
+        initial_positions[i] = pi[q];
+    }
+
+    // Check if already at target
+    bool already_there = true;
+    for (int i = 0; i < k; i++) {
+        if (initial_positions[i] != target_positions[i]) {
+            already_there = false;
+            break;
+        }
+    }
+    if (already_there) {
+        return {{}, pi};
+    }
+
+    // Check swap cache
+    if (swap_cache) {
+        SwapCacheKey key;
+        key.pi_snapshot.resize(k);
+        key.targets.resize(k);
+        for (int i = 0; i < k; i++) {
+            key.pi_snapshot[i] = initial_positions[i];
+            key.targets[i] = target_positions[i];
+        }
+        auto it = swap_cache->find(key);
+        if (it != swap_cache->end()) {
+            // Replay cached swaps on current pi
+            auto result_pi = apply_swaps_to_pi(pi, it->second.first);
+            return {it->second.first, result_pi};
+        }
+    }
+
+    // A* search over k-dimensional state space
+    // State: vector of physical positions for each partition qubit
+    // Heuristic: sum(D[pos_i][target_i]) / 2
+
+    int64_t initial_packed = pack_state(initial_positions, N_);
+    int64_t target_packed = pack_state(target_positions, N_);
+
+    // Compute initial heuristic
+    double h0 = 0.0;
+    for (int i = 0; i < k; i++) {
+        h0 += dist(initial_positions[i], target_positions[i]);
+    }
+    h0 /= 2.0;
+
+    // Priority queue: (f_score, g_score, counter, packed_state)
+    // Using greater for min-heap behavior
+    using PQEntry = std::tuple<double, int, int64_t>;
+    std::priority_queue<PQEntry, std::vector<PQEntry>, std::greater<PQEntry>> pq;
+
+    // Visited: packed_state -> best g_score
+    std::unordered_map<int64_t, int> visited;
+    // Parent: packed_state -> (parent_packed_state, swap)
+    std::unordered_map<int64_t, std::pair<int64_t, std::pair<int,int>>> parent;
+
+    pq.push({h0, 0, initial_packed});
+    visited[initial_packed] = 0;
+    parent[initial_packed] = {-1, {-1, -1}};
+
+    while (!pq.empty()) {
+        auto [f, g, packed] = pq.top();
+        pq.pop();
+
+        if (packed == target_packed) {
+            // Reconstruct swap path
+            std::vector<std::pair<int,int>> path;
+            int64_t cur = packed;
+            while (parent[cur].first != -1) {
+                path.push_back(parent[cur].second);
+                cur = parent[cur].first;
+            }
+            std::reverse(path.begin(), path.end());
+
+            // Replay swaps on full pi
+            auto result_pi = apply_swaps_to_pi(pi, path);
+
+            // Store in cache
+            if (swap_cache) {
+                SwapCacheKey key;
+                key.pi_snapshot.resize(k);
+                key.targets.resize(k);
+                for (int i = 0; i < k; i++) {
+                    key.pi_snapshot[i] = initial_positions[i];
+                    key.targets[i] = target_positions[i];
+                }
+                (*swap_cache)[key] = {path, result_pi};
+            }
+
+            return {path, result_pi};
+        }
+
+        // Skip if we've found a better path to this state
+        auto vis_it = visited.find(packed);
+        if (vis_it != visited.end() && vis_it->second < g) {
+            continue;
+        }
+
+        auto positions = unpack_state(packed, k, N_);
+
+        // pos_to_k_idx: physical position -> index in partition_qubits
+        std::unordered_map<int, int> pos_to_k_idx;
+        for (int i = 0; i < k; i++) {
+            pos_to_k_idx[positions[i]] = i;
+        }
+
+        // Try every SWAP that moves at least one partition qubit
+        for (int i = 0; i < k; i++) {
+            int p = positions[i];
+            for (int nb : adj_[p]) {
+                auto new_positions = positions;
+                new_positions[i] = nb;
+                // If neighbor also holds a partition qubit, swap it
+                auto it = pos_to_k_idx.find(nb);
+                if (it != pos_to_k_idx.end()) {
+                    new_positions[it->second] = p;
+                }
+
+                int64_t new_packed = pack_state(new_positions, N_);
+                int new_g = g + 1;
+
+                auto new_vis = visited.find(new_packed);
+                if (new_vis != visited.end() && new_vis->second <= new_g) {
+                    continue;
+                }
+
+                // Compute heuristic
+                double h = 0.0;
+                for (int j = 0; j < k; j++) {
+                    h += dist(new_positions[j], target_positions[j]);
+                }
+                h /= 2.0;
+
+                visited[new_packed] = new_g;
+                parent[new_packed] = {packed, {std::min(p, nb), std::max(p, nb)}};
+                pq.push({new_g + h, new_g, new_packed});
+            }
+        }
+    }
+
+    // Failed to route (should not happen on a connected graph)
+    return {{}, pi};
+}
+
+// ---------------------------------------------------------------------------
+// transform_pi
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+SabreRouter::transform_pi(
+    const CandidateData& cand,
+    const std::vector<int>& pi,
+    bool reverse,
+    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+) const {
+    // Build P_route_inv
+    const std::vector<int>& P_route = reverse ? cand.P_o : cand.P_i;
+    std::vector<int> P_route_inv(P_route.size());
+    for (size_t i = 0; i < P_route.size(); i++) {
+        P_route_inv[P_route[i]] = static_cast<int>(i);
+    }
+
+    // Route qubits to input positions
+    auto [swaps, pi_routed] = find_constrained_swaps(
+        pi,
+        cand.qbit_map_keys,
+        cand.qbit_map_vals,
+        cand.node_mapping_flat,
+        P_route_inv,
+        swap_cache
+    );
+
+    // Update output positions using P_exit
+    const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
+    std::vector<int> pi_output = pi_routed;
+
+    // Build inverse qbit_map: q* -> q
+    std::unordered_map<int, int> qbit_map_inv;
+    for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) {
+        qbit_map_inv[cand.qbit_map_vals[i]] = cand.qbit_map_keys[i];
+    }
+
+    for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
+        auto it = qbit_map_inv.find(static_cast<int>(q_star));
+        if (it != qbit_map_inv.end()) {
+            int k = it->second;
+            pi_output[k] = cand.node_mapping_flat[P_exit[q_star]];
+        }
+    }
+
+    return {swaps, pi_output};
+}
+
+// ---------------------------------------------------------------------------
+// generate_extended_set (BFS lookahead)
+// ---------------------------------------------------------------------------
+
+std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
+    const std::vector<int>& F,
+    const std::vector<uint8_t>& resolved,
+    const std::vector<std::vector<int>>& children_graph,
+    const std::vector<std::vector<int>>& parents_graph
+) const {
+    std::vector<std::pair<int,int>> E;
+    std::vector<uint8_t> in_E(num_partitions_, 0);
+    std::vector<uint8_t> in_F(num_partitions_, 0);
+    for (int p : F) in_F[p] = 1;
+
+    struct BFSNode {
+        int partition;
+        int depth;
+    };
+    std::deque<BFSNode> queue;
+
+    // Seed with children of F partitions
+    for (int p : F) {
+        for (int child : children_graph[p]) {
+            if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                queue.push_back({child, 1});
+            }
+        }
+    }
+
+    while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
+        auto [part, depth] = queue.front();
+        queue.pop_front();
+
+        if (depth > config_.max_lookahead) continue;
+        if (in_E[part] || in_F[part] || resolved[part]) continue;
+
+        // Check all parents resolved or in F
+        bool parents_ok = true;
+        for (int par : parents_graph[part]) {
+            if (!resolved[par] && !in_F[par]) {
+                parents_ok = false;
+                break;
+            }
+        }
+        if (!parents_ok) continue;
+
+        if (partition_is_single(part)) {
+            // Single-qubit: skip, add children
+            for (int child : children_graph[part]) {
+                if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                    queue.push_back({child, depth + 1});
+                }
+            }
+            continue;
+        }
+
+        E.push_back({part, depth});
+        in_E[part] = 1;
+
+        if (depth < config_.max_lookahead) {
+            for (int child : children_graph[part]) {
+                if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                    queue.push_back({child, depth + 1});
+                }
+            }
+        }
+    }
+
+    return E;
+}
+
+// ---------------------------------------------------------------------------
+// Routing cost helpers
+// ---------------------------------------------------------------------------
+
+double SabreRouter::compute_routing_cost(
+    const std::vector<int>& pi,
+    int exclude_partition_idx,
+    const std::vector<int>& partition_indices,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    double total = 0.0;
+    for (int p_idx : partition_indices) {
+        if (p_idx == exclude_partition_idx) continue;
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) continue;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            int u = entry.edges_u[i];
+            int v = entry.edges_v[i];
+            double d = dist(pi[u], pi[v]);
+            double cost = d - 1.0;
+            if (cost > 0.0) total += config_.swap_cost * cost;
+        }
+    }
+    return total;
+}
+
+double SabreRouter::compute_lookahead_cost(
+    const std::vector<int>& pi,
+    int exclude_partition_idx,
+    const std::vector<std::pair<int,int>>& E,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    if (E.empty()) return 0.0;
+    double total = 0.0;
+    for (auto [p_idx, depth] : E) {
+        if (p_idx == exclude_partition_idx) continue;
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) continue;
+        double d_cost = 0.0;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            int u = entry.edges_u[i];
+            int v = entry.edges_v[i];
+            double d = dist(pi[u], pi[v]);
+            double cost = d - 1.0;
+            if (cost > 0.0) d_cost += config_.swap_cost * cost;
+        }
+        total += std::pow(config_.E_alpha, depth) * d_cost;
+    }
+    return config_.E_weight * total / static_cast<double>(E.size());
+}
+
+// ---------------------------------------------------------------------------
+// score_candidate (LightSABRE scoring)
+// ---------------------------------------------------------------------------
+
+double SabreRouter::score_candidate(
+    const CandidateData& cand,
+    const std::vector<int>& F_snapshot,
+    const std::vector<int>& pi,
+    const std::vector<std::pair<int,int>>& E,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data,
+    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+) const {
+    auto [swaps, output_perm] = transform_pi(cand, pi, reverse, swap_cache);
+
+    double score = config_.swap_cost * static_cast<double>(swaps.size());
+    score += config_.local_cost_weight * static_cast<double>(cand.cnot_count);
+
+    // F cost: average routing cost over F \ {cand}
+    int cand_idx = cand.partition_idx;
+    int n_other = 0;
+    double f_sum = 0.0;
+    for (int p_idx : F_snapshot) {
+        if (p_idx == cand_idx) continue;
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        n_other++;
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) continue;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            int u = entry.edges_u[i];
+            int v = entry.edges_v[i];
+            double d = dist(output_perm[u], output_perm[v]);
+            double cost = d - 1.0;
+            if (cost > 0.0) f_sum += config_.swap_cost * cost;
+        }
+    }
+    if (n_other > 0) score += f_sum / static_cast<double>(n_other);
+
+    // E cost: alpha^depth-decayed lookahead
+    if (!E.empty()) {
+        double e_sum = 0.0;
+        for (auto [p_idx, depth] : E) {
+            if (p_idx == cand_idx) continue;
+            auto it = canonical_data.find(p_idx);
+            if (it == canonical_data.end()) continue;
+            const auto& entry = it->second;
+            if (entry.edges_u.empty()) continue;
+            double d_cost = 0.0;
+            for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                int u = entry.edges_u[i];
+                int v = entry.edges_v[i];
+                double d = dist(output_perm[u], output_perm[v]);
+                double cost = d - 1.0;
+                if (cost > 0.0) d_cost += config_.swap_cost * cost;
+            }
+            e_sum += std::pow(config_.E_alpha, depth) * d_cost;
+        }
+        score += config_.E_weight * e_sum / static_cast<double>(E.size());
+    }
+
+    return score;
+}
+
+// ---------------------------------------------------------------------------
+// obtain_partition_candidates
+// ---------------------------------------------------------------------------
+
+std::vector<const CandidateData*> SabreRouter::obtain_partition_candidates(
+    const std::vector<int>& F
+) const {
+    std::vector<const CandidateData*> result;
+    for (int p_idx : F) {
+        if (p_idx < 0 || p_idx >= num_partitions_) continue;
+        for (const auto& cand : candidate_cache_[p_idx]) {
+            result.push_back(&cand);
+        }
+    }
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// prefilter_candidates
+// ---------------------------------------------------------------------------
+
+std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
+    const std::vector<const CandidateData*>& candidates,
+    const std::vector<int>& pi,
+    int top_k,
+    bool reverse
+) const {
+    if (static_cast<int>(candidates.size()) <= top_k) return candidates;
+
+    using Pair = std::pair<int, const CandidateData*>;
+    std::vector<Pair> estimated;
+    estimated.reserve(candidates.size());
+    for (const auto* cand : candidates) {
+        estimated.push_back({estimate_swap_count(*cand, pi, reverse), cand});
+    }
+
+    std::partial_sort(estimated.begin(),
+                      estimated.begin() + std::min(top_k, static_cast<int>(estimated.size())),
+                      estimated.end(),
+                      [](const Pair& a, const Pair& b) { return a.first < b.first; });
+
+    std::vector<const CandidateData*> result;
+    result.reserve(top_k);
+    for (int i = 0; i < top_k && i < static_cast<int>(estimated.size()); i++) {
+        result.push_back(estimated[i].second);
+    }
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// select_best_candidate
+// ---------------------------------------------------------------------------
+
+const CandidateData& SabreRouter::select_best_candidate(
+    const std::vector<const CandidateData*>& candidates,
+    const std::vector<double>& scores,
+    std::mt19937* rng
+) const {
+    // Find minimum score
+    double min_score = scores[0];
+    for (size_t i = 1; i < scores.size(); i++) {
+        if (scores[i] < min_score) min_score = scores[i];
+    }
+
+    // Collect all candidates within tolerance of minimum
+    std::vector<size_t> near_best;
+    for (size_t i = 0; i < scores.size(); i++) {
+        if (scores[i] <= min_score * (1.0 + config_.score_tolerance)) {
+            near_best.push_back(i);
+        }
+    }
+
+    // Select randomly among near-best if rng provided
+    if (rng && near_best.size() > 1) {
+        std::uniform_int_distribution<size_t> dist(0, near_best.size() - 1);
+        return *candidates[near_best[dist(*rng)]];
+    }
+    return *candidates[near_best[0]];
+}
+
+// ---------------------------------------------------------------------------
+// release_valve
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
+SabreRouter::release_valve(
+    const std::vector<int>& F,
+    const std::vector<int>& pi,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    // Find the F partition whose worst-pair distance is smallest
+    int best_d = std::numeric_limits<int>::max();
+    int best_p = -1;
+    int best_u = -1, best_v = -1;
+
+    for (int p_idx : F) {
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) continue;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            int u = entry.edges_u[i];
+            int v = entry.edges_v[i];
+            double d = dist(pi[u], pi[v]);
+            int di = static_cast<int>(d);
+            if (di > 1 && (di < best_d || (di == best_d && p_idx < best_p))) {
+                best_d = di;
+                best_p = p_idx;
+                best_u = u;
+                best_v = v;
+            }
+        }
+    }
+
+    if (best_p < 0) return {{}, pi};
+
+    auto path = bfs_shortest_path(pi[best_u], pi[best_v]);
+    if (static_cast<int>(path.size()) < 2) return {{}, pi};
+
+    int k = static_cast<int>(path.size()) - 1;
+    int m = k / 2;
+    std::vector<std::pair<int,int>> swaps;
+    for (int i = 0; i < m; i++) {
+        swaps.push_back({path[i], path[i + 1]});
+    }
+    for (int i = k; i > m; i--) {
+        swaps.push_back({path[i], path[i - 1]});
+    }
+
+    auto pi_new = apply_swaps_to_pi(pi, swaps);
+    return {swaps, pi_new};
+}
+
+// ---------------------------------------------------------------------------
+// heuristic_search (main loop)
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
+    const std::vector<int>& F_init,
+    std::vector<int> pi,
+    bool reverse,
+    std::mt19937* rng,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data,
+    const std::vector<std::vector<int>>& cg,
+    const std::vector<std::vector<int>>& pg
+) const {
+    std::vector<int> F = F_init;
+    std::vector<uint8_t> resolved(num_partitions_, 0);
+    int total_swaps = 0;
+
+    // Swap cache for this search call (thread-local, on stack)
+    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash> swap_cache;
+
+    // Note: single-qubit partitions are NOT auto-resolved upfront, matching
+    // Python behavior when layout_partitions (dicts) are passed to
+    // _heuristic_search_layout_only. Single-qubit children get resolved
+    // via the cascade after each candidate is selected.
+
+    // Main search loop
+    while (!F.empty()) {
+        auto all_candidates = obtain_partition_candidates(F);
+        if (all_candidates.empty()) break;
+
+        // Prefilter
+        auto candidates = prefilter_candidates(
+            all_candidates, pi, config_.prefilter_top_k, reverse);
+
+        // Generate extended set
+        auto E = generate_extended_set(F, resolved, cg, pg);
+
+        // Score all candidates
+        std::vector<double> scores;
+        scores.reserve(candidates.size());
+        for (const auto* cand : candidates) {
+            scores.push_back(score_candidate(
+                *cand, F, pi, E, reverse, canonical_data, &swap_cache));
+        }
+
+        // Select best
+        const auto& best = select_best_candidate(candidates, scores, rng);
+
+        // Remove from F and mark resolved
+        F.erase(std::remove(F.begin(), F.end(), best.partition_idx), F.end());
+        resolved[best.partition_idx] = 1;
+
+        // Apply transform
+        auto [swaps, pi_new] = transform_pi(best, pi, reverse, &swap_cache);
+        total_swaps += static_cast<int>(swaps.size());
+        pi = std::move(pi_new);
+
+        // Update F with newly eligible children
+        for (int child : cg[best.partition_idx]) {
+            if (!resolved[child]) {
+                bool in_F = std::find(F.begin(), F.end(), child) != F.end();
+                if (!in_F) {
+                    bool all_parents_resolved = true;
+                    for (int par : pg[child]) {
+                        if (!resolved[par]) {
+                            all_parents_resolved = false;
+                            break;
+                        }
+                    }
+                    if (all_parents_resolved) {
+                        F.push_back(child);
+                    }
+                }
+            }
+        }
+    }
+
+    return {pi, total_swaps};
+}
+
+// ---------------------------------------------------------------------------
+// run_trial (full implementation)
+// ---------------------------------------------------------------------------
+
+TrialResult SabreRouter::run_trial(
+    int trial_idx,
+    const std::vector<int>& seeded_pi,
+    int n_iterations,
+    int n_trials
+) const {
+    // RNG setup
+    std::mt19937 rng_gen(config_.random_seed + trial_idx);
+    std::mt19937* rng = (n_trials > 1) ? &rng_gen : nullptr;
+
+    // vf2_cutoff: first 5% of trials use seeded layout
+    int vf2_cutoff = std::max(1, static_cast<int>(n_trials * 0.05));
+    std::vector<int> pi;
+    if (trial_idx < vf2_cutoff) {
+        pi = seeded_pi;
+    } else {
+        pi = random_permutation(N_, rng_gen);
+    }
+
+    // Forward-backward-forward iterations
+    for (int iteration = 0; iteration < n_iterations; iteration++) {
+        // Backward pass: swap DAG/IDAG
+        auto F_rev = get_final_layer();
+        auto [pi_bwd, _] = heuristic_search(F_rev, pi, true, rng, canonical_data_rev_, IDAG_, DAG_);
+        pi = std::move(pi_bwd);
+
+        // Forward pass (skip on last iteration)
+        if (iteration < n_iterations - 1) {
+            auto F_fwd = get_initial_layer();
+            auto [pi_fwd, __] = heuristic_search(F_fwd, pi, false, rng, canonical_data_fwd_, DAG_, IDAG_);
+            pi = std::move(pi_fwd);
+        }
+    }
+
+    // Final evaluation pass (deterministic, no RNG)
+    auto F_eval = get_initial_layer();
+    auto [pi_final, cost] = heuristic_search(F_eval, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_);
+
+    return TrialResult{std::move(pi_final), cost};
+}
+
+} // namespace squander::routing
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index d926345c3..0778b7bf2 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -816,6 +816,14 @@ def _run_layout_trials(
         n_trials,
         random_seed,
     ):
+        use_cpp = self.config.get('use_cpp_router', True)
+        if use_cpp:
+            return self._run_layout_trials_cpp(
+                seeded_pi, DAG, IDAG, layout_partitions,
+                scoring_partitions, D, candidate_cache,
+                n_iterations, n_trials, random_seed,
+            )
+
         trial_indices = list(range(max(1, n_trials)))
         use_parallel = (
             self.config.get("parallel_layout_trials", False)
@@ -865,6 +873,83 @@ def _run_layout_trials(
             initargs=(worker_state,),
         ) as pool:
             return pool.map(_run_layout_trial_worker, trial_indices)
+
+    def _run_layout_trials_cpp(
+        self,
+        seeded_pi,
+        DAG,
+        IDAG,
+        layout_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        n_iterations,
+        n_trials,
+        random_seed,
+    ):
+        from squander.synthesis._sabre_router import SabreRouter, SabreConfig
+
+        cfg = SabreConfig()
+        cfg.prefilter_top_k = self.config.get('prefilter_top_k', 50)
+        cfg.max_E_size = self.config.get('max_E_size', 20)
+        cfg.max_lookahead = self.config.get('max_lookahead', 4)
+        cfg.E_weight = self.config.get('E_weight', 0.5)
+        cfg.E_alpha = self.config.get('E_alpha', 0.9)
+        cfg.local_cost_weight = self.config.get('local_cost_weight', 0.1)
+        cfg.swap_cost = self.config.get('swap_cost', 15.0)
+        cfg.score_tolerance = self.config.get('score_tolerance', 0.05)
+        cfg.sabre_iterations = n_iterations
+        cfg.n_layout_trials = max(1, n_trials)
+        cfg.random_seed = random_seed
+
+        canonical_fwd = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=False
+        )
+        canonical_rev = self._build_canonical_neighbor_data(
+            scoring_partitions, reverse=True
+        )
+
+        # Convert candidate_cache: list of tuples -> list of lists
+        candidate_cache_lists = [list(cands) for cands in candidate_cache]
+
+        # Convert layout_partitions: list of dicts with tuple involved_qbits
+        layout_partitions_lists = [
+            {'is_single': lp['is_single'], 'involved_qbits': list(lp['involved_qbits'])}
+            for lp in layout_partitions
+        ]
+
+        router = SabreRouter(
+            cfg, D, self._adj, DAG, IDAG,
+            candidate_cache_lists, layout_partitions_lists,
+            canonical_fwd, canonical_rev,
+        )
+
+        seeded_pi_list = [int(x) for x in seeded_pi]
+        n_trials_actual = max(1, n_trials)
+        trial_indices = list(range(n_trials_actual))
+
+        use_parallel = (
+            self.config.get("parallel_layout_trials", False)
+            and n_trials_actual > 1
+        )
+
+        if not use_parallel:
+            return [
+                router.run_trial(idx, seeded_pi_list, n_iterations, n_trials_actual)
+                for idx in trial_indices
+            ]
+
+        from concurrent.futures import ThreadPoolExecutor
+        workers = self.config.get("layout_trial_workers", 0)
+        if workers <= 0:
+            workers = min(n_trials_actual, mp.cpu_count())
+
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = [
+                pool.submit(router.run_trial, idx, seeded_pi_list, n_iterations, n_trials_actual)
+                for idx in trial_indices
+            ]
+            return [f.result() for f in futures]
         
     def Partition_Aware_Mapping(
         self, circ: Circuit, orig_parameters: np.ndarray
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
new file mode 100644
index 000000000..9b3e21413
--- /dev/null
+++ b/squander/synthesis/bindings.cpp
@@ -0,0 +1,203 @@
+/*
+Copyright 2025 SQUANDER Contributors
+
+pybind11 bindings for the SABRE routing engine.
+*/
+
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+
+#include "sabre_router.hpp"
+
+namespace py = pybind11;
+using namespace squander::routing;
+
+// ---------------------------------------------------------------------------
+// Helper: extract fields from a Python PartitionCandidate object into CandidateData
+// ---------------------------------------------------------------------------
+
+static CandidateData extract_candidate(py::handle pc) {
+    CandidateData cd;
+    cd.partition_idx = pc.attr("partition_idx").cast<int>();
+    cd.topology_idx = pc.attr("topology_idx").cast<int>();
+    cd.permutation_idx = pc.attr("permutation_idx").cast<int>();
+    cd.cnot_count = pc.attr("cnot_count").cast<int>();
+
+    // P_i, P_o: tuples of ints
+    cd.P_i = pc.attr("P_i").cast<std::vector<int>>();
+    cd.P_o = pc.attr("P_o").cast<std::vector<int>>();
+
+    // node_mapping: dict {Q* -> Q} -> flatten to dense array
+    py::dict nm = pc.attr("node_mapping");
+    int max_qstar = -1;
+    for (auto [key, val] : nm) {
+        int qs = key.cast<int>();
+        if (qs > max_qstar) max_qstar = qs;
+    }
+    cd.node_mapping_flat.resize(max_qstar + 1, -1);
+    for (auto [key, val] : nm) {
+        cd.node_mapping_flat[key.cast<int>()] = val.cast<int>();
+    }
+
+    // qbit_map: dict {q -> q*}
+    py::dict qm = pc.attr("qbit_map");
+    cd.qbit_map_keys.reserve(py::len(qm));
+    cd.qbit_map_vals.reserve(py::len(qm));
+    for (auto [key, val] : qm) {
+        cd.qbit_map_keys.push_back(key.cast<int>());
+        cd.qbit_map_vals.push_back(val.cast<int>());
+    }
+
+    // involved_qbits: tuple of ints
+    cd.involved_qbits = pc.attr("involved_qbits").cast<std::vector<int>>();
+
+    return cd;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: extract canonical_data dict -> unordered_map
+// ---------------------------------------------------------------------------
+
+static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict cd) {
+    std::unordered_map<int, CanonicalEntry> result;
+    for (auto [key, val] : cd) {
+        int pidx = key.cast<int>();
+        CanonicalEntry entry;
+        // val is a dict with 'edges_u', 'edges_v', 'cnot'
+        py::dict d = py::reinterpret_borrow<py::dict>(val);
+        if (d.contains("edges_u") && !d["edges_u"].is_none()) {
+            auto buf_u = py::array_t<int, py::array::c_style>::ensure(d["edges_u"]);
+            if (buf_u) {
+                auto acc = buf_u.unchecked<1>();
+                entry.edges_u.resize(acc.shape(0));
+                for (ssize_t i = 0; i < acc.shape(0); i++) entry.edges_u[i] = acc(i);
+            }
+        }
+        if (d.contains("edges_v") && !d["edges_v"].is_none()) {
+            auto buf_v = py::array_t<int, py::array::c_style>::ensure(d["edges_v"]);
+            if (buf_v) {
+                auto acc = buf_v.unchecked<1>();
+                entry.edges_v.resize(acc.shape(0));
+                for (ssize_t i = 0; i < acc.shape(0); i++) entry.edges_v[i] = acc(i);
+            }
+        }
+        entry.cnot = d["cnot"].cast<int>();
+        result[pidx] = std::move(entry);
+    }
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// Helper: extract layout_partitions list -> vector<LayoutPartInfo>
+// ---------------------------------------------------------------------------
+
+static std::vector<LayoutPartInfo> extract_layout_partitions(py::list lp) {
+    std::vector<LayoutPartInfo> result;
+    result.reserve(py::len(lp));
+    for (auto item : lp) {
+        py::dict d = py::reinterpret_borrow<py::dict>(item);
+        LayoutPartInfo info;
+        info.is_single = d["is_single"].cast<bool>();
+        info.involved_qbits = d["involved_qbits"].cast<std::vector<int>>();
+        result.push_back(std::move(info));
+    }
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// Module definition
+// ---------------------------------------------------------------------------
+
+PYBIND11_MODULE(_sabre_router, m) {
+    m.doc() = "SQUANDER SABRE Routing Engine - C++ Backend";
+
+    // Bind SabreConfig
+    py::class_<SabreConfig>(m, "SabreConfig")
+        .def(py::init<>())
+        .def_readwrite("prefilter_top_k", &SabreConfig::prefilter_top_k)
+        .def_readwrite("max_E_size", &SabreConfig::max_E_size)
+        .def_readwrite("max_lookahead", &SabreConfig::max_lookahead)
+        .def_readwrite("E_weight", &SabreConfig::E_weight)
+        .def_readwrite("E_alpha", &SabreConfig::E_alpha)
+        .def_readwrite("local_cost_weight", &SabreConfig::local_cost_weight)
+        .def_readwrite("swap_cost", &SabreConfig::swap_cost)
+        .def_readwrite("score_tolerance", &SabreConfig::score_tolerance)
+        .def_readwrite("sabre_iterations", &SabreConfig::sabre_iterations)
+        .def_readwrite("n_layout_trials", &SabreConfig::n_layout_trials)
+        .def_readwrite("random_seed", &SabreConfig::random_seed);
+
+    // Bind SabreRouter with data-converting constructor
+    py::class_<SabreRouter>(m, "SabreRouter")
+        .def(py::init(
+            [](const SabreConfig& config,
+               py::array_t<double, py::array::c_style> D_arr,
+               const std::vector<std::vector<int>>& adj,
+               const std::vector<std::vector<int>>& DAG,
+               const std::vector<std::vector<int>>& IDAG,
+               py::list candidate_cache_py,
+               py::list layout_partitions_py,
+               py::dict canonical_data_fwd_py,
+               py::dict canonical_data_rev_py
+            ) {
+                // Extract D matrix
+                auto buf = D_arr.request();
+                if (buf.ndim != 2 || buf.shape[0] != buf.shape[1]) {
+                    throw std::invalid_argument("D must be a square 2D array");
+                }
+                int N = static_cast<int>(buf.shape[0]);
+                std::vector<double> D_flat(N * N);
+                auto* ptr = static_cast<const double*>(buf.ptr);
+                std::copy(ptr, ptr + N * N, D_flat.begin());
+
+                // Convert candidate_cache: list of lists of PartitionCandidate
+                std::vector<std::vector<CandidateData>> cc;
+                cc.reserve(py::len(candidate_cache_py));
+                for (auto part_cands : candidate_cache_py) {
+                    std::vector<CandidateData> cands;
+                    py::list cl = py::reinterpret_borrow<py::list>(part_cands);
+                    cands.reserve(py::len(cl));
+                    for (auto c : cl) {
+                        cands.push_back(extract_candidate(c));
+                    }
+                    cc.push_back(std::move(cands));
+                }
+
+                auto lp = extract_layout_partitions(layout_partitions_py);
+                auto cd_fwd = extract_canonical_data(canonical_data_fwd_py);
+                auto cd_rev = extract_canonical_data(canonical_data_rev_py);
+
+                return new SabreRouter(
+                    config, N, D_flat, adj, DAG, IDAG,
+                    cc, lp, cd_fwd, cd_rev
+                );
+            }),
+            py::arg("config"),
+            py::arg("D"),
+            py::arg("adj"),
+            py::arg("DAG"),
+            py::arg("IDAG"),
+            py::arg("candidate_cache"),
+            py::arg("layout_partitions"),
+            py::arg("canonical_data_fwd"),
+            py::arg("canonical_data_rev")
+        )
+        .def("run_trial",
+            [](const SabreRouter& self,
+               int trial_idx,
+               const std::vector<int>& seeded_pi,
+               int n_iterations,
+               int n_trials
+            ) -> py::tuple {
+                py::gil_scoped_release release;
+                auto result = self.run_trial(trial_idx, seeded_pi, n_iterations, n_trials);
+                py::gil_scoped_acquire acquire;
+                return py::make_tuple(result.total_cost, result.pi);
+            },
+            py::arg("trial_idx"),
+            py::arg("seeded_pi"),
+            py::arg("n_iterations"),
+            py::arg("n_trials"),
+            "Run a single layout trial (GIL-free, thread-safe)"
+        );
+}

From 0fadf3b0fcf4b05f4ef098052386e00300732a30 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 22 Apr 2026 20:42:50 +0200
Subject: [PATCH 130/232] Use os.sched_getaffinity instead of mp.cpu_count for
 pool sizing

mp.cpu_count() ignores CPU affinity masks set by taskset, cgroups, or
SLURM, causing oversubscription on constrained nodes. Use
os.sched_getaffinity(0) to respect the actual CPU set.
---
 .../qgd_Wide_Circuit_Optimization.py          |  2 +-
 squander/synthesis/PartAM.py                  | 19 ++++++++++++++++---
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 372004f2d..900e7868d 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -2190,7 +2190,7 @@ def process_result(partition_idx):
         with (
             contextlib.nullcontext()
             if in_parent
-            else Pool(processes=mp.cpu_count())
+            else Pool(processes=len(os.sched_getaffinity(0)) if hasattr(os, 'sched_getaffinity') else mp.cpu_count())
         ) as pool:
             remaining = list(range(len(subcircuits)))
             while remaining:
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 0778b7bf2..9bcfb872a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -3,6 +3,7 @@
 """
 import logging
 import multiprocessing as mp
+import os
 import time
 from collections import deque, defaultdict
 from itertools import permutations
@@ -39,6 +40,18 @@ def _decompose_one(Umtx, mini_topology):
         Umtx, _worker_config, mini_topology
     )
 
+def _available_cpus():
+    """Return the number of CPUs available to this process.
+
+    Respects affinity masks set by taskset, cgroups, SLURM, etc.
+    Falls back to mp.cpu_count() on platforms without sched_getaffinity.
+    """
+    try:
+        return len(os.sched_getaffinity(0))
+    except (AttributeError, OSError):
+        return mp.cpu_count()
+
+
 from squander.synthesis.PartAM_utils import (
     get_subtopologies_of_type,
     get_unique_subtopologies,
@@ -561,7 +574,7 @@ def _run_parallel_synthesis(self, partition_meta):
         Returns:
             results_map: Dict mapping partition_idx to PartitionSynthesisResult.
         """
-        n_cpus = mp.cpu_count()
+        n_cpus = _available_cpus()
         use_auts = self.config.get('use_automorphisms', True)
         disable_pbar = self.config.get('progressbar', 0) == False
         aut_cache = {}
@@ -850,7 +863,7 @@ def _run_layout_trials(
 
         workers = self.config.get("layout_trial_workers", 0)
         if workers <= 0:
-            workers = min(len(trial_indices), mp.cpu_count())
+            workers = min(len(trial_indices), _available_cpus())
 
         worker_state = {
             "config": dict(self.config),
@@ -942,7 +955,7 @@ def _run_layout_trials_cpp(
         from concurrent.futures import ThreadPoolExecutor
         workers = self.config.get("layout_trial_workers", 0)
         if workers <= 0:
-            workers = min(n_trials_actual, mp.cpu_count())
+            workers = min(n_trials_actual, _available_cpus())
 
         with ThreadPoolExecutor(max_workers=workers) as pool:
             futures = [

From 58a34a417825462140291f878994cdd9cf1a405a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 22 Apr 2026 21:05:32 +0200
Subject: [PATCH 131/232] Fix C++ SABRE routing to match Python behavior (3
 fixes)

1. Add single-qubit upfront resolution before main heuristic search loop
2. Add single-qubit cascade in child update after resolving a candidate
3. Match extended set BFS seeding order (per front partition, not all at once)
---
 .../src-cpp/sabre_router/sabre_router.cpp     | 127 +++++++++++++-----
 1 file changed, 90 insertions(+), 37 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index ba9ffa389..e54ace439 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -476,51 +476,53 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
         int partition;
         int depth;
     };
-    std::deque<BFSNode> queue;
 
-    // Seed with children of F partitions
-    for (int p : F) {
-        for (int child : children_graph[p]) {
+    // Seed per front partition, matching Python's per-partition BFS seeding
+    for (int front_idx : F) {
+        if (static_cast<int>(E.size()) >= config_.max_E_size) break;
+
+        std::deque<BFSNode> queue;
+        for (int child : children_graph[front_idx]) {
             if (!in_F[child] && !in_E[child] && !resolved[child]) {
                 queue.push_back({child, 1});
             }
         }
-    }
 
-    while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
-        auto [part, depth] = queue.front();
-        queue.pop_front();
+        while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
+            auto [part, depth] = queue.front();
+            queue.pop_front();
 
-        if (depth > config_.max_lookahead) continue;
-        if (in_E[part] || in_F[part] || resolved[part]) continue;
+            if (depth > config_.max_lookahead) continue;
+            if (in_E[part] || in_F[part] || resolved[part]) continue;
 
-        // Check all parents resolved or in F
-        bool parents_ok = true;
-        for (int par : parents_graph[part]) {
-            if (!resolved[par] && !in_F[par]) {
-                parents_ok = false;
-                break;
+            // Check all parents resolved or in F
+            bool parents_ok = true;
+            for (int par : parents_graph[part]) {
+                if (!resolved[par] && !in_F[par]) {
+                    parents_ok = false;
+                    break;
+                }
             }
-        }
-        if (!parents_ok) continue;
+            if (!parents_ok) continue;
 
-        if (partition_is_single(part)) {
-            // Single-qubit: skip, add children
-            for (int child : children_graph[part]) {
-                if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                    queue.push_back({child, depth + 1});
+            if (partition_is_single(part)) {
+                // Single-qubit: skip, add children
+                for (int child : children_graph[part]) {
+                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                        queue.push_back({child, depth});
+                    }
                 }
+                continue;
             }
-            continue;
-        }
 
-        E.push_back({part, depth});
-        in_E[part] = 1;
+            E.push_back({part, depth});
+            in_E[part] = 1;
 
-        if (depth < config_.max_lookahead) {
-            for (int child : children_graph[part]) {
-                if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                    queue.push_back({child, depth + 1});
+            if (depth < config_.max_lookahead) {
+                for (int child : children_graph[part]) {
+                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                        queue.push_back({child, depth + 1});
+                    }
                 }
             }
         }
@@ -801,10 +803,34 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     // Swap cache for this search call (thread-local, on stack)
     std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash> swap_cache;
 
-    // Note: single-qubit partitions are NOT auto-resolved upfront, matching
-    // Python behavior when layout_partitions (dicts) are passed to
-    // _heuristic_search_layout_only. Single-qubit children get resolved
-    // via the cascade after each candidate is selected.
+    // Resolve single-qubit partitions upfront (Python _heuristic_search_layout_only lines 1481-1498)
+    std::deque<int> sq_queue;
+    for (int p : F) {
+        if (partition_is_single(p)) sq_queue.push_back(p);
+    }
+    while (!sq_queue.empty()) {
+        int p_idx = sq_queue.front();
+        sq_queue.pop_front();
+        if (resolved[p_idx]) continue;
+        F.erase(std::remove(F.begin(), F.end(), p_idx), F.end());
+        resolved[p_idx] = 1;
+
+        for (int child : cg[p_idx]) {
+            if (!resolved[child] && std::find(F.begin(), F.end(), child) == F.end()) {
+                bool all_resolved = true;
+                for (int par : pg[child]) {
+                    if (!resolved[par]) { all_resolved = false; break; }
+                }
+                if (all_resolved) {
+                    if (partition_is_single(child)) {
+                        sq_queue.push_back(child);
+                    } else {
+                        F.push_back(child);
+                    }
+                }
+            }
+        }
+    }
 
     // Main search loop
     while (!F.empty()) {
@@ -838,7 +864,9 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         total_swaps += static_cast<int>(swaps.size());
         pi = std::move(pi_new);
 
-        // Update F with newly eligible children
+        // Update F with newly eligible children (cascade single-qubit)
+        // Python _heuristic_search_layout_only lines 1569-1590
+        std::deque<int> child_queue;
         for (int child : cg[best.partition_idx]) {
             if (!resolved[child]) {
                 bool in_F = std::find(F.begin(), F.end(), child) != F.end();
@@ -851,11 +879,36 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
                         }
                     }
                     if (all_parents_resolved) {
-                        F.push_back(child);
+                        if (partition_is_single(child)) {
+                            resolved[child] = 1;
+                            for (int gc : cg[child]) child_queue.push_back(gc);
+                        } else {
+                            F.push_back(child);
+                        }
                     }
                 }
             }
         }
+        // Cascade: resolve single-qubit grandchildren
+        while (!child_queue.empty()) {
+            int gc = child_queue.front();
+            child_queue.pop_front();
+            if (resolved[gc] || std::find(F.begin(), F.end(), gc) != F.end()) continue;
+            bool all_parents_resolved = true;
+            for (int par : pg[gc]) {
+                if (!resolved[par]) {
+                    all_parents_resolved = false;
+                    break;
+                }
+            }
+            if (!all_parents_resolved) continue;
+            if (partition_is_single(gc)) {
+                resolved[gc] = 1;
+                for (int ggc : cg[gc]) child_queue.push_back(ggc);
+            } else {
+                F.push_back(gc);
+            }
+        }
     }
 
     return {pi, total_swaps};

From aab861e404fd2fbcc59d1f21957c3483e195178a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 22 Apr 2026 21:17:06 +0200
Subject: [PATCH 132/232] Revert "Fix C++ SABRE routing to match Python
 behavior (3 fixes)"

This reverts commit 58a34a417825462140291f878994cdd9cf1a405a.
---
 .../src-cpp/sabre_router/sabre_router.cpp     | 127 +++++-------------
 1 file changed, 37 insertions(+), 90 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index e54ace439..ba9ffa389 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -476,53 +476,51 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
         int partition;
         int depth;
     };
+    std::deque<BFSNode> queue;
 
-    // Seed per front partition, matching Python's per-partition BFS seeding
-    for (int front_idx : F) {
-        if (static_cast<int>(E.size()) >= config_.max_E_size) break;
-
-        std::deque<BFSNode> queue;
-        for (int child : children_graph[front_idx]) {
+    // Seed with children of F partitions
+    for (int p : F) {
+        for (int child : children_graph[p]) {
             if (!in_F[child] && !in_E[child] && !resolved[child]) {
                 queue.push_back({child, 1});
             }
         }
+    }
 
-        while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
-            auto [part, depth] = queue.front();
-            queue.pop_front();
+    while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
+        auto [part, depth] = queue.front();
+        queue.pop_front();
 
-            if (depth > config_.max_lookahead) continue;
-            if (in_E[part] || in_F[part] || resolved[part]) continue;
+        if (depth > config_.max_lookahead) continue;
+        if (in_E[part] || in_F[part] || resolved[part]) continue;
 
-            // Check all parents resolved or in F
-            bool parents_ok = true;
-            for (int par : parents_graph[part]) {
-                if (!resolved[par] && !in_F[par]) {
-                    parents_ok = false;
-                    break;
-                }
+        // Check all parents resolved or in F
+        bool parents_ok = true;
+        for (int par : parents_graph[part]) {
+            if (!resolved[par] && !in_F[par]) {
+                parents_ok = false;
+                break;
             }
-            if (!parents_ok) continue;
+        }
+        if (!parents_ok) continue;
 
-            if (partition_is_single(part)) {
-                // Single-qubit: skip, add children
-                for (int child : children_graph[part]) {
-                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                        queue.push_back({child, depth});
-                    }
+        if (partition_is_single(part)) {
+            // Single-qubit: skip, add children
+            for (int child : children_graph[part]) {
+                if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                    queue.push_back({child, depth + 1});
                 }
-                continue;
             }
+            continue;
+        }
 
-            E.push_back({part, depth});
-            in_E[part] = 1;
+        E.push_back({part, depth});
+        in_E[part] = 1;
 
-            if (depth < config_.max_lookahead) {
-                for (int child : children_graph[part]) {
-                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                        queue.push_back({child, depth + 1});
-                    }
+        if (depth < config_.max_lookahead) {
+            for (int child : children_graph[part]) {
+                if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                    queue.push_back({child, depth + 1});
                 }
             }
         }
@@ -803,34 +801,10 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     // Swap cache for this search call (thread-local, on stack)
     std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash> swap_cache;
 
-    // Resolve single-qubit partitions upfront (Python _heuristic_search_layout_only lines 1481-1498)
-    std::deque<int> sq_queue;
-    for (int p : F) {
-        if (partition_is_single(p)) sq_queue.push_back(p);
-    }
-    while (!sq_queue.empty()) {
-        int p_idx = sq_queue.front();
-        sq_queue.pop_front();
-        if (resolved[p_idx]) continue;
-        F.erase(std::remove(F.begin(), F.end(), p_idx), F.end());
-        resolved[p_idx] = 1;
-
-        for (int child : cg[p_idx]) {
-            if (!resolved[child] && std::find(F.begin(), F.end(), child) == F.end()) {
-                bool all_resolved = true;
-                for (int par : pg[child]) {
-                    if (!resolved[par]) { all_resolved = false; break; }
-                }
-                if (all_resolved) {
-                    if (partition_is_single(child)) {
-                        sq_queue.push_back(child);
-                    } else {
-                        F.push_back(child);
-                    }
-                }
-            }
-        }
-    }
+    // Note: single-qubit partitions are NOT auto-resolved upfront, matching
+    // Python behavior when layout_partitions (dicts) are passed to
+    // _heuristic_search_layout_only. Single-qubit children get resolved
+    // via the cascade after each candidate is selected.
 
     // Main search loop
     while (!F.empty()) {
@@ -864,9 +838,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         total_swaps += static_cast<int>(swaps.size());
         pi = std::move(pi_new);
 
-        // Update F with newly eligible children (cascade single-qubit)
-        // Python _heuristic_search_layout_only lines 1569-1590
-        std::deque<int> child_queue;
+        // Update F with newly eligible children
         for (int child : cg[best.partition_idx]) {
             if (!resolved[child]) {
                 bool in_F = std::find(F.begin(), F.end(), child) != F.end();
@@ -879,36 +851,11 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
                         }
                     }
                     if (all_parents_resolved) {
-                        if (partition_is_single(child)) {
-                            resolved[child] = 1;
-                            for (int gc : cg[child]) child_queue.push_back(gc);
-                        } else {
-                            F.push_back(child);
-                        }
+                        F.push_back(child);
                     }
                 }
             }
         }
-        // Cascade: resolve single-qubit grandchildren
-        while (!child_queue.empty()) {
-            int gc = child_queue.front();
-            child_queue.pop_front();
-            if (resolved[gc] || std::find(F.begin(), F.end(), gc) != F.end()) continue;
-            bool all_parents_resolved = true;
-            for (int par : pg[gc]) {
-                if (!resolved[par]) {
-                    all_parents_resolved = false;
-                    break;
-                }
-            }
-            if (!all_parents_resolved) continue;
-            if (partition_is_single(gc)) {
-                resolved[gc] = 1;
-                for (int ggc : cg[gc]) child_queue.push_back(ggc);
-            } else {
-                F.push_back(gc);
-            }
-        }
     }
 
     return {pi, total_swaps};

From 28d21465c996f0d2a70399c774084fcfe0df2198 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 22 Apr 2026 21:28:23 +0200
Subject: [PATCH 133/232] Fix extended set generation to match Python behavior

Two fixes to generate_extended_set:
1. Don't increment depth for single-qubit partitions (they're free,
   matching Python's transparent passthrough). The old depth+1 caused
   alpha^depth to over-discount lookahead entries on circuits with many
   single-qubit partitions between multi-qubit ones.
2. Seed BFS per front partition instead of all at once, matching Python's
   iteration pattern.
---
 .../src-cpp/sabre_router/sabre_router.cpp     | 64 ++++++++++---------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index ba9ffa389..be428dc24 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -476,51 +476,53 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
         int partition;
         int depth;
     };
-    std::deque<BFSNode> queue;
 
-    // Seed with children of F partitions
-    for (int p : F) {
-        for (int child : children_graph[p]) {
+    // Seed per front partition, matching Python's per-partition BFS seeding
+    for (int front_idx : F) {
+        if (static_cast<int>(E.size()) >= config_.max_E_size) break;
+
+        std::deque<BFSNode> queue;
+        for (int child : children_graph[front_idx]) {
             if (!in_F[child] && !in_E[child] && !resolved[child]) {
                 queue.push_back({child, 1});
             }
         }
-    }
 
-    while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
-        auto [part, depth] = queue.front();
-        queue.pop_front();
+        while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
+            auto [part, depth] = queue.front();
+            queue.pop_front();
 
-        if (depth > config_.max_lookahead) continue;
-        if (in_E[part] || in_F[part] || resolved[part]) continue;
+            if (depth > config_.max_lookahead) continue;
+            if (in_E[part] || in_F[part] || resolved[part]) continue;
 
-        // Check all parents resolved or in F
-        bool parents_ok = true;
-        for (int par : parents_graph[part]) {
-            if (!resolved[par] && !in_F[par]) {
-                parents_ok = false;
-                break;
+            // Check all parents resolved or in F
+            bool parents_ok = true;
+            for (int par : parents_graph[part]) {
+                if (!resolved[par] && !in_F[par]) {
+                    parents_ok = false;
+                    break;
+                }
             }
-        }
-        if (!parents_ok) continue;
+            if (!parents_ok) continue;
 
-        if (partition_is_single(part)) {
-            // Single-qubit: skip, add children
-            for (int child : children_graph[part]) {
-                if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                    queue.push_back({child, depth + 1});
+            if (partition_is_single(part)) {
+                // Single-qubit partitions are free — don't increment depth
+                for (int child : children_graph[part]) {
+                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                        queue.push_back({child, depth});
+                    }
                 }
+                continue;
             }
-            continue;
-        }
 
-        E.push_back({part, depth});
-        in_E[part] = 1;
+            E.push_back({part, depth});
+            in_E[part] = 1;
 
-        if (depth < config_.max_lookahead) {
-            for (int child : children_graph[part]) {
-                if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                    queue.push_back({child, depth + 1});
+            if (depth < config_.max_lookahead) {
+                for (int child : children_graph[part]) {
+                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
+                        queue.push_back({child, depth + 1});
+                    }
                 }
             }
         }

From 5dc6ec370d87b156e3b7cd635f6363966a570077 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 22 Apr 2026 21:40:07 +0200
Subject: [PATCH 134/232] Fix C++ prefilter to match Python cost-weighted
 formula

---
 squander/src-cpp/sabre_router/sabre_router.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index be428dc24..e8a1ddc4e 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -679,11 +679,13 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
 ) const {
     if (static_cast<int>(candidates.size()) <= top_k) return candidates;
 
-    using Pair = std::pair<int, const CandidateData*>;
+    using Pair = std::pair<double, const CandidateData*>;
     std::vector<Pair> estimated;
     estimated.reserve(candidates.size());
     for (const auto* cand : candidates) {
-        estimated.push_back({estimate_swap_count(*cand, pi, reverse), cand});
+        double est = estimate_swap_count(*cand, pi, reverse) * config_.swap_cost
+                     + config_.local_cost_weight * cand->cnot_count;
+        estimated.push_back({est, cand});
     }
 
     std::partial_sort(estimated.begin(),

From 58d5ef819440709cb42816d29f3131d5881d9711 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 22 Apr 2026 22:35:57 +0200
Subject: [PATCH 135/232] Minor fixes

---
 .../src-cpp/sabre_router/sabre_router.cpp     | 29 +++++++++----------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index e8a1ddc4e..36131be4f 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -311,21 +311,22 @@ SabreRouter::find_constrained_swaps(
     h0 /= 2.0;
 
     // Priority queue: (f_score, g_score, counter, packed_state)
-    // Using greater for min-heap behavior
-    using PQEntry = std::tuple<double, int, int64_t>;
+    // Counter provides FIFO tie-breaking, matching Python's counter variable
+    using PQEntry = std::tuple<double, int, uint64_t, int64_t>;
     std::priority_queue<PQEntry, std::vector<PQEntry>, std::greater<PQEntry>> pq;
+    uint64_t counter = 0;
 
     // Visited: packed_state -> best g_score
     std::unordered_map<int64_t, int> visited;
     // Parent: packed_state -> (parent_packed_state, swap)
     std::unordered_map<int64_t, std::pair<int64_t, std::pair<int,int>>> parent;
 
-    pq.push({h0, 0, initial_packed});
+    pq.push({h0, 0, counter++, initial_packed});
     visited[initial_packed] = 0;
     parent[initial_packed] = {-1, {-1, -1}};
 
     while (!pq.empty()) {
-        auto [f, g, packed] = pq.top();
+        auto [f, g, cnt, packed] = pq.top();
         pq.pop();
 
         if (packed == target_packed) {
@@ -399,7 +400,7 @@ SabreRouter::find_constrained_swaps(
 
                 visited[new_packed] = new_g;
                 parent[new_packed] = {packed, {std::min(p, nb), std::max(p, nb)}};
-                pq.push({new_g + h, new_g, new_packed});
+                pq.push({new_g + h, new_g, counter++, new_packed});
             }
         }
     }
@@ -688,10 +689,11 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
         estimated.push_back({est, cand});
     }
 
-    std::partial_sort(estimated.begin(),
-                      estimated.begin() + std::min(top_k, static_cast<int>(estimated.size())),
-                      estimated.end(),
-                      [](const Pair& a, const Pair& b) { return a.first < b.first; });
+    int kth = std::min(top_k, static_cast<int>(estimated.size()));
+    std::nth_element(estimated.begin(),
+                     estimated.begin() + kth,
+                     estimated.end(),
+                     [](const Pair& a, const Pair& b) { return a.first < b.first; });
 
     std::vector<const CandidateData*> result;
     result.reserve(top_k);
@@ -724,8 +726,8 @@ const CandidateData& SabreRouter::select_best_candidate(
         }
     }
 
-    // Select randomly among near-best if rng provided
-    if (rng && near_best.size() > 1) {
+    // Select randomly among near-best if rng provided and min_score > 0
+    if (min_score > 0.0 && rng && near_best.size() > 1) {
         std::uniform_int_distribution<size_t> dist(0, near_best.size() - 1);
         return *candidates[near_best[dist(*rng)]];
     }
@@ -805,11 +807,6 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     // Swap cache for this search call (thread-local, on stack)
     std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash> swap_cache;
 
-    // Note: single-qubit partitions are NOT auto-resolved upfront, matching
-    // Python behavior when layout_partitions (dicts) are passed to
-    // _heuristic_search_layout_only. Single-qubit children get resolved
-    // via the cascade after each candidate is selected.
-
     // Main search loop
     while (!F.empty()) {
         auto all_candidates = obtain_partition_candidates(F);

From 5306a305ac107e74e667088d2b76cbe1a778f354 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 00:01:23 +0200
Subject: [PATCH 136/232] minor bugfixes

---
 .../src-cpp/sabre_router/sabre_router.cpp     | 31 ++++++++++---------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 36131be4f..50cb2564c 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -714,24 +714,27 @@ const CandidateData& SabreRouter::select_best_candidate(
 ) const {
     // Find minimum score
     double min_score = scores[0];
+    size_t min_idx = 0;
     for (size_t i = 1; i < scores.size(); i++) {
-        if (scores[i] < min_score) min_score = scores[i];
+        if (scores[i] < min_score) {
+            min_score = scores[i];
+            min_idx = i;
+        }
     }
 
-    // Collect all candidates within tolerance of minimum
-    std::vector<size_t> near_best;
-    for (size_t i = 0; i < scores.size(); i++) {
-        if (scores[i] <= min_score * (1.0 + config_.score_tolerance)) {
-            near_best.push_back(i);
+    if (rng && min_score > 0.0) {
+        std::vector<size_t> near_best;
+        double threshold = min_score * (1.0 + config_.score_tolerance);
+        for (size_t i = 0; i < scores.size(); i++) {
+            if (scores[i] <= threshold) near_best.push_back(i);
+        }
+        if (near_best.size() > 1) {
+            std::uniform_int_distribution<size_t> dist(0, near_best.size() - 1);
+            return *candidates[near_best[dist(*rng)]];
         }
     }
 
-    // Select randomly among near-best if rng provided and min_score > 0
-    if (min_score > 0.0 && rng && near_best.size() > 1) {
-        std::uniform_int_distribution<size_t> dist(0, near_best.size() - 1);
-        return *candidates[near_best[dist(*rng)]];
-    }
-    return *candidates[near_best[0]];
+    return *candidates[min_idx];
 }
 
 // ---------------------------------------------------------------------------
@@ -902,9 +905,9 @@ TrialResult SabreRouter::run_trial(
 
     // Final evaluation pass (deterministic, no RNG)
     auto F_eval = get_initial_layer();
-    auto [pi_final, cost] = heuristic_search(F_eval, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_);
+    auto [pi_final, cost] = heuristic_search(F_eval, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_); // Evaluates cost using a copy under the hood
 
-    return TrialResult{std::move(pi_final), cost};
+    return TrialResult{std::move(pi), cost}; // Return the pi from AFTER the backward pass, BEFORE the eval pass
 }
 
 } // namespace squander::routing

From 4510066348d8df681ce5e951e85e99c1f7f92aa9 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 00:06:44 +0200
Subject: [PATCH 137/232] fix lookahead and swap cost

---
 .../sabre_router/include/sabre_router.hpp     |  2 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 34 ++++++++-----------
 2 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index d1c55cf3f..ec67840fc 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -191,7 +191,7 @@ class SabreRouter {
     ) const;
 
     // Lower-bound swap estimate (port of estimate_swap_count)
-    int estimate_swap_count(
+    double estimate_swap_count(
         const CandidateData& cand,
         const std::vector<int>& pi,
         bool reverse
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 50cb2564c..709b8004b 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -214,7 +214,8 @@ std::vector<int> SabreRouter::get_final_layer() const {
 // estimate_swap_count
 // ---------------------------------------------------------------------------
 
-int SabreRouter::estimate_swap_count(
+// Change return type to double!
+double SabreRouter::estimate_swap_count(
     const CandidateData& cand,
     const std::vector<int>& pi,
     bool reverse
@@ -236,7 +237,7 @@ int SabreRouter::estimate_swap_count(
             total += d;
         }
     }
-    return static_cast<int>(total / 2.0);
+    return total / 2.0; // NO MORE STATIC_CAST<INT>!
 }
 
 // ---------------------------------------------------------------------------
@@ -478,15 +479,13 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
         int depth;
     };
 
-    // Seed per front partition, matching Python's per-partition BFS seeding
     for (int front_idx : F) {
         if (static_cast<int>(E.size()) >= config_.max_E_size) break;
 
         std::deque<BFSNode> queue;
+        // EXACT Python logic: No pre-checks before pushing!
         for (int child : children_graph[front_idx]) {
-            if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                queue.push_back({child, 1});
-            }
+            queue.push_back({child, 1});
         }
 
         while (!queue.empty() && static_cast<int>(E.size()) < config_.max_E_size) {
@@ -496,7 +495,6 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
             if (depth > config_.max_lookahead) continue;
             if (in_E[part] || in_F[part] || resolved[part]) continue;
 
-            // Check all parents resolved or in F
             bool parents_ok = true;
             for (int par : parents_graph[part]) {
                 if (!resolved[par] && !in_F[par]) {
@@ -506,12 +504,10 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
             }
             if (!parents_ok) continue;
 
-            if (partition_is_single(part)) {
-                // Single-qubit partitions are free — don't increment depth
+            if (layout_partitions_[part].is_single) {
+                // EXACT Python logic: blindly push grandchildren!
                 for (int child : children_graph[part]) {
-                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                        queue.push_back({child, depth});
-                    }
+                    queue.push_back({child, depth});
                 }
                 continue;
             }
@@ -521,9 +517,7 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
 
             if (depth < config_.max_lookahead) {
                 for (int child : children_graph[part]) {
-                    if (!in_F[child] && !in_E[child] && !resolved[child]) {
-                        queue.push_back({child, depth + 1});
-                    }
+                    queue.push_back({child, depth + 1});
                 }
             }
         }
@@ -684,16 +678,16 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     std::vector<Pair> estimated;
     estimated.reserve(candidates.size());
     for (const auto* cand : candidates) {
+        // Now returns double, properly capturing precise costs!
         double est = estimate_swap_count(*cand, pi, reverse) * config_.swap_cost
                      + config_.local_cost_weight * cand->cnot_count;
         estimated.push_back({est, cand});
     }
 
-    int kth = std::min(top_k, static_cast<int>(estimated.size()));
-    std::nth_element(estimated.begin(),
-                     estimated.begin() + kth,
-                     estimated.end(),
-                     [](const Pair& a, const Pair& b) { return a.first < b.first; });
+    // Stable sort perfectly preserves tie-breaker alignments
+    std::stable_sort(estimated.begin(), estimated.end(), [](const Pair& a, const Pair& b) {
+        return a.first < b.first;
+    });
 
     std::vector<const CandidateData*> result;
     result.reserve(top_k);

From 2ae0b0df707144d3c76988a4a30fcbd4514503e4 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 09:13:52 +0200
Subject: [PATCH 138/232] Fix cost function bug

---
 squander/src-cpp/sabre_router/sabre_router.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 709b8004b..6a8b74a26 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -607,9 +607,9 @@ double SabreRouter::score_candidate(
         if (p_idx == cand_idx) continue;
         auto it = canonical_data.find(p_idx);
         if (it == canonical_data.end()) continue;
-        n_other++;
         const auto& entry = it->second;
         if (entry.edges_u.empty()) continue;
+        n_other++;
         for (size_t i = 0; i < entry.edges_u.size(); i++) {
             int u = entry.edges_u[i];
             int v = entry.edges_v[i];

From 36b2043909b3e1bbae1bf0bcab3f93efe40f9ee8 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 11:11:14 +0200
Subject: [PATCH 139/232] fix bindings

---
 squander/synthesis/bindings.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 9b3e21413..9f0662676 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -67,7 +67,9 @@ static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict c
         // val is a dict with 'edges_u', 'edges_v', 'cnot'
         py::dict d = py::reinterpret_borrow<py::dict>(val);
         if (d.contains("edges_u") && !d["edges_u"].is_none()) {
-            auto buf_u = py::array_t<int, py::array::c_style>::ensure(d["edges_u"]);
+            // Python builds these arrays as np.intp; forcecast keeps the C++
+            // router from silently dropping canonical lookahead edges.
+            auto buf_u = py::array_t<int, py::array::c_style | py::array::forcecast>::ensure(d["edges_u"]);
             if (buf_u) {
                 auto acc = buf_u.unchecked<1>();
                 entry.edges_u.resize(acc.shape(0));
@@ -75,7 +77,7 @@ static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict c
             }
         }
         if (d.contains("edges_v") && !d["edges_v"].is_none()) {
-            auto buf_v = py::array_t<int, py::array::c_style>::ensure(d["edges_v"]);
+            auto buf_v = py::array_t<int, py::array::c_style | py::array::forcecast>::ensure(d["edges_v"]);
             if (buf_v) {
                 auto acc = buf_v.unchecked<1>();
                 entry.edges_v.resize(acc.shape(0));

From da0be534a59c2b45dbfaa68ab76c72f26ebee466 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 11:34:33 +0200
Subject: [PATCH 140/232] fix cpp side

---
 .../sabre_router/include/sabre_router.hpp     |   2 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 123 +++++++++++++-----
 2 files changed, 95 insertions(+), 30 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index ec67840fc..d1c55cf3f 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -191,7 +191,7 @@ class SabreRouter {
     ) const;
 
     // Lower-bound swap estimate (port of estimate_swap_count)
-    double estimate_swap_count(
+    int estimate_swap_count(
         const CandidateData& cand,
         const std::vector<int>& pi,
         bool reverse
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 6a8b74a26..78f3b0685 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -9,6 +9,7 @@ C++ backend for the SABRE-style partition-aware routing engine.
 #include <algorithm>
 #include <cmath>
 #include <deque>
+#include <limits>
 #include <numeric>
 #include <queue>
 #include <random>
@@ -214,8 +215,7 @@ std::vector<int> SabreRouter::get_final_layer() const {
 // estimate_swap_count
 // ---------------------------------------------------------------------------
 
-// Change return type to double!
-double SabreRouter::estimate_swap_count(
+int SabreRouter::estimate_swap_count(
     const CandidateData& cand,
     const std::vector<int>& pi,
     bool reverse
@@ -237,7 +237,7 @@ double SabreRouter::estimate_swap_count(
             total += d;
         }
     }
-    return total / 2.0; // NO MORE STATIC_CAST<INT>!
+    return static_cast<int>(total / 2.0);
 }
 
 // ---------------------------------------------------------------------------
@@ -254,18 +254,23 @@ SabreRouter::find_constrained_swaps(
     const std::vector<int>& P_route_inv,
     std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
 ) const {
-    // Build target dict: {q -> target_physical}
+    // Python find_constrained_swaps_partial sorts pi_B_dict.keys().
     int k = static_cast<int>(qbit_map_keys.size());
-    std::vector<int> partition_qubits(k);
+    std::vector<int> order(k);
+    std::iota(order.begin(), order.end(), 0);
+    std::sort(order.begin(), order.end(), [&](int a, int b) {
+        return qbit_map_keys[a] < qbit_map_keys[b];
+    });
+
     std::vector<int> target_positions(k);
     std::vector<int> initial_positions(k);
 
-    for (int i = 0; i < k; i++) {
-        int q = qbit_map_keys[i];
-        int v = qbit_map_vals[i];
-        partition_qubits[i] = q;
-        target_positions[i] = node_mapping_flat[P_route_inv[v]];
-        initial_positions[i] = pi[q];
+    for (int out_idx = 0; out_idx < k; out_idx++) {
+        int src_idx = order[out_idx];
+        int q = qbit_map_keys[src_idx];
+        int v = qbit_map_vals[src_idx];
+        target_positions[out_idx] = node_mapping_flat[P_route_inv[v]];
+        initial_positions[out_idx] = pi[q];
     }
 
     // Check if already at target
@@ -608,8 +613,8 @@ double SabreRouter::score_candidate(
         auto it = canonical_data.find(p_idx);
         if (it == canonical_data.end()) continue;
         const auto& entry = it->second;
-        if (entry.edges_u.empty()) continue;
         n_other++;
+        if (entry.edges_u.empty()) continue;
         for (size_t i = 0; i < entry.edges_u.size(); i++) {
             int u = entry.edges_u[i];
             int v = entry.edges_v[i];
@@ -673,25 +678,29 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     bool reverse
 ) const {
     if (static_cast<int>(candidates.size()) <= top_k) return candidates;
+    if (top_k <= 0) return {};
 
     using Pair = std::pair<double, const CandidateData*>;
     std::vector<Pair> estimated;
     estimated.reserve(candidates.size());
     for (const auto* cand : candidates) {
-        // Now returns double, properly capturing precise costs!
         double est = estimate_swap_count(*cand, pi, reverse) * config_.swap_cost
                      + config_.local_cost_weight * cand->cnot_count;
         estimated.push_back({est, cand});
     }
 
-    // Stable sort perfectly preserves tie-breaker alignments
-    std::stable_sort(estimated.begin(), estimated.end(), [](const Pair& a, const Pair& b) {
-        return a.first < b.first;
-    });
+    std::nth_element(
+        estimated.begin(),
+        estimated.begin() + top_k,
+        estimated.end(),
+        [](const Pair& a, const Pair& b) {
+            return a.first < b.first;
+        }
+    );
 
     std::vector<const CandidateData*> result;
     result.reserve(top_k);
-    for (int i = 0; i < top_k && i < static_cast<int>(estimated.size()); i++) {
+    for (int i = 0; i < top_k; i++) {
         result.push_back(estimated[i].second);
     }
     return result;
@@ -797,10 +806,45 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     const std::vector<std::vector<int>>& cg,
     const std::vector<std::vector<int>>& pg
 ) const {
-    std::vector<int> F = F_init;
+    std::vector<int> F;
+    std::vector<int> queue;
     std::vector<uint8_t> resolved(num_partitions_, 0);
     int total_swaps = 0;
 
+    // Split F_init into F (multi-qubit) and queue (single-qubit)
+    for (int p : F_init) {
+        if (layout_partitions_[p].is_single) {
+            queue.push_back(p);
+        } else {
+            F.push_back(p);
+        }
+    }
+
+    // Flush initial single-qubit partitions
+    while (!queue.empty()) {
+        int p = queue.back();
+        queue.pop_back();
+
+        if (resolved[p]) continue;
+        resolved[p] = 1;
+
+        for (int child : cg[p]) {
+            if (!resolved[child] && std::find(F.begin(), F.end(), child) == F.end()) {
+                bool parents_ok = true;
+                for (int par : pg[child]) {
+                    if (!resolved[par]) { parents_ok = false; break; }
+                }
+                if (parents_ok) {
+                    if (layout_partitions_[child].is_single) {
+                        queue.push_back(child);
+                    } else {
+                        F.push_back(child);
+                    }
+                }
+            }
+        }
+    }
+
     // Swap cache for this search call (thread-local, on stack)
     std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash> swap_cache;
 
@@ -838,17 +882,38 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
 
         // Update F with newly eligible children
         for (int child : cg[best.partition_idx]) {
-            if (!resolved[child]) {
-                bool in_F = std::find(F.begin(), F.end(), child) != F.end();
-                if (!in_F) {
-                    bool all_parents_resolved = true;
-                    for (int par : pg[child]) {
-                        if (!resolved[par]) {
-                            all_parents_resolved = false;
-                            break;
+            if (!resolved[child] && std::find(F.begin(), F.end(), child) == F.end()) {
+                bool parents_ok = true;
+                for (int par : pg[child]) {
+                    if (!resolved[par]) { parents_ok = false; break; }
+                }
+                
+                if (parents_ok) {
+                    if (layout_partitions_[child].is_single) {
+                        resolved[child] = 1;
+                        std::vector<int> stack;
+                        for (int gc : cg[child]) stack.push_back(gc);
+                        
+                        while (!stack.empty()) {
+                            int gc = stack.back();
+                            stack.pop_back();
+                            
+                            if (!resolved[gc] && std::find(F.begin(), F.end(), gc) == F.end()) {
+                                bool gc_parents_ok = true;
+                                for (int p_gc : pg[gc]) {
+                                    if (!resolved[p_gc]) { gc_parents_ok = false; break; }
+                                }
+                                if (gc_parents_ok) {
+                                    if (layout_partitions_[gc].is_single) {
+                                        resolved[gc] = 1;
+                                        for (int ggc : cg[gc]) stack.push_back(ggc);
+                                    } else {
+                                        F.push_back(gc);
+                                    }
+                                }
+                            }
                         }
-                    }
-                    if (all_parents_resolved) {
+                    } else {
                         F.push_back(child);
                     }
                 }

From bf1b0baa16712c98e91d354e95f4aaedc9cab7a2 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 11:57:29 +0200
Subject: [PATCH 141/232] speed up routing

---
 .../sabre_router/include/sabre_router.hpp     |  47 +--
 .../src-cpp/sabre_router/sabre_router.cpp     | 267 +++++++++++-------
 squander/synthesis/bindings.cpp               |  12 +-
 3 files changed, 196 insertions(+), 130 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index d1c55cf3f..b84195f40 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -54,6 +54,13 @@ struct CandidateData {
 
     // Original circuit qubits involved in this partition
     std::vector<int> involved_qbits;
+
+    // Precomputed routing helpers.
+    std::vector<int> P_i_inv;
+    std::vector<int> P_o_inv;
+    std::vector<int> qbit_map_keys_sorted;
+    std::vector<int> qbit_map_vals_sorted;
+    std::vector<int> qstar_to_q;
 };
 
 struct CanonicalEntry {
@@ -91,24 +98,27 @@ struct TrialResult {
 // ---------------------------------------------------------------------------
 
 struct SwapCacheKey {
-    // Snapshot of pi[] at the involved qubit positions + target positions
-    std::vector<int> pi_snapshot;
-    std::vector<int> targets;
+    int64_t pi_snapshot;
+    int64_t targets;
+    int k;
 
     bool operator==(const SwapCacheKey& o) const {
-        return pi_snapshot == o.pi_snapshot && targets == o.targets;
+        return pi_snapshot == o.pi_snapshot && targets == o.targets && k == o.k;
     }
 };
 
 struct SwapCacheKeyHash {
     size_t operator()(const SwapCacheKey& k) const {
-        size_t h = 0;
-        for (int v : k.pi_snapshot) h = h * 31 + static_cast<size_t>(v);
-        for (int v : k.targets) h = h * 31 + static_cast<size_t>(v);
+        size_t h = static_cast<size_t>(k.pi_snapshot);
+        h ^= static_cast<size_t>(k.targets) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+        h ^= static_cast<size_t>(k.k) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
         return h;
     }
 };
 
+using SwapList = std::vector<std::pair<int,int>>;
+using SwapCache = std::unordered_map<SwapCacheKey, SwapList, SwapCacheKeyHash>;
+
 // ---------------------------------------------------------------------------
 // A* state packing helpers
 // ---------------------------------------------------------------------------
@@ -143,14 +153,14 @@ class SabreRouter {
     SabreRouter(
         const SabreConfig& config,
         int N,
-        const std::vector<double>& D,
-        const std::vector<std::vector<int>>& adj,
-        const std::vector<std::vector<int>>& DAG,
-        const std::vector<std::vector<int>>& IDAG,
-        const std::vector<std::vector<CandidateData>>& candidate_cache,
-        const std::vector<LayoutPartInfo>& layout_partitions,
-        const std::unordered_map<int, CanonicalEntry>& canonical_data_fwd,
-        const std::unordered_map<int, CanonicalEntry>& canonical_data_rev
+        std::vector<double> D,
+        std::vector<std::vector<int>> adj,
+        std::vector<std::vector<int>> DAG,
+        std::vector<std::vector<int>> IDAG,
+        std::vector<std::vector<CandidateData>> candidate_cache,
+        std::vector<LayoutPartInfo> layout_partitions,
+        std::unordered_map<int, CanonicalEntry> canonical_data_fwd,
+        std::unordered_map<int, CanonicalEntry> canonical_data_rev
     );
 
     // Thread-safe: all mutable state is stack-local
@@ -187,7 +197,7 @@ class SabreRouter {
         const std::vector<int>& qbit_map_vals,
         const std::vector<int>& node_mapping_flat,
         const std::vector<int>& P_route_inv,
-        std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+        SwapCache* swap_cache
     ) const;
 
     // Lower-bound swap estimate (port of estimate_swap_count)
@@ -213,7 +223,7 @@ class SabreRouter {
         const std::vector<std::pair<int,int>>& E,
         bool reverse,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
-        std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+        SwapCache* swap_cache
     ) const;
 
     // Route and update layout for a candidate (port of transform_pi)
@@ -222,7 +232,7 @@ class SabreRouter {
         const CandidateData& cand,
         const std::vector<int>& pi,
         bool reverse,
-        std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+        SwapCache* swap_cache
     ) const;
 
     // Release valve for stuck front layers
@@ -315,6 +325,7 @@ class SabreRouter {
     std::vector<LayoutPartInfo> layout_partitions_;
     std::unordered_map<int, CanonicalEntry> canonical_data_fwd_;
     std::unordered_map<int, CanonicalEntry> canonical_data_rev_;
+    std::vector<double> alpha_weights_;
 };
 
 } // namespace squander::routing
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 78f3b0685..0849c0faf 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -9,11 +9,14 @@ C++ backend for the SABRE-style partition-aware routing engine.
 #include <algorithm>
 #include <cmath>
 #include <deque>
+#include <functional>
+#include <initializer_list>
 #include <limits>
 #include <numeric>
 #include <queue>
 #include <random>
 #include <stdexcept>
+#include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
@@ -21,6 +24,66 @@ C++ backend for the SABRE-style partition-aware routing engine.
 
 namespace squander::routing {
 
+namespace {
+
+std::vector<int> invert_permutation(const std::vector<int>& P) {
+    std::vector<int> inv(P.size());
+    for (size_t i = 0; i < P.size(); i++) {
+        inv[P[i]] = static_cast<int>(i);
+    }
+    return inv;
+}
+
+void prepare_candidate(CandidateData& cand) {
+    cand.P_i_inv = invert_permutation(cand.P_i);
+    cand.P_o_inv = invert_permutation(cand.P_o);
+
+    const int k = static_cast<int>(cand.qbit_map_keys.size());
+    std::vector<int> order(k);
+    std::iota(order.begin(), order.end(), 0);
+    std::sort(order.begin(), order.end(), [&](int a, int b) {
+        return cand.qbit_map_keys[a] < cand.qbit_map_keys[b];
+    });
+
+    cand.qbit_map_keys_sorted.resize(k);
+    cand.qbit_map_vals_sorted.resize(k);
+    int max_qstar = -1;
+    for (int i = 0; i < k; i++) {
+        const int src_idx = order[i];
+        const int qstar = cand.qbit_map_vals[src_idx];
+        cand.qbit_map_keys_sorted[i] = cand.qbit_map_keys[src_idx];
+        cand.qbit_map_vals_sorted[i] = qstar;
+        if (qstar > max_qstar) max_qstar = qstar;
+    }
+
+    const int dense_size = std::max(
+        {max_qstar + 1,
+         static_cast<int>(cand.P_i.size()),
+         static_cast<int>(cand.P_o.size()),
+         static_cast<int>(cand.node_mapping_flat.size())}
+    );
+    cand.qstar_to_q.assign(dense_size, -1);
+    for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) {
+        const int qstar = cand.qbit_map_vals[i];
+        if (qstar >= 0) {
+            if (qstar >= static_cast<int>(cand.qstar_to_q.size())) {
+                cand.qstar_to_q.resize(qstar + 1, -1);
+            }
+            cand.qstar_to_q[qstar] = cand.qbit_map_keys[i];
+        }
+    }
+}
+
+inline void unpack_state_into(int64_t packed, int k, int N, std::vector<int>& positions) {
+    positions.resize(k);
+    for (int i = 0; i < k; i++) {
+        positions[i] = static_cast<int>(packed % N);
+        packed /= N;
+    }
+}
+
+} // namespace
+
 // ---------------------------------------------------------------------------
 // Constructor
 // ---------------------------------------------------------------------------
@@ -28,30 +91,44 @@ namespace squander::routing {
 SabreRouter::SabreRouter(
     const SabreConfig& config,
     int N,
-    const std::vector<double>& D,
-    const std::vector<std::vector<int>>& adj,
-    const std::vector<std::vector<int>>& DAG,
-    const std::vector<std::vector<int>>& IDAG,
-    const std::vector<std::vector<CandidateData>>& candidate_cache,
-    const std::vector<LayoutPartInfo>& layout_partitions,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data_fwd,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data_rev
+    std::vector<double> D,
+    std::vector<std::vector<int>> adj,
+    std::vector<std::vector<int>> DAG,
+    std::vector<std::vector<int>> IDAG,
+    std::vector<std::vector<CandidateData>> candidate_cache,
+    std::vector<LayoutPartInfo> layout_partitions,
+    std::unordered_map<int, CanonicalEntry> canonical_data_fwd,
+    std::unordered_map<int, CanonicalEntry> canonical_data_rev
 )
     : config_(config)
     , N_(N)
     , num_partitions_(static_cast<int>(DAG.size()))
-    , D_(D)
-    , adj_(adj)
-    , DAG_(DAG)
-    , IDAG_(IDAG)
-    , candidate_cache_(candidate_cache)
-    , layout_partitions_(layout_partitions)
-    , canonical_data_fwd_(canonical_data_fwd)
-    , canonical_data_rev_(canonical_data_rev)
+    , D_(std::move(D))
+    , adj_(std::move(adj))
+    , DAG_(std::move(DAG))
+    , IDAG_(std::move(IDAG))
+    , candidate_cache_(std::move(candidate_cache))
+    , layout_partitions_(std::move(layout_partitions))
+    , canonical_data_fwd_(std::move(canonical_data_fwd))
+    , canonical_data_rev_(std::move(canonical_data_rev))
 {
     if (static_cast<int>(D_.size()) != N_ * N_) {
         throw std::invalid_argument("Distance matrix D must be N x N");
     }
+    for (auto& partition_candidates : candidate_cache_) {
+        for (auto& cand : partition_candidates) {
+            prepare_candidate(cand);
+        }
+    }
+
+    const int max_depth = std::max(0, config_.max_lookahead);
+    alpha_weights_.resize(max_depth + 1);
+    if (!alpha_weights_.empty()) {
+        alpha_weights_[0] = 1.0;
+        for (int depth = 1; depth <= max_depth; depth++) {
+            alpha_weights_[depth] = alpha_weights_[depth - 1] * config_.E_alpha;
+        }
+    }
 }
 
 // ---------------------------------------------------------------------------
@@ -95,11 +172,7 @@ void SabreRouter::build_target_positions(
     std::vector<int>& out_keys,
     std::vector<int>& out_targets
 ) const {
-    const std::vector<int>& P_route = reverse ? cand.P_o : cand.P_i;
-    std::vector<int> P_route_inv(P_route.size());
-    for (size_t i = 0; i < P_route.size(); i++) {
-        P_route_inv[P_route[i]] = static_cast<int>(i);
-    }
+    const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
 
     out_keys = cand.qbit_map_keys;
     out_targets.resize(cand.qbit_map_keys.size());
@@ -220,11 +293,7 @@ int SabreRouter::estimate_swap_count(
     const std::vector<int>& pi,
     bool reverse
 ) const {
-    const std::vector<int>& P_route = reverse ? cand.P_o : cand.P_i;
-    std::vector<int> P_route_inv(P_route.size());
-    for (size_t i = 0; i < P_route.size(); i++) {
-        P_route_inv[P_route[i]] = static_cast<int>(i);
-    }
+    const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
 
     double total = 0.0;
     for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) {
@@ -252,25 +321,17 @@ SabreRouter::find_constrained_swaps(
     const std::vector<int>& qbit_map_vals,
     const std::vector<int>& node_mapping_flat,
     const std::vector<int>& P_route_inv,
-    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+    SwapCache* swap_cache
 ) const {
-    // Python find_constrained_swaps_partial sorts pi_B_dict.keys().
     int k = static_cast<int>(qbit_map_keys.size());
-    std::vector<int> order(k);
-    std::iota(order.begin(), order.end(), 0);
-    std::sort(order.begin(), order.end(), [&](int a, int b) {
-        return qbit_map_keys[a] < qbit_map_keys[b];
-    });
-
     std::vector<int> target_positions(k);
     std::vector<int> initial_positions(k);
 
-    for (int out_idx = 0; out_idx < k; out_idx++) {
-        int src_idx = order[out_idx];
-        int q = qbit_map_keys[src_idx];
-        int v = qbit_map_vals[src_idx];
-        target_positions[out_idx] = node_mapping_flat[P_route_inv[v]];
-        initial_positions[out_idx] = pi[q];
+    for (int i = 0; i < k; i++) {
+        const int q = qbit_map_keys[i];
+        const int v = qbit_map_vals[i];
+        target_positions[i] = node_mapping_flat[P_route_inv[v]];
+        initial_positions[i] = pi[q];
     }
 
     // Check if already at target
@@ -285,20 +346,16 @@ SabreRouter::find_constrained_swaps(
         return {{}, pi};
     }
 
-    // Check swap cache
+    int64_t initial_packed = pack_state(initial_positions, N_);
+    int64_t target_packed = pack_state(target_positions, N_);
+    const SwapCacheKey cache_key{initial_packed, target_packed, k};
+
     if (swap_cache) {
-        SwapCacheKey key;
-        key.pi_snapshot.resize(k);
-        key.targets.resize(k);
-        for (int i = 0; i < k; i++) {
-            key.pi_snapshot[i] = initial_positions[i];
-            key.targets[i] = target_positions[i];
-        }
-        auto it = swap_cache->find(key);
+        auto it = swap_cache->find(cache_key);
         if (it != swap_cache->end()) {
             // Replay cached swaps on current pi
-            auto result_pi = apply_swaps_to_pi(pi, it->second.first);
-            return {it->second.first, result_pi};
+            auto result_pi = apply_swaps_to_pi(pi, it->second);
+            return {it->second, result_pi};
         }
     }
 
@@ -306,9 +363,6 @@ SabreRouter::find_constrained_swaps(
     // State: vector of physical positions for each partition qubit
     // Heuristic: sum(D[pos_i][target_i]) / 2
 
-    int64_t initial_packed = pack_state(initial_positions, N_);
-    int64_t target_packed = pack_state(target_positions, N_);
-
     // Compute initial heuristic
     double h0 = 0.0;
     for (int i = 0; i < k; i++) {
@@ -326,14 +380,22 @@ SabreRouter::find_constrained_swaps(
     std::unordered_map<int64_t, int> visited;
     // Parent: packed_state -> (parent_packed_state, swap)
     std::unordered_map<int64_t, std::pair<int64_t, std::pair<int,int>>> parent;
+    visited.reserve(256);
+    parent.reserve(256);
 
     pq.push({h0, 0, counter++, initial_packed});
     visited[initial_packed] = 0;
     parent[initial_packed] = {-1, {-1, -1}};
 
+    std::vector<int> positions;
+    std::vector<int> new_positions(k);
+    positions.reserve(k);
+
     while (!pq.empty()) {
-        auto [f, g, cnt, packed] = pq.top();
+        auto entry = pq.top();
         pq.pop();
+        int g = std::get<1>(entry);
+        int64_t packed = std::get<3>(entry);
 
         if (packed == target_packed) {
             // Reconstruct swap path
@@ -350,14 +412,7 @@ SabreRouter::find_constrained_swaps(
 
             // Store in cache
             if (swap_cache) {
-                SwapCacheKey key;
-                key.pi_snapshot.resize(k);
-                key.targets.resize(k);
-                for (int i = 0; i < k; i++) {
-                    key.pi_snapshot[i] = initial_positions[i];
-                    key.targets[i] = target_positions[i];
-                }
-                (*swap_cache)[key] = {path, result_pi};
+                (*swap_cache)[cache_key] = path;
             }
 
             return {path, result_pi};
@@ -369,24 +424,20 @@ SabreRouter::find_constrained_swaps(
             continue;
         }
 
-        auto positions = unpack_state(packed, k, N_);
-
-        // pos_to_k_idx: physical position -> index in partition_qubits
-        std::unordered_map<int, int> pos_to_k_idx;
-        for (int i = 0; i < k; i++) {
-            pos_to_k_idx[positions[i]] = i;
-        }
+        unpack_state_into(packed, k, N_, positions);
 
         // Try every SWAP that moves at least one partition qubit
         for (int i = 0; i < k; i++) {
             int p = positions[i];
             for (int nb : adj_[p]) {
-                auto new_positions = positions;
+                std::copy(positions.begin(), positions.end(), new_positions.begin());
                 new_positions[i] = nb;
                 // If neighbor also holds a partition qubit, swap it
-                auto it = pos_to_k_idx.find(nb);
-                if (it != pos_to_k_idx.end()) {
-                    new_positions[it->second] = p;
+                for (int j = 0; j < k; j++) {
+                    if (positions[j] == nb) {
+                        new_positions[j] = p;
+                        break;
+                    }
                 }
 
                 int64_t new_packed = pack_state(new_positions, N_);
@@ -424,20 +475,15 @@ SabreRouter::transform_pi(
     const CandidateData& cand,
     const std::vector<int>& pi,
     bool reverse,
-    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+    SwapCache* swap_cache
 ) const {
-    // Build P_route_inv
-    const std::vector<int>& P_route = reverse ? cand.P_o : cand.P_i;
-    std::vector<int> P_route_inv(P_route.size());
-    for (size_t i = 0; i < P_route.size(); i++) {
-        P_route_inv[P_route[i]] = static_cast<int>(i);
-    }
+    const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
 
     // Route qubits to input positions
     auto [swaps, pi_routed] = find_constrained_swaps(
         pi,
-        cand.qbit_map_keys,
-        cand.qbit_map_vals,
+        cand.qbit_map_keys_sorted,
+        cand.qbit_map_vals_sorted,
         cand.node_mapping_flat,
         P_route_inv,
         swap_cache
@@ -447,16 +493,10 @@ SabreRouter::transform_pi(
     const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
     std::vector<int> pi_output = pi_routed;
 
-    // Build inverse qbit_map: q* -> q
-    std::unordered_map<int, int> qbit_map_inv;
-    for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) {
-        qbit_map_inv[cand.qbit_map_vals[i]] = cand.qbit_map_keys[i];
-    }
-
     for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
-        auto it = qbit_map_inv.find(static_cast<int>(q_star));
-        if (it != qbit_map_inv.end()) {
-            int k = it->second;
+        if (q_star < cand.qstar_to_q.size()) {
+            int k = cand.qstar_to_q[q_star];
+            if (k < 0) continue;
             pi_output[k] = cand.node_mapping_flat[P_exit[q_star]];
         }
     }
@@ -581,7 +621,10 @@ double SabreRouter::compute_lookahead_cost(
             double cost = d - 1.0;
             if (cost > 0.0) d_cost += config_.swap_cost * cost;
         }
-        total += std::pow(config_.E_alpha, depth) * d_cost;
+        const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+            ? alpha_weights_[depth]
+            : std::pow(config_.E_alpha, depth);
+        total += alpha * d_cost;
     }
     return config_.E_weight * total / static_cast<double>(E.size());
 }
@@ -597,7 +640,7 @@ double SabreRouter::score_candidate(
     const std::vector<std::pair<int,int>>& E,
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
-    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash>* swap_cache
+    SwapCache* swap_cache
 ) const {
     auto [swaps, output_perm] = transform_pi(cand, pi, reverse, swap_cache);
 
@@ -642,7 +685,10 @@ double SabreRouter::score_candidate(
                 double cost = d - 1.0;
                 if (cost > 0.0) d_cost += config_.swap_cost * cost;
             }
-            e_sum += std::pow(config_.E_alpha, depth) * d_cost;
+            const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+                ? alpha_weights_[depth]
+                : std::pow(config_.E_alpha, depth);
+            e_sum += alpha * d_cost;
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
     }
@@ -809,6 +855,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     std::vector<int> F;
     std::vector<int> queue;
     std::vector<uint8_t> resolved(num_partitions_, 0);
+    std::vector<uint8_t> in_F(num_partitions_, 0);
     int total_swaps = 0;
 
     // Split F_init into F (multi-qubit) and queue (single-qubit)
@@ -817,6 +864,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
             queue.push_back(p);
         } else {
             F.push_back(p);
+            in_F[p] = 1;
         }
     }
 
@@ -829,7 +877,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         resolved[p] = 1;
 
         for (int child : cg[p]) {
-            if (!resolved[child] && std::find(F.begin(), F.end(), child) == F.end()) {
+            if (!resolved[child] && !in_F[child]) {
                 bool parents_ok = true;
                 for (int par : pg[child]) {
                     if (!resolved[par]) { parents_ok = false; break; }
@@ -839,6 +887,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
                         queue.push_back(child);
                     } else {
                         F.push_back(child);
+                        in_F[child] = 1;
                     }
                 }
             }
@@ -846,7 +895,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     }
 
     // Swap cache for this search call (thread-local, on stack)
-    std::unordered_map<SwapCacheKey, std::pair<std::vector<std::pair<int,int>>, std::vector<int>>, SwapCacheKeyHash> swap_cache;
+    SwapCache swap_cache;
 
     // Main search loop
     while (!F.empty()) {
@@ -873,6 +922,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
 
         // Remove from F and mark resolved
         F.erase(std::remove(F.begin(), F.end(), best.partition_idx), F.end());
+        in_F[best.partition_idx] = 0;
         resolved[best.partition_idx] = 1;
 
         // Apply transform
@@ -882,7 +932,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
 
         // Update F with newly eligible children
         for (int child : cg[best.partition_idx]) {
-            if (!resolved[child] && std::find(F.begin(), F.end(), child) == F.end()) {
+            if (!resolved[child] && !in_F[child]) {
                 bool parents_ok = true;
                 for (int par : pg[child]) {
                     if (!resolved[par]) { parents_ok = false; break; }
@@ -898,7 +948,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
                             int gc = stack.back();
                             stack.pop_back();
                             
-                            if (!resolved[gc] && std::find(F.begin(), F.end(), gc) == F.end()) {
+                            if (!resolved[gc] && !in_F[gc]) {
                                 bool gc_parents_ok = true;
                                 for (int p_gc : pg[gc]) {
                                     if (!resolved[p_gc]) { gc_parents_ok = false; break; }
@@ -909,12 +959,14 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
                                         for (int ggc : cg[gc]) stack.push_back(ggc);
                                     } else {
                                         F.push_back(gc);
+                                        in_F[gc] = 1;
                                     }
                                 }
                             }
                         }
                     } else {
                         F.push_back(child);
+                        in_F[child] = 1;
                     }
                 }
             }
@@ -947,24 +999,25 @@ TrialResult SabreRouter::run_trial(
         pi = random_permutation(N_, rng_gen);
     }
 
+    auto F_rev = get_final_layer();
+    auto F_fwd = get_initial_layer();
+
     // Forward-backward-forward iterations
     for (int iteration = 0; iteration < n_iterations; iteration++) {
         // Backward pass: swap DAG/IDAG
-        auto F_rev = get_final_layer();
-        auto [pi_bwd, _] = heuristic_search(F_rev, pi, true, rng, canonical_data_rev_, IDAG_, DAG_);
-        pi = std::move(pi_bwd);
+        auto bwd_result = heuristic_search(F_rev, pi, true, rng, canonical_data_rev_, IDAG_, DAG_);
+        pi = std::move(bwd_result.first);
 
         // Forward pass (skip on last iteration)
         if (iteration < n_iterations - 1) {
-            auto F_fwd = get_initial_layer();
-            auto [pi_fwd, __] = heuristic_search(F_fwd, pi, false, rng, canonical_data_fwd_, DAG_, IDAG_);
-            pi = std::move(pi_fwd);
+            auto fwd_result = heuristic_search(F_fwd, pi, false, rng, canonical_data_fwd_, DAG_, IDAG_);
+            pi = std::move(fwd_result.first);
         }
     }
 
     // Final evaluation pass (deterministic, no RNG)
-    auto F_eval = get_initial_layer();
-    auto [pi_final, cost] = heuristic_search(F_eval, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_); // Evaluates cost using a copy under the hood
+    auto eval_result = heuristic_search(F_fwd, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_); // Evaluates cost using a copy under the hood
+    int cost = eval_result.second;
 
     return TrialResult{std::move(pi), cost}; // Return the pi from AFTER the backward pass, BEFORE the eval pass
 }
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 9f0662676..45b8e5044 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -8,6 +8,8 @@ pybind11 bindings for the SABRE routing engine.
 #include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 
+#include <utility>
+
 #include "sabre_router.hpp"
 
 namespace py = pybind11;
@@ -134,9 +136,9 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def(py::init(
             [](const SabreConfig& config,
                py::array_t<double, py::array::c_style> D_arr,
-               const std::vector<std::vector<int>>& adj,
-               const std::vector<std::vector<int>>& DAG,
-               const std::vector<std::vector<int>>& IDAG,
+               std::vector<std::vector<int>> adj,
+               std::vector<std::vector<int>> DAG,
+               std::vector<std::vector<int>> IDAG,
                py::list candidate_cache_py,
                py::list layout_partitions_py,
                py::dict canonical_data_fwd_py,
@@ -170,8 +172,8 @@ PYBIND11_MODULE(_sabre_router, m) {
                 auto cd_rev = extract_canonical_data(canonical_data_rev_py);
 
                 return new SabreRouter(
-                    config, N, D_flat, adj, DAG, IDAG,
-                    cc, lp, cd_fwd, cd_rev
+                    config, N, std::move(D_flat), std::move(adj), std::move(DAG), std::move(IDAG),
+                    std::move(cc), std::move(lp), std::move(cd_fwd), std::move(cd_rev)
                 );
             }),
             py::arg("config"),

From 31e8c86243a69c0382faf0c7e2848150e54a618b Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 12:59:20 +0200
Subject: [PATCH 142/232] Better estimation

---
 .../src-cpp/sabre_router/include/sabre_router.hpp  |  1 +
 squander/src-cpp/sabre_router/sabre_router.cpp     |  7 ++++---
 squander/synthesis/PartAM.py                       | 14 +++++++++-----
 squander/synthesis/bindings.cpp                    |  1 +
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index b84195f40..73e545e40 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -83,6 +83,7 @@ struct SabreConfig {
     double local_cost_weight = 0.1;
     double swap_cost = 15.0;
     double score_tolerance = 0.05;
+    int trial_swap_cnot_cost = 3;
     int sabre_iterations = 1;
     int n_layout_trials = 1;
     int random_seed = 42;
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 0849c0faf..9331b59ba 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -856,7 +856,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     std::vector<int> queue;
     std::vector<uint8_t> resolved(num_partitions_, 0);
     std::vector<uint8_t> in_F(num_partitions_, 0);
-    int total_swaps = 0;
+    int total_cost = 0;
 
     // Split F_init into F (multi-qubit) and queue (single-qubit)
     for (int p : F_init) {
@@ -927,7 +927,8 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
 
         // Apply transform
         auto [swaps, pi_new] = transform_pi(best, pi, reverse, &swap_cache);
-        total_swaps += static_cast<int>(swaps.size());
+        total_cost += config_.trial_swap_cnot_cost * static_cast<int>(swaps.size())
+                      + best.cnot_count;
         pi = std::move(pi_new);
 
         // Update F with newly eligible children
@@ -973,7 +974,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         }
     }
 
-    return {pi, total_swaps};
+    return {pi, total_cost};
 }
 
 // ---------------------------------------------------------------------------
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 9bcfb872a..a9b601d99 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -146,6 +146,7 @@ def __init__(self, config):
         self.config.setdefault('use_osr', 0)
         self.config.setdefault('n_layout_trials', 1)
         self.config.setdefault('score_tolerance', 0.05)
+        self.config.setdefault('trial_swap_cnot_cost', 3)
         self.config.setdefault('random_seed', 42)
         self.config.setdefault('cleanup', True)
         self.config.setdefault('prefilter_top_k', 50)
@@ -911,6 +912,7 @@ def _run_layout_trials_cpp(
         cfg.local_cost_weight = self.config.get('local_cost_weight', 0.1)
         cfg.swap_cost = self.config.get('swap_cost', 15.0)
         cfg.score_tolerance = self.config.get('score_tolerance', 0.05)
+        cfg.trial_swap_cnot_cost = self.config.get('trial_swap_cnot_cost', 3)
         cfg.sabre_iterations = n_iterations
         cfg.n_layout_trials = max(1, n_trials)
         cfg.random_seed = random_seed
@@ -1472,11 +1474,14 @@ def _heuristic_search_layout_only(
                     updates (used for backward passes in SABRE iterations).
 
         Returns:
-            (pi, total_swaps): final layout and total number of SWAPs accumulated.
+            (pi, total_cost): final layout and estimated routed CNOT cost.
+            The online heuristic still uses ``swap_cost`` for lookahead pressure;
+            this accounting is only used to rank completed layout trials.
         """
         F = list(F)
         resolved_partitions = [False] * len(DAG)
-        total_swaps = 0
+        total_cost = 0
+        swap_cnot_cost = self.config.get("trial_swap_cnot_cost", 3)
 
         queue = deque(
             p for p in F if self._partition_is_single(optimized_partitions[p])
@@ -1564,7 +1569,7 @@ def _heuristic_search_layout_only(
                 reverse=reverse,
                 adj=self._adj,
             )
-            total_swaps += len(swaps)
+            total_cost += swap_cnot_cost * len(swaps) + best.cnot_count
 
             for child in DAG[best.partition_idx]:
                 if not resolved_partitions[child] and child not in F:
@@ -1589,7 +1594,7 @@ def _heuristic_search_layout_only(
                         else:
                             F.append(child)
 
-        return pi, total_swaps    
+        return pi, total_cost    
     # ------------------------------------------------------------------------
     # Circuit Construction
     # ------------------------------------------------------------------------
@@ -2187,4 +2192,3 @@ def generate_DAG_levels(self, circuit):
             current_level = next_level
         
         return levels
-
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 45b8e5044..367d8ea76 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -127,6 +127,7 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("local_cost_weight", &SabreConfig::local_cost_weight)
         .def_readwrite("swap_cost", &SabreConfig::swap_cost)
         .def_readwrite("score_tolerance", &SabreConfig::score_tolerance)
+        .def_readwrite("trial_swap_cnot_cost", &SabreConfig::trial_swap_cnot_cost)
         .def_readwrite("sabre_iterations", &SabreConfig::sabre_iterations)
         .def_readwrite("n_layout_trials", &SabreConfig::n_layout_trials)
         .def_readwrite("random_seed", &SabreConfig::random_seed);

From bfa70d88c3387ef82b3e2e8a484db170d243dc02 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 23 Apr 2026 16:17:39 +0200
Subject: [PATCH 143/232] Try new routing things

---
 .../sabre_router/include/sabre_router.hpp     |  23 ++
 .../src-cpp/sabre_router/sabre_router.cpp     | 180 +++++++++++++--
 squander/synthesis/PartAM.py                  | 210 ++++++++++++++++--
 squander/synthesis/bindings.cpp               |   6 +-
 4 files changed, 374 insertions(+), 45 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 73e545e40..69cc75893 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -87,6 +87,11 @@ struct SabreConfig {
     int sabre_iterations = 1;
     int n_layout_trials = 1;
     int random_seed = 42;
+    // 0 = canonical future scoring, 1 = candidate-aware future scoring.
+    int future_cost_mode = 0;
+    double future_candidate_weight = 1.0;
+    int future_candidate_top_k = 0;
+    double order_weight = 0.0;
 };
 
 struct TrialResult {
@@ -227,6 +232,24 @@ class SabreRouter {
         SwapCache* swap_cache
     ) const;
 
+    double candidate_aware_future_cost(
+        int partition_idx,
+        const std::vector<int>& output_perm,
+        bool reverse
+    ) const;
+
+    double output_layout_quality_cost(
+        const std::vector<int>& output_perm,
+        const std::vector<int>& future_indices,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
+    double output_layout_quality_cost(
+        const std::vector<int>& output_perm,
+        const std::vector<std::pair<int,int>>& future_indices,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
     // Route and update layout for a candidate (port of transform_pi)
     std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
     transform_pi(
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 9331b59ba..0d277279c 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -629,6 +629,109 @@ double SabreRouter::compute_lookahead_cost(
     return config_.E_weight * total / static_cast<double>(E.size());
 }
 
+double SabreRouter::candidate_aware_future_cost(
+    int partition_idx,
+    const std::vector<int>& output_perm,
+    bool reverse
+) const {
+    if (partition_idx < 0 || partition_idx >= static_cast<int>(candidate_cache_.size())) {
+        return 0.0;
+    }
+
+    const auto& candidates = candidate_cache_[partition_idx];
+    if (candidates.empty()) {
+        return 0.0;
+    }
+
+    double best = std::numeric_limits<double>::infinity();
+
+    if (config_.future_candidate_top_k > 0 &&
+        config_.future_candidate_top_k < static_cast<int>(candidates.size())) {
+        std::vector<const CandidateData*> ranked;
+        ranked.reserve(candidates.size());
+        for (const auto& cand : candidates) {
+            ranked.push_back(&cand);
+        }
+        const int top_k = config_.future_candidate_top_k;
+        std::nth_element(
+            ranked.begin(),
+            ranked.begin() + top_k,
+            ranked.end(),
+            [&](const CandidateData* a, const CandidateData* b) {
+                const int a_swaps = estimate_swap_count(*a, output_perm, reverse);
+                const int b_swaps = estimate_swap_count(*b, output_perm, reverse);
+                if (a_swaps != b_swaps) return a_swaps < b_swaps;
+                return a->cnot_count < b->cnot_count;
+            }
+        );
+        for (int i = 0; i < top_k; i++) {
+            const auto& cand = *ranked[i];
+            const double estimate =
+                config_.swap_cost * static_cast<double>(estimate_swap_count(cand, output_perm, reverse)) +
+                config_.local_cost_weight * static_cast<double>(cand.cnot_count);
+            if (estimate < best) best = estimate;
+        }
+    } else {
+        for (const auto& cand : candidates) {
+            const double estimate =
+                config_.swap_cost * static_cast<double>(estimate_swap_count(cand, output_perm, reverse)) +
+                config_.local_cost_weight * static_cast<double>(cand.cnot_count);
+            if (estimate < best) best = estimate;
+        }
+    }
+
+    return std::isinf(best) ? 0.0 : best;
+}
+
+double SabreRouter::output_layout_quality_cost(
+    const std::vector<int>& output_perm,
+    const std::vector<int>& future_indices,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    if (future_indices.empty()) return 0.0;
+
+    double total = 0.0;
+    int count = 0;
+    for (int p_idx : future_indices) {
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) continue;
+
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            total += dist(output_perm[entry.edges_u[i]], output_perm[entry.edges_v[i]]);
+            count++;
+        }
+    }
+    return count > 0 ? total / static_cast<double>(count) : 0.0;
+}
+
+double SabreRouter::output_layout_quality_cost(
+    const std::vector<int>& output_perm,
+    const std::vector<std::pair<int,int>>& future_indices,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    if (future_indices.empty()) return 0.0;
+
+    double total = 0.0;
+    int count = 0;
+    for (auto [p_idx, depth] : future_indices) {
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) continue;
+
+        const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+            ? alpha_weights_[depth]
+            : std::pow(config_.E_alpha, depth);
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            total += alpha * dist(output_perm[entry.edges_u[i]], output_perm[entry.edges_v[i]]);
+            count++;
+        }
+    }
+    return count > 0 ? total / static_cast<double>(count) : 0.0;
+}
+
 // ---------------------------------------------------------------------------
 // score_candidate (LightSABRE scoring)
 // ---------------------------------------------------------------------------
@@ -653,46 +756,77 @@ double SabreRouter::score_candidate(
     double f_sum = 0.0;
     for (int p_idx : F_snapshot) {
         if (p_idx == cand_idx) continue;
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        const auto& entry = it->second;
-        n_other++;
-        if (entry.edges_u.empty()) continue;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            int u = entry.edges_u[i];
-            int v = entry.edges_v[i];
-            double d = dist(output_perm[u], output_perm[v]);
-            double cost = d - 1.0;
-            if (cost > 0.0) f_sum += config_.swap_cost * cost;
-        }
-    }
-    if (n_other > 0) score += f_sum / static_cast<double>(n_other);
-
-    // E cost: alpha^depth-decayed lookahead
-    if (!E.empty()) {
-        double e_sum = 0.0;
-        for (auto [p_idx, depth] : E) {
-            if (p_idx == cand_idx) continue;
+        if (config_.future_cost_mode == 1) {
+            n_other++;
+            f_sum += config_.future_candidate_weight *
+                     candidate_aware_future_cost(p_idx, output_perm, reverse);
+        } else {
             auto it = canonical_data.find(p_idx);
             if (it == canonical_data.end()) continue;
             const auto& entry = it->second;
+            n_other++;
             if (entry.edges_u.empty()) continue;
-            double d_cost = 0.0;
             for (size_t i = 0; i < entry.edges_u.size(); i++) {
                 int u = entry.edges_u[i];
                 int v = entry.edges_v[i];
                 double d = dist(output_perm[u], output_perm[v]);
                 double cost = d - 1.0;
-                if (cost > 0.0) d_cost += config_.swap_cost * cost;
+                if (cost > 0.0) f_sum += config_.swap_cost * cost;
             }
+        }
+    }
+    if (n_other > 0) score += f_sum / static_cast<double>(n_other);
+
+    // E cost: alpha^depth-decayed lookahead
+    if (!E.empty()) {
+        double e_sum = 0.0;
+        for (auto [p_idx, depth] : E) {
+            if (p_idx == cand_idx) continue;
             const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
                 ? alpha_weights_[depth]
                 : std::pow(config_.E_alpha, depth);
-            e_sum += alpha * d_cost;
+            if (config_.future_cost_mode == 1) {
+                e_sum += alpha * config_.future_candidate_weight *
+                         candidate_aware_future_cost(p_idx, output_perm, reverse);
+            } else {
+                auto it = canonical_data.find(p_idx);
+                if (it == canonical_data.end()) continue;
+                const auto& entry = it->second;
+                if (entry.edges_u.empty()) continue;
+                double d_cost = 0.0;
+                for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                    int u = entry.edges_u[i];
+                    int v = entry.edges_v[i];
+                    double d = dist(output_perm[u], output_perm[v]);
+                    double cost = d - 1.0;
+                    if (cost > 0.0) d_cost += config_.swap_cost * cost;
+                }
+                e_sum += alpha * d_cost;
+            }
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
     }
 
+    if (config_.order_weight != 0.0) {
+        std::vector<int> future_indices;
+        future_indices.reserve(F_snapshot.size());
+        for (int p_idx : F_snapshot) {
+            if (p_idx != cand_idx) future_indices.push_back(p_idx);
+        }
+        score += config_.order_weight *
+                 output_layout_quality_cost(output_perm, future_indices, canonical_data);
+
+        if (!E.empty()) {
+            std::vector<std::pair<int,int>> future_E;
+            future_E.reserve(E.size());
+            for (auto [p_idx, depth] : E) {
+                if (p_idx != cand_idx) future_E.push_back({p_idx, depth});
+            }
+            score += config_.order_weight * config_.E_weight *
+                     output_layout_quality_cost(output_perm, future_E, canonical_data);
+        }
+    }
+
     return score;
 }
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index a9b601d99..475ec5f2a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -151,6 +151,10 @@ def __init__(self, config):
         self.config.setdefault('cleanup', True)
         self.config.setdefault('prefilter_top_k', 50)
         self.config.setdefault('cleanup_top_k', 3)
+        self.config.setdefault('future_cost_mode', 'canonical')
+        self.config.setdefault('future_candidate_weight', 1.0)
+        self.config.setdefault('future_candidate_top_k', 0)
+        self.config.setdefault('order_weight', 0.0)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -916,6 +920,15 @@ def _run_layout_trials_cpp(
         cfg.sabre_iterations = n_iterations
         cfg.n_layout_trials = max(1, n_trials)
         cfg.random_seed = random_seed
+        future_cost_mode = self.config.get('future_cost_mode', 'canonical')
+        cfg.future_cost_mode = 1 if future_cost_mode == 'candidate_aware' else 0
+        cfg.future_candidate_weight = self.config.get(
+            'future_candidate_weight', 1.0
+        )
+        cfg.future_candidate_top_k = self.config.get(
+            'future_candidate_top_k', 0
+        )
+        cfg.order_weight = self.config.get('order_weight', 0.0)
 
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
@@ -1401,6 +1414,17 @@ def Heuristic_Search(
                     adj=self._adj,
                     local_cost_weight=self.config.get("local_cost_weight", 0.1),
                     swap_cost=self.config.get("swap_cost", 15.0),
+                    candidate_cache=candidate_cache,
+                    future_cost_mode=self.config.get(
+                        "future_cost_mode", "canonical"
+                    ),
+                    future_candidate_weight=self.config.get(
+                        "future_candidate_weight", 1.0
+                    ),
+                    future_candidate_top_k=self.config.get(
+                        "future_candidate_top_k", 0
+                    ),
+                    order_weight=self.config.get("order_weight", 0.0),
                 )
                 for partition_candidate in partition_candidates
             ]
@@ -1552,6 +1576,17 @@ def _heuristic_search_layout_only(
                     adj=self._adj,
                     local_cost_weight=self.config.get("local_cost_weight", 0.1),
                     swap_cost=self.config.get("swap_cost", 15.0),
+                    candidate_cache=candidate_cache,
+                    future_cost_mode=self.config.get(
+                        "future_cost_mode", "canonical"
+                    ),
+                    future_candidate_weight=self.config.get(
+                        "future_candidate_weight", 1.0
+                    ),
+                    future_candidate_top_k=self.config.get(
+                        "future_candidate_top_k", 0
+                    ),
+                    order_weight=self.config.get("order_weight", 0.0),
                 )
                 for pc in partition_candidates
             ]
@@ -1672,11 +1707,101 @@ def _build_canonical_neighbor_data(scoring_partitions, reverse=False):
             data[idx] = {'edges_u': eu, 'edges_v': ev, 'cnot': best_cnot}
         return data
 
+    @staticmethod
+    def _candidate_aware_future_cost(
+        partition_idx,
+        output_perm,
+        D,
+        candidate_cache,
+        reverse=False,
+        local_cost_weight=0.1,
+        swap_cost=15.0,
+        future_candidate_top_k=0,
+    ):
+        """Estimate a future partition by its best existing candidate.
+
+        This does not synthesize any extra candidates.  It only asks: from the
+        candidate output layout, how expensive would each already stored
+        candidate be to enter?
+        """
+        if candidate_cache is None:
+            return 0.0
+        if partition_idx < 0 or partition_idx >= len(candidate_cache):
+            return 0.0
+
+        candidates = candidate_cache[partition_idx]
+        if not candidates:
+            return 0.0
+
+        if future_candidate_top_k and future_candidate_top_k > 0:
+            candidates = sorted(
+                candidates,
+                key=lambda pc: (
+                    pc.estimate_swap_count(output_perm, D, reverse=reverse),
+                    pc.cnot_count,
+                ),
+            )[:future_candidate_top_k]
+
+        best = float("inf")
+        for candidate in candidates:
+            estimate = (
+                swap_cost
+                * candidate.estimate_swap_count(output_perm, D, reverse=reverse)
+                + local_cost_weight * candidate.cnot_count
+            )
+            if estimate < best:
+                best = estimate
+
+        return 0.0 if best == float("inf") else best
+
+    @staticmethod
+    def _output_layout_quality_cost(
+        output_perm_arr,
+        future_indices,
+        canonical_data,
+        D_arr,
+        alpha=0.9,
+        weighted_depths=None,
+    ):
+        """Penalize output layouts that leave future interaction edges far apart."""
+        if not future_indices or canonical_data is None:
+            return 0.0
+
+        total = 0.0
+        count = 0
+        for item in future_indices:
+            if isinstance(item, tuple):
+                partition_idx, depth = item
+            else:
+                partition_idx, depth = item, 0
+
+            entry = canonical_data.get(partition_idx)
+            if entry is None:
+                continue
+            eu = entry["edges_u"]
+            if eu is None:
+                continue
+
+            phys_u = output_perm_arr[eu]
+            phys_v = output_perm_arr[entry["edges_v"]]
+            weight = weighted_depths.get(partition_idx, 1.0) if weighted_depths else 1.0
+            if depth:
+                weight *= alpha ** depth
+            total += weight * D_arr[phys_u, phys_v].sum()
+            count += len(eu)
+
+        return total / count if count else 0.0
+
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
                                   canonical_data=None, adj=None,
-                                  local_cost_weight=0.1, swap_cost=15.0):
+                                  local_cost_weight=0.1, swap_cost=15.0,
+                                  candidate_cache=None,
+                                  future_cost_mode="canonical",
+                                  future_candidate_weight=1.0,
+                                  future_candidate_top_k=0,
+                                  order_weight=0.0):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = swap_cost * |swaps|
@@ -1703,16 +1828,29 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
         for partition_idx in F:
             if partition_idx == cand_idx:
                 continue
-            entry = canonical_data.get(partition_idx)
-            if entry is None:
-                continue
-            n_other += 1
-            eu = entry['edges_u']
-            if eu is None:
-                continue
-            phys_u = output_perm_arr[eu]
-            phys_v = output_perm_arr[entry['edges_v']]
-            f_sum += swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+            if future_cost_mode == "candidate_aware":
+                n_other += 1
+                f_sum += future_candidate_weight * qgd_Partition_Aware_Mapping._candidate_aware_future_cost(
+                    partition_idx,
+                    output_perm,
+                    D,
+                    candidate_cache,
+                    reverse=reverse,
+                    local_cost_weight=local_cost_weight,
+                    swap_cost=swap_cost,
+                    future_candidate_top_k=future_candidate_top_k,
+                )
+            else:
+                entry = canonical_data.get(partition_idx)
+                if entry is None:
+                    continue
+                n_other += 1
+                eu = entry['edges_u']
+                if eu is None:
+                    continue
+                phys_u = output_perm_arr[eu]
+                phys_v = output_perm_arr[entry['edges_v']]
+                f_sum += swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
         if n_other > 0:
             score += f_sum / n_other
 
@@ -1722,18 +1860,48 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             for partition_idx, depth in E:
                 if partition_idx == cand_idx:
                     continue
-                entry = canonical_data.get(partition_idx)
-                if entry is None:
-                    continue
-                eu = entry['edges_u']
-                if eu is None:
-                    continue
-                phys_u = output_perm_arr[eu]
-                phys_v = output_perm_arr[entry['edges_v']]
-                d_cost = swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
-                e_sum += (alpha ** depth) * d_cost
+                if future_cost_mode == "candidate_aware":
+                    d_cost = qgd_Partition_Aware_Mapping._candidate_aware_future_cost(
+                        partition_idx,
+                        output_perm,
+                        D,
+                        candidate_cache,
+                        reverse=reverse,
+                        local_cost_weight=local_cost_weight,
+                        swap_cost=swap_cost,
+                        future_candidate_top_k=future_candidate_top_k,
+                    )
+                    e_sum += (alpha ** depth) * future_candidate_weight * d_cost
+                else:
+                    entry = canonical_data.get(partition_idx)
+                    if entry is None:
+                        continue
+                    eu = entry['edges_u']
+                    if eu is None:
+                        continue
+                    phys_u = output_perm_arr[eu]
+                    phys_v = output_perm_arr[entry['edges_v']]
+                    d_cost = swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+                    e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
 
+        if order_weight:
+            future_indices = [idx for idx in F if idx != cand_idx]
+            score += order_weight * qgd_Partition_Aware_Mapping._output_layout_quality_cost(
+                output_perm_arr,
+                future_indices,
+                canonical_data,
+                D_arr,
+            )
+            if E:
+                score += order_weight * W * qgd_Partition_Aware_Mapping._output_layout_quality_cost(
+                    output_perm_arr,
+                    [(idx, depth) for idx, depth in E if idx != cand_idx],
+                    canonical_data,
+                    D_arr,
+                    alpha=alpha,
+                )
+
         return score
 
     # ------------------------------------------------------------------------
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 367d8ea76..2aa16fa65 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -130,7 +130,11 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("trial_swap_cnot_cost", &SabreConfig::trial_swap_cnot_cost)
         .def_readwrite("sabre_iterations", &SabreConfig::sabre_iterations)
         .def_readwrite("n_layout_trials", &SabreConfig::n_layout_trials)
-        .def_readwrite("random_seed", &SabreConfig::random_seed);
+        .def_readwrite("random_seed", &SabreConfig::random_seed)
+        .def_readwrite("future_cost_mode", &SabreConfig::future_cost_mode)
+        .def_readwrite("future_candidate_weight", &SabreConfig::future_candidate_weight)
+        .def_readwrite("future_candidate_top_k", &SabreConfig::future_candidate_top_k)
+        .def_readwrite("order_weight", &SabreConfig::order_weight);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From bb3d08ae8a1739949fe45bee313def00b0950a80 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 00:25:54 +0200
Subject: [PATCH 144/232] update routing

---
 .../sabre_router/include/sabre_router.hpp     |  82 ++-
 .../src-cpp/sabre_router/sabre_router.cpp     | 657 +++++++++++-------
 squander/synthesis/PartAM.py                  | 458 ++++++------
 squander/synthesis/bindings.cpp               |   9 +-
 4 files changed, 687 insertions(+), 519 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 69cc75893..e2e739e42 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -87,11 +87,11 @@ struct SabreConfig {
     int sabre_iterations = 1;
     int n_layout_trials = 1;
     int random_seed = 42;
-    // 0 = canonical future scoring, 1 = candidate-aware future scoring.
-    int future_cost_mode = 0;
-    double future_candidate_weight = 1.0;
-    int future_candidate_top_k = 0;
-    double order_weight = 0.0;
+    double decay_delta = 0.1;
+    int decay_reset_interval = 5;
+    bool release_valve_enabled = true;
+    int release_valve_threshold = 20;
+    double path_tiebreak_weight = 0.2;
 };
 
 struct TrialResult {
@@ -99,6 +99,23 @@ struct TrialResult {
     int total_cost;
 };
 
+struct NeighborEdge {
+    int u_idx;
+    int v_idx;
+    double weight;
+};
+
+struct NeighborInfo {
+    std::vector<int> neighbor_vqs;
+    std::vector<int> initial_pos;
+    std::vector<NeighborEdge> edges;
+    double weight = 0.0;
+
+    bool uses_tiebreak() const {
+        return weight > 0.0 && !edges.empty();
+    }
+};
+
 // ---------------------------------------------------------------------------
 // Swap cache key for deduplication within a single heuristic_search call
 // ---------------------------------------------------------------------------
@@ -203,7 +220,8 @@ class SabreRouter {
         const std::vector<int>& qbit_map_vals,
         const std::vector<int>& node_mapping_flat,
         const std::vector<int>& P_route_inv,
-        SwapCache* swap_cache
+        SwapCache* swap_cache,
+        const NeighborInfo* neighbor_info = nullptr
     ) const;
 
     // Lower-bound swap estimate (port of estimate_swap_count)
@@ -229,25 +247,8 @@ class SabreRouter {
         const std::vector<std::pair<int,int>>& E,
         bool reverse,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
-        SwapCache* swap_cache
-    ) const;
-
-    double candidate_aware_future_cost(
-        int partition_idx,
-        const std::vector<int>& output_perm,
-        bool reverse
-    ) const;
-
-    double output_layout_quality_cost(
-        const std::vector<int>& output_perm,
-        const std::vector<int>& future_indices,
-        const std::unordered_map<int, CanonicalEntry>& canonical_data
-    ) const;
-
-    double output_layout_quality_cost(
-        const std::vector<int>& output_perm,
-        const std::vector<std::pair<int,int>>& future_indices,
-        const std::unordered_map<int, CanonicalEntry>& canonical_data
+        SwapCache* swap_cache,
+        const std::vector<double>* decay = nullptr
     ) const;
 
     // Route and update layout for a candidate (port of transform_pi)
@@ -256,20 +257,38 @@ class SabreRouter {
         const CandidateData& cand,
         const std::vector<int>& pi,
         bool reverse,
-        SwapCache* swap_cache
+        SwapCache* swap_cache,
+        const NeighborInfo* neighbor_info = nullptr
     ) const;
 
-    // Release valve for stuck front layers
-    std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
-    release_valve(
-        const std::vector<int>& F,
+    NeighborInfo build_neighbor_info(
+        int exclude_partition_idx,
+        const std::vector<int>& F_snapshot,
+        const std::vector<std::pair<int,int>>& E,
         const std::vector<int>& pi,
         const std::unordered_map<int, CanonicalEntry>& canonical_data
     ) const;
 
-    // BFS shortest path on adjacency graph
+    double decay_factor_for_swaps(
+        const std::vector<std::pair<int,int>>& swaps,
+        const std::vector<double>& decay
+    ) const;
+
+    void apply_decay_for_swaps(
+        const std::vector<std::pair<int,int>>& swaps,
+        std::vector<double>& decay
+    ) const;
+
+    void reset_decay(std::vector<double>& decay) const;
+
     std::vector<int> bfs_shortest_path(int src, int dst) const;
 
+    std::pair<std::vector<std::pair<int,int>>, std::vector<int>> release_valve(
+        const std::vector<int>& F,
+        const std::vector<int>& pi,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
     // Apply a list of SWAPs to pi
     std::vector<int> apply_swaps_to_pi(
         const std::vector<int>& pi,
@@ -350,6 +369,7 @@ class SabreRouter {
     std::unordered_map<int, CanonicalEntry> canonical_data_fwd_;
     std::unordered_map<int, CanonicalEntry> canonical_data_rev_;
     std::vector<double> alpha_weights_;
+    double max_finite_distance_ = 1.0;
 };
 
 } // namespace squander::routing
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 0d277279c..cd32c7275 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -129,6 +129,13 @@ SabreRouter::SabreRouter(
             alpha_weights_[depth] = alpha_weights_[depth - 1] * config_.E_alpha;
         }
     }
+
+    max_finite_distance_ = 1.0;
+    for (double d : D_) {
+        if (std::isfinite(d) && d > max_finite_distance_) {
+            max_finite_distance_ = d;
+        }
+    }
 }
 
 // ---------------------------------------------------------------------------
@@ -183,11 +190,141 @@ void SabreRouter::build_target_positions(
 }
 
 // ---------------------------------------------------------------------------
-// BFS shortest path
+// apply_swaps_to_pi
 // ---------------------------------------------------------------------------
 
+std::vector<int> SabreRouter::apply_swaps_to_pi(
+    const std::vector<int>& pi,
+    const std::vector<std::pair<int,int>>& swaps
+) const {
+    std::vector<int> result(pi);
+    std::vector<int> p2v(N_);
+    for (int q = 0; q < N_; q++) p2v[result[q]] = q;
+
+    for (auto [P1, P2] : swaps) {
+        int q1 = p2v[P1];
+        int q2 = p2v[P2];
+        p2v[P1] = q2;
+        p2v[P2] = q1;
+        result[q1] = P2;
+        result[q2] = P1;
+    }
+    return result;
+}
+
+NeighborInfo SabreRouter::build_neighbor_info(
+    int exclude_partition_idx,
+    const std::vector<int>& F_snapshot,
+    const std::vector<std::pair<int,int>>& E,
+    const std::vector<int>& pi,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    NeighborInfo info;
+    info.weight = config_.path_tiebreak_weight;
+    if (info.weight <= 0.0) {
+        return info;
+    }
+
+    std::unordered_map<int, int> q_to_idx;
+    std::unordered_map<uint64_t, double> edge_weights;
+    std::unordered_map<uint64_t, std::pair<int, int>> edge_nodes;
+
+    auto ensure_qubit = [&](int q) -> int {
+        auto it = q_to_idx.find(q);
+        if (it != q_to_idx.end()) {
+            return it->second;
+        }
+        const int idx = static_cast<int>(info.neighbor_vqs.size());
+        q_to_idx.emplace(q, idx);
+        info.neighbor_vqs.push_back(q);
+        info.initial_pos.push_back(pi[q]);
+        return idx;
+    };
+
+    auto add_partition_edges = [&](int partition_idx, double weight) {
+        if (partition_idx == exclude_partition_idx || weight <= 0.0) {
+            return;
+        }
+        auto it = canonical_data.find(partition_idx);
+        if (it == canonical_data.end()) {
+            return;
+        }
+        const auto& entry = it->second;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            const int u = entry.edges_u[i];
+            const int v = entry.edges_v[i];
+            ensure_qubit(u);
+            ensure_qubit(v);
+            const int lo = std::min(u, v);
+            const int hi = std::max(u, v);
+            const uint64_t key =
+                (static_cast<uint64_t>(static_cast<uint32_t>(lo)) << 32)
+                | static_cast<uint32_t>(hi);
+            edge_weights[key] += weight;
+            edge_nodes[key] = {u, v};
+        }
+    };
+
+    for (int partition_idx : F_snapshot) {
+        add_partition_edges(partition_idx, 1.0);
+    }
+
+    for (auto [partition_idx, depth] : E) {
+        const double alpha =
+            (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+                ? alpha_weights_[depth]
+                : std::pow(config_.E_alpha, depth);
+        add_partition_edges(partition_idx, config_.E_weight * alpha);
+    }
+
+    info.edges.reserve(edge_weights.size());
+    for (const auto& [key, weight] : edge_weights) {
+        (void)key;
+        const auto nodes = edge_nodes.at(key);
+        info.edges.push_back(
+            NeighborEdge{
+                q_to_idx.at(nodes.first),
+                q_to_idx.at(nodes.second),
+                weight,
+            }
+        );
+    }
+
+    return info;
+}
+
+double SabreRouter::decay_factor_for_swaps(
+    const std::vector<std::pair<int,int>>& swaps,
+    const std::vector<double>& decay
+) const {
+    double factor = 1.0;
+    for (auto [u, v] : swaps) {
+        factor = std::max(factor, std::max(decay[u], decay[v]));
+    }
+    return factor;
+}
+
+void SabreRouter::apply_decay_for_swaps(
+    const std::vector<std::pair<int,int>>& swaps,
+    std::vector<double>& decay
+) const {
+    if (config_.decay_delta <= 0.0) {
+        return;
+    }
+    for (auto [u, v] : swaps) {
+        decay[u] += config_.decay_delta;
+        decay[v] += config_.decay_delta;
+    }
+}
+
+void SabreRouter::reset_decay(std::vector<double>& decay) const {
+    std::fill(decay.begin(), decay.end(), 1.0);
+}
+
 std::vector<int> SabreRouter::bfs_shortest_path(int src, int dst) const {
-    if (src == dst) return {src};
+    if (src == dst) {
+        return {src};
+    }
 
     std::vector<int> parent(N_, -1);
     std::vector<uint8_t> visited(N_, 0);
@@ -196,52 +333,97 @@ std::vector<int> SabreRouter::bfs_shortest_path(int src, int dst) const {
     visited[src] = 1;
 
     while (!queue.empty()) {
-        int node = queue.front();
+        const int node = queue.front();
         queue.pop_front();
         for (int nb : adj_[node]) {
-            if (!visited[nb]) {
-                visited[nb] = 1;
-                parent[nb] = node;
-                if (nb == dst) {
-                    // Reconstruct path
-                    std::vector<int> path;
-                    int cur = dst;
-                    while (cur != src) {
-                        path.push_back(cur);
-                        cur = parent[cur];
-                    }
-                    path.push_back(src);
-                    std::reverse(path.begin(), path.end());
-                    return path;
+            if (visited[nb]) {
+                continue;
+            }
+            visited[nb] = 1;
+            parent[nb] = node;
+            if (nb == dst) {
+                std::vector<int> path;
+                int cur = dst;
+                while (cur != src) {
+                    path.push_back(cur);
+                    cur = parent[cur];
                 }
-                queue.push_back(nb);
+                path.push_back(src);
+                std::reverse(path.begin(), path.end());
+                return path;
             }
+            queue.push_back(nb);
         }
     }
-    return {}; // unreachable
-}
 
-// ---------------------------------------------------------------------------
-// apply_swaps_to_pi
-// ---------------------------------------------------------------------------
+    return {};
+}
 
-std::vector<int> SabreRouter::apply_swaps_to_pi(
+std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::release_valve(
+    const std::vector<int>& F,
     const std::vector<int>& pi,
-    const std::vector<std::pair<int,int>>& swaps
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
-    std::vector<int> result(pi);
-    std::vector<int> p2v(N_);
-    for (int q = 0; q < N_; q++) p2v[result[q]] = q;
+    double best_worst_dist = std::numeric_limits<double>::infinity();
+    int best_u = -1;
+    int best_v = -1;
 
-    for (auto [P1, P2] : swaps) {
-        int q1 = p2v[P1];
-        int q2 = p2v[P2];
-        p2v[P1] = q2;
-        p2v[P2] = q1;
-        result[q1] = P2;
-        result[q2] = P1;
+    for (int partition_idx : F) {
+        auto it = canonical_data.find(partition_idx);
+        if (it == canonical_data.end()) {
+            continue;
+        }
+        const auto& entry = it->second;
+        if (entry.edges_u.empty()) {
+            continue;
+        }
+
+        double worst_dist = 0.0;
+        int worst_u = -1;
+        int worst_v = -1;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            const int u = entry.edges_u[i];
+            const int v = entry.edges_v[i];
+            const double d = dist(pi[u], pi[v]);
+            if (d > worst_dist) {
+                worst_dist = d;
+                worst_u = u;
+                worst_v = v;
+            }
+        }
+
+        if (worst_dist <= 1.0 || worst_u < 0) {
+            continue;
+        }
+
+        if (worst_dist < best_worst_dist) {
+            best_worst_dist = worst_dist;
+            best_u = worst_u;
+            best_v = worst_v;
+        }
     }
-    return result;
+
+    if (best_u < 0) {
+        return {{}, pi};
+    }
+
+    const auto path = bfs_shortest_path(pi[best_u], pi[best_v]);
+    if (path.size() < 2) {
+        return {{}, pi};
+    }
+
+    const int k = static_cast<int>(path.size()) - 1;
+    const int m = k / 2;
+    std::vector<std::pair<int,int>> swaps;
+    for (int i = 0; i < m; i++) {
+        swaps.push_back({path[i], path[i + 1]});
+    }
+    for (int i = k; i > m + 1; i--) {
+        swaps.push_back({path[i], path[i - 1]});
+    }
+
+    auto pi_new = apply_swaps_to_pi(pi, swaps);
+    return {swaps, pi_new};
 }
 
 // ---------------------------------------------------------------------------
@@ -321,7 +503,8 @@ SabreRouter::find_constrained_swaps(
     const std::vector<int>& qbit_map_vals,
     const std::vector<int>& node_mapping_flat,
     const std::vector<int>& P_route_inv,
-    SwapCache* swap_cache
+    SwapCache* swap_cache,
+    const NeighborInfo* neighbor_info
 ) const {
     int k = static_cast<int>(qbit_map_keys.size());
     std::vector<int> target_positions(k);
@@ -349,8 +532,10 @@ SabreRouter::find_constrained_swaps(
     int64_t initial_packed = pack_state(initial_positions, N_);
     int64_t target_packed = pack_state(target_positions, N_);
     const SwapCacheKey cache_key{initial_packed, target_packed, k};
+    const bool use_neighbor =
+        neighbor_info != nullptr && neighbor_info->uses_tiebreak();
 
-    if (swap_cache) {
+    if (swap_cache && !use_neighbor) {
         auto it = swap_cache->find(cache_key);
         if (it != swap_cache->end()) {
             // Replay cached swaps on current pi
@@ -370,9 +555,41 @@ SabreRouter::find_constrained_swaps(
     }
     h0 /= 2.0;
 
-    // Priority queue: (f_score, g_score, counter, packed_state)
+    double total_edge_weight = 0.0;
+    if (use_neighbor) {
+        for (const auto& edge : neighbor_info->edges) {
+            total_edge_weight += edge.weight;
+        }
+    }
+    const double neighbor_norm = std::max(
+        1.0,
+        total_edge_weight * std::max(1.0, max_finite_distance_)
+    );
+    auto neighbor_heuristic = [&](const std::vector<int>& neighbor_positions) {
+        if (!use_neighbor) {
+            return 0.0;
+        }
+        double total = 0.0;
+        for (const auto& edge : neighbor_info->edges) {
+            total += edge.weight
+                * dist(
+                    neighbor_positions[edge.u_idx],
+                    neighbor_positions[edge.v_idx]
+                );
+        }
+        return (neighbor_info->weight * total) / neighbor_norm;
+    };
+
+    std::vector<int> initial_neighbor_positions;
+    double nh0 = 0.0;
+    if (use_neighbor) {
+        initial_neighbor_positions = neighbor_info->initial_pos;
+        nh0 = neighbor_heuristic(initial_neighbor_positions);
+    }
+
+    // Priority queue: (f_score, g_score, counter, packed_state, neighbor_state)
     // Counter provides FIFO tie-breaking, matching Python's counter variable
-    using PQEntry = std::tuple<double, int, uint64_t, int64_t>;
+    using PQEntry = std::tuple<double, int, uint64_t, int64_t, std::vector<int>>;
     std::priority_queue<PQEntry, std::vector<PQEntry>, std::greater<PQEntry>> pq;
     uint64_t counter = 0;
 
@@ -383,7 +600,7 @@ SabreRouter::find_constrained_swaps(
     visited.reserve(256);
     parent.reserve(256);
 
-    pq.push({h0, 0, counter++, initial_packed});
+    pq.push({h0 + nh0, 0, counter++, initial_packed, initial_neighbor_positions});
     visited[initial_packed] = 0;
     parent[initial_packed] = {-1, {-1, -1}};
 
@@ -396,6 +613,7 @@ SabreRouter::find_constrained_swaps(
         pq.pop();
         int g = std::get<1>(entry);
         int64_t packed = std::get<3>(entry);
+        const std::vector<int>& neighbor_positions = std::get<4>(entry);
 
         if (packed == target_packed) {
             // Reconstruct swap path
@@ -411,7 +629,7 @@ SabreRouter::find_constrained_swaps(
             auto result_pi = apply_swaps_to_pi(pi, path);
 
             // Store in cache
-            if (swap_cache) {
+            if (swap_cache && !use_neighbor) {
                 (*swap_cache)[cache_key] = path;
             }
 
@@ -448,6 +666,26 @@ SabreRouter::find_constrained_swaps(
                     continue;
                 }
 
+                std::vector<int> new_neighbor_positions = neighbor_positions;
+                double new_nh = 0.0;
+                if (use_neighbor) {
+                    std::unordered_map<int, int> phys_to_neighbor_idx;
+                    phys_to_neighbor_idx.reserve(new_neighbor_positions.size());
+                    for (int idx = 0; idx < static_cast<int>(new_neighbor_positions.size()); idx++) {
+                        phys_to_neighbor_idx.emplace(new_neighbor_positions[idx], idx);
+                    }
+
+                    auto it_nb = phys_to_neighbor_idx.find(nb);
+                    if (it_nb != phys_to_neighbor_idx.end()) {
+                        new_neighbor_positions[it_nb->second] = p;
+                    }
+                    auto it_p = phys_to_neighbor_idx.find(p);
+                    if (it_p != phys_to_neighbor_idx.end()) {
+                        new_neighbor_positions[it_p->second] = nb;
+                    }
+                    new_nh = neighbor_heuristic(new_neighbor_positions);
+                }
+
                 // Compute heuristic
                 double h = 0.0;
                 for (int j = 0; j < k; j++) {
@@ -457,7 +695,13 @@ SabreRouter::find_constrained_swaps(
 
                 visited[new_packed] = new_g;
                 parent[new_packed] = {packed, {std::min(p, nb), std::max(p, nb)}};
-                pq.push({new_g + h, new_g, counter++, new_packed});
+                pq.push({
+                    new_g + h + new_nh,
+                    new_g,
+                    counter++,
+                    new_packed,
+                    std::move(new_neighbor_positions),
+                });
             }
         }
     }
@@ -475,7 +719,8 @@ SabreRouter::transform_pi(
     const CandidateData& cand,
     const std::vector<int>& pi,
     bool reverse,
-    SwapCache* swap_cache
+    SwapCache* swap_cache,
+    const NeighborInfo* neighbor_info
 ) const {
     const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
 
@@ -486,7 +731,8 @@ SabreRouter::transform_pi(
         cand.qbit_map_vals_sorted,
         cand.node_mapping_flat,
         P_route_inv,
-        swap_cache
+        swap_cache,
+        neighbor_info
     );
 
     // Update output positions using P_exit
@@ -629,109 +875,6 @@ double SabreRouter::compute_lookahead_cost(
     return config_.E_weight * total / static_cast<double>(E.size());
 }
 
-double SabreRouter::candidate_aware_future_cost(
-    int partition_idx,
-    const std::vector<int>& output_perm,
-    bool reverse
-) const {
-    if (partition_idx < 0 || partition_idx >= static_cast<int>(candidate_cache_.size())) {
-        return 0.0;
-    }
-
-    const auto& candidates = candidate_cache_[partition_idx];
-    if (candidates.empty()) {
-        return 0.0;
-    }
-
-    double best = std::numeric_limits<double>::infinity();
-
-    if (config_.future_candidate_top_k > 0 &&
-        config_.future_candidate_top_k < static_cast<int>(candidates.size())) {
-        std::vector<const CandidateData*> ranked;
-        ranked.reserve(candidates.size());
-        for (const auto& cand : candidates) {
-            ranked.push_back(&cand);
-        }
-        const int top_k = config_.future_candidate_top_k;
-        std::nth_element(
-            ranked.begin(),
-            ranked.begin() + top_k,
-            ranked.end(),
-            [&](const CandidateData* a, const CandidateData* b) {
-                const int a_swaps = estimate_swap_count(*a, output_perm, reverse);
-                const int b_swaps = estimate_swap_count(*b, output_perm, reverse);
-                if (a_swaps != b_swaps) return a_swaps < b_swaps;
-                return a->cnot_count < b->cnot_count;
-            }
-        );
-        for (int i = 0; i < top_k; i++) {
-            const auto& cand = *ranked[i];
-            const double estimate =
-                config_.swap_cost * static_cast<double>(estimate_swap_count(cand, output_perm, reverse)) +
-                config_.local_cost_weight * static_cast<double>(cand.cnot_count);
-            if (estimate < best) best = estimate;
-        }
-    } else {
-        for (const auto& cand : candidates) {
-            const double estimate =
-                config_.swap_cost * static_cast<double>(estimate_swap_count(cand, output_perm, reverse)) +
-                config_.local_cost_weight * static_cast<double>(cand.cnot_count);
-            if (estimate < best) best = estimate;
-        }
-    }
-
-    return std::isinf(best) ? 0.0 : best;
-}
-
-double SabreRouter::output_layout_quality_cost(
-    const std::vector<int>& output_perm,
-    const std::vector<int>& future_indices,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data
-) const {
-    if (future_indices.empty()) return 0.0;
-
-    double total = 0.0;
-    int count = 0;
-    for (int p_idx : future_indices) {
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        const auto& entry = it->second;
-        if (entry.edges_u.empty()) continue;
-
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            total += dist(output_perm[entry.edges_u[i]], output_perm[entry.edges_v[i]]);
-            count++;
-        }
-    }
-    return count > 0 ? total / static_cast<double>(count) : 0.0;
-}
-
-double SabreRouter::output_layout_quality_cost(
-    const std::vector<int>& output_perm,
-    const std::vector<std::pair<int,int>>& future_indices,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data
-) const {
-    if (future_indices.empty()) return 0.0;
-
-    double total = 0.0;
-    int count = 0;
-    for (auto [p_idx, depth] : future_indices) {
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        const auto& entry = it->second;
-        if (entry.edges_u.empty()) continue;
-
-        const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
-            ? alpha_weights_[depth]
-            : std::pow(config_.E_alpha, depth);
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            total += alpha * dist(output_perm[entry.edges_u[i]], output_perm[entry.edges_v[i]]);
-            count++;
-        }
-    }
-    return count > 0 ? total / static_cast<double>(count) : 0.0;
-}
-
 // ---------------------------------------------------------------------------
 // score_candidate (LightSABRE scoring)
 // ---------------------------------------------------------------------------
@@ -743,12 +886,31 @@ double SabreRouter::score_candidate(
     const std::vector<std::pair<int,int>>& E,
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
-    SwapCache* swap_cache
+    SwapCache* swap_cache,
+    const std::vector<double>* decay
 ) const {
-    auto [swaps, output_perm] = transform_pi(cand, pi, reverse, swap_cache);
+    const auto neighbor_info = build_neighbor_info(
+        cand.partition_idx,
+        F_snapshot,
+        E,
+        pi,
+        canonical_data
+    );
+    const NeighborInfo* neighbor_ptr =
+        neighbor_info.uses_tiebreak() ? &neighbor_info : nullptr;
+    auto [swaps, output_perm] = transform_pi(
+        cand,
+        pi,
+        reverse,
+        swap_cache,
+        neighbor_ptr
+    );
 
     double score = config_.swap_cost * static_cast<double>(swaps.size());
     score += config_.local_cost_weight * static_cast<double>(cand.cnot_count);
+    if (decay != nullptr && !swaps.empty()) {
+        score *= decay_factor_for_swaps(swaps, *decay);
+    }
 
     // F cost: average routing cost over F \ {cand}
     int cand_idx = cand.partition_idx;
@@ -756,23 +918,17 @@ double SabreRouter::score_candidate(
     double f_sum = 0.0;
     for (int p_idx : F_snapshot) {
         if (p_idx == cand_idx) continue;
-        if (config_.future_cost_mode == 1) {
-            n_other++;
-            f_sum += config_.future_candidate_weight *
-                     candidate_aware_future_cost(p_idx, output_perm, reverse);
-        } else {
-            auto it = canonical_data.find(p_idx);
-            if (it == canonical_data.end()) continue;
-            const auto& entry = it->second;
-            n_other++;
-            if (entry.edges_u.empty()) continue;
-            for (size_t i = 0; i < entry.edges_u.size(); i++) {
-                int u = entry.edges_u[i];
-                int v = entry.edges_v[i];
-                double d = dist(output_perm[u], output_perm[v]);
-                double cost = d - 1.0;
-                if (cost > 0.0) f_sum += config_.swap_cost * cost;
-            }
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        const auto& entry = it->second;
+        n_other++;
+        if (entry.edges_u.empty()) continue;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            int u = entry.edges_u[i];
+            int v = entry.edges_v[i];
+            double d = dist(output_perm[u], output_perm[v]);
+            double cost = d - 1.0;
+            if (cost > 0.0) f_sum += config_.swap_cost * cost;
         }
     }
     if (n_other > 0) score += f_sum / static_cast<double>(n_other);
@@ -785,48 +941,23 @@ double SabreRouter::score_candidate(
             const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
                 ? alpha_weights_[depth]
                 : std::pow(config_.E_alpha, depth);
-            if (config_.future_cost_mode == 1) {
-                e_sum += alpha * config_.future_candidate_weight *
-                         candidate_aware_future_cost(p_idx, output_perm, reverse);
-            } else {
-                auto it = canonical_data.find(p_idx);
-                if (it == canonical_data.end()) continue;
-                const auto& entry = it->second;
-                if (entry.edges_u.empty()) continue;
-                double d_cost = 0.0;
-                for (size_t i = 0; i < entry.edges_u.size(); i++) {
-                    int u = entry.edges_u[i];
-                    int v = entry.edges_v[i];
-                    double d = dist(output_perm[u], output_perm[v]);
-                    double cost = d - 1.0;
-                    if (cost > 0.0) d_cost += config_.swap_cost * cost;
-                }
-                e_sum += alpha * d_cost;
+            auto it = canonical_data.find(p_idx);
+            if (it == canonical_data.end()) continue;
+            const auto& entry = it->second;
+            if (entry.edges_u.empty()) continue;
+            double d_cost = 0.0;
+            for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                int u = entry.edges_u[i];
+                int v = entry.edges_v[i];
+                double d = dist(output_perm[u], output_perm[v]);
+                double cost = d - 1.0;
+                if (cost > 0.0) d_cost += config_.swap_cost * cost;
             }
+            e_sum += alpha * d_cost;
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
     }
 
-    if (config_.order_weight != 0.0) {
-        std::vector<int> future_indices;
-        future_indices.reserve(F_snapshot.size());
-        for (int p_idx : F_snapshot) {
-            if (p_idx != cand_idx) future_indices.push_back(p_idx);
-        }
-        score += config_.order_weight *
-                 output_layout_quality_cost(output_perm, future_indices, canonical_data);
-
-        if (!E.empty()) {
-            std::vector<std::pair<int,int>> future_E;
-            future_E.reserve(E.size());
-            for (auto [p_idx, depth] : E) {
-                if (p_idx != cand_idx) future_E.push_back({p_idx, depth});
-            }
-            score += config_.order_weight * config_.E_weight *
-                     output_layout_quality_cost(output_perm, future_E, canonical_data);
-        }
-    }
-
     return score;
 }
 
@@ -921,58 +1052,6 @@ const CandidateData& SabreRouter::select_best_candidate(
 }
 
 // ---------------------------------------------------------------------------
-// release_valve
-// ---------------------------------------------------------------------------
-
-std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
-SabreRouter::release_valve(
-    const std::vector<int>& F,
-    const std::vector<int>& pi,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data
-) const {
-    // Find the F partition whose worst-pair distance is smallest
-    int best_d = std::numeric_limits<int>::max();
-    int best_p = -1;
-    int best_u = -1, best_v = -1;
-
-    for (int p_idx : F) {
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        const auto& entry = it->second;
-        if (entry.edges_u.empty()) continue;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            int u = entry.edges_u[i];
-            int v = entry.edges_v[i];
-            double d = dist(pi[u], pi[v]);
-            int di = static_cast<int>(d);
-            if (di > 1 && (di < best_d || (di == best_d && p_idx < best_p))) {
-                best_d = di;
-                best_p = p_idx;
-                best_u = u;
-                best_v = v;
-            }
-        }
-    }
-
-    if (best_p < 0) return {{}, pi};
-
-    auto path = bfs_shortest_path(pi[best_u], pi[best_v]);
-    if (static_cast<int>(path.size()) < 2) return {{}, pi};
-
-    int k = static_cast<int>(path.size()) - 1;
-    int m = k / 2;
-    std::vector<std::pair<int,int>> swaps;
-    for (int i = 0; i < m; i++) {
-        swaps.push_back({path[i], path[i + 1]});
-    }
-    for (int i = k; i > m; i--) {
-        swaps.push_back({path[i], path[i - 1]});
-    }
-
-    auto pi_new = apply_swaps_to_pi(pi, swaps);
-    return {swaps, pi_new};
-}
-
 // ---------------------------------------------------------------------------
 // heuristic_search (main loop)
 // ---------------------------------------------------------------------------
@@ -1030,9 +1109,32 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
 
     // Swap cache for this search call (thread-local, on stack)
     SwapCache swap_cache;
+    std::vector<double> decay(N_, 1.0);
+    int swap_burst = 0;
+    int swap_heavy_partitions = 0;
 
     // Main search loop
     while (!F.empty()) {
+        if (
+            config_.release_valve_enabled
+            && swap_burst > config_.release_valve_threshold
+        ) {
+            auto [valve_swaps, pi_bridged] = release_valve(
+                F,
+                pi,
+                canonical_data
+            );
+            if (!valve_swaps.empty()) {
+                total_cost += config_.trial_swap_cnot_cost
+                    * static_cast<int>(valve_swaps.size());
+                apply_decay_for_swaps(valve_swaps, decay);
+                pi = std::move(pi_bridged);
+                swap_burst = 0;
+                continue;
+            }
+            swap_burst = 0;
+        }
+
         auto all_candidates = obtain_partition_candidates(F);
         if (all_candidates.empty()) break;
 
@@ -1048,7 +1150,15 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         scores.reserve(candidates.size());
         for (const auto* cand : candidates) {
             scores.push_back(score_candidate(
-                *cand, F, pi, E, reverse, canonical_data, &swap_cache));
+                *cand,
+                F,
+                pi,
+                E,
+                reverse,
+                canonical_data,
+                &swap_cache,
+                &decay
+            ));
         }
 
         // Select best
@@ -1060,10 +1170,41 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         resolved[best.partition_idx] = 1;
 
         // Apply transform
-        auto [swaps, pi_new] = transform_pi(best, pi, reverse, &swap_cache);
+        const auto neighbor_info = build_neighbor_info(
+            best.partition_idx,
+            F,
+            E,
+            pi,
+            canonical_data
+        );
+        const NeighborInfo* neighbor_ptr =
+            neighbor_info.uses_tiebreak() ? &neighbor_info : nullptr;
+        auto [swaps, pi_new] = transform_pi(
+            best,
+            pi,
+            reverse,
+            &swap_cache,
+            neighbor_ptr
+        );
         total_cost += config_.trial_swap_cnot_cost * static_cast<int>(swaps.size())
                       + best.cnot_count;
         pi = std::move(pi_new);
+        apply_decay_for_swaps(swaps, decay);
+        if (swaps.empty()) {
+            swap_burst = 0;
+            swap_heavy_partitions = 0;
+            reset_decay(decay);
+        } else {
+            swap_burst += static_cast<int>(swaps.size());
+            swap_heavy_partitions++;
+            if (
+                config_.decay_reset_interval > 0
+                && swap_heavy_partitions >= config_.decay_reset_interval
+            ) {
+                reset_decay(decay);
+                swap_heavy_partitions = 0;
+            }
+        }
 
         // Update F with newly eligible children
         for (int child : cg[best.partition_idx]) {
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 475ec5f2a..562ea3f67 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -151,10 +151,11 @@ def __init__(self, config):
         self.config.setdefault('cleanup', True)
         self.config.setdefault('prefilter_top_k', 50)
         self.config.setdefault('cleanup_top_k', 3)
-        self.config.setdefault('future_cost_mode', 'canonical')
-        self.config.setdefault('future_candidate_weight', 1.0)
-        self.config.setdefault('future_candidate_top_k', 0)
-        self.config.setdefault('order_weight', 0.0)
+        self.config.setdefault('decay_delta', 0.1)
+        self.config.setdefault('decay_reset_interval', 5)
+        self.config.setdefault('release_valve_enabled', True)
+        self.config.setdefault('release_valve_threshold', 20)
+        self.config.setdefault('path_tiebreak_weight', 0.2)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -920,16 +921,17 @@ def _run_layout_trials_cpp(
         cfg.sabre_iterations = n_iterations
         cfg.n_layout_trials = max(1, n_trials)
         cfg.random_seed = random_seed
-        future_cost_mode = self.config.get('future_cost_mode', 'canonical')
-        cfg.future_cost_mode = 1 if future_cost_mode == 'candidate_aware' else 0
-        cfg.future_candidate_weight = self.config.get(
-            'future_candidate_weight', 1.0
+        cfg.decay_delta = self.config.get('decay_delta', 0.1)
+        cfg.decay_reset_interval = self.config.get('decay_reset_interval', 5)
+        cfg.release_valve_enabled = self.config.get(
+            'release_valve_enabled', True
         )
-        cfg.future_candidate_top_k = self.config.get(
-            'future_candidate_top_k', 0
+        cfg.release_valve_threshold = self.config.get(
+            'release_valve_threshold', 20
+        )
+        cfg.path_tiebreak_weight = self.config.get(
+            'path_tiebreak_weight', 0.2
         )
-        cfg.order_weight = self.config.get('order_weight', 0.0)
-
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -1199,9 +1201,39 @@ def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=Fals
         top_k_indices = np.argpartition(estimates, top_k)[:top_k]
         return [partition_candidates[i] for i in top_k_indices]
 
+    @staticmethod
+    def _decay_factor_for_swaps(swaps, decay):
+        if not swaps:
+            return 1.0
+        return max(max(decay[u], decay[v]) for u, v in swaps)
+
+    def _apply_decay_for_swaps(self, swaps, decay):
+        delta = self.config.get("decay_delta", 0.1)
+        if delta <= 0:
+            return
+        for u, v in swaps:
+            decay[u] += delta
+            decay[v] += delta
+
+    @staticmethod
+    def _reset_decay(decay):
+        for idx in range(len(decay)):
+            decay[idx] = 1.0
+
+    @staticmethod
+    def _apply_swaps_to_pi(pi, swaps):
+        pi_new = [int(x) for x in pi]
+        n = len(pi_new)
+        p2v = [0] * n
+        for q in range(n):
+            p2v[pi_new[q]] = q
+        for P1, P2 in swaps:
+            q1, q2 = p2v[P1], p2v[P2]
+            p2v[P1], p2v[P2] = q2, q1
+            pi_new[q1], pi_new[q2] = P2, P1
+        return pi_new
+
     def _bfs_shortest_path(self, src, dst):
-        """BFS shortest path on self._adj. Returns list of physical nodes
-        from src to dst (inclusive); empty list if unreachable."""
         if src == dst:
             return [src]
         parent = {src: None}
@@ -1221,36 +1253,13 @@ def _bfs_shortest_path(self, src, dst):
                 q.append(nb)
         return []
 
-    @staticmethod
-    def _apply_swaps_to_pi(pi, swaps):
-        """Return a new pi after applying a list of (phys_a, phys_b) swaps."""
-        pi_new = [int(x) for x in pi]
-        n = len(pi_new)
-        p2v = [0] * n
-        for q in range(n):
-            p2v[pi_new[q]] = q
-        for P1, P2 in swaps:
-            q1, q2 = p2v[P1], p2v[P2]
-            p2v[P1], p2v[P2] = q2, q1
-            pi_new[q1], pi_new[q2] = P2, P1
-        return pi_new
-
     def _release_valve(self, F, pi, D, canonical_data):
-        """Force progress on the easiest F partition's hardest pair.
-
-        Picks the F partition whose worst-pair distance under pi is smallest
-        (cheapest to bridge). BFS-routes that pair along the shortest path,
-        applying swaps from both ends toward the middle — LightSABRE §II.7.
-
-        Returns (swap_list, pi_new). Empty swap list if everything is already
-        adjacent or no eligible partition exists.
-        """
         best = None
         for p_idx in F:
             entry = canonical_data.get(p_idx)
-            if entry is None or entry['edges_u'] is None:
+            if entry is None or entry["edges_u"] is None:
                 continue
-            eu, ev = entry['edges_u'], entry['edges_v']
+            eu, ev = entry["edges_u"], entry["edges_v"]
             worst_d = 0
             worst_pair = None
             for i in range(len(eu)):
@@ -1261,7 +1270,9 @@ def _release_valve(self, F, pi, D, canonical_data):
                     worst_pair = (u, v)
             if worst_d <= 1 or worst_pair is None:
                 continue
-            if best is None or worst_d < best[0] or (worst_d == best[0] and p_idx < best[1]):
+            if best is None or worst_d < best[0] or (
+                worst_d == best[0] and p_idx < best[1]
+            ):
                 best = (worst_d, p_idx, worst_pair[0], worst_pair[1])
 
         if best is None:
@@ -1280,8 +1291,60 @@ def _release_valve(self, F, pi, D, canonical_data):
         for i in range(k, m + 1, -1):
             swaps.append((path[i], path[i - 1]))
 
-        pi_new = self._apply_swaps_to_pi(pi, swaps)
-        return swaps, pi_new
+        return swaps, self._apply_swaps_to_pi(pi, swaps)
+
+    @staticmethod
+    def _build_neighbor_info(
+        partition_idx,
+        F,
+        E,
+        pi,
+        canonical_data,
+        weight=0.2,
+        W=0.5,
+        alpha=0.9,
+    ):
+        if canonical_data is None or weight <= 0:
+            return None
+
+        edge_weights = {}
+        qubits = set()
+
+        def add_edges(target_idx, edge_weight):
+            if target_idx == partition_idx or edge_weight <= 0:
+                return
+            entry = canonical_data.get(target_idx)
+            if entry is None or entry["edges_u"] is None:
+                return
+            for u, v in zip(entry["edges_u"], entry["edges_v"]):
+                u = int(u)
+                v = int(v)
+                qubits.add(u)
+                qubits.add(v)
+                key = (u, v) if u <= v else (v, u)
+                edge_weights[key] = edge_weights.get(key, 0.0) + edge_weight
+
+        for future_idx in F:
+            add_edges(future_idx, 1.0)
+        if E:
+            for future_idx, depth in E:
+                add_edges(future_idx, W * (alpha ** depth))
+
+        if not edge_weights:
+            return None
+
+        neighbor_vqs = sorted(qubits)
+        q_to_idx = {q: idx for idx, q in enumerate(neighbor_vqs)}
+        edges = [
+            (q_to_idx[u], q_to_idx[v], edge_weight)
+            for (u, v), edge_weight in edge_weights.items()
+        ]
+        return {
+            "neighbor_vqs": neighbor_vqs,
+            "initial_pos": tuple(int(pi[q]) for q in neighbor_vqs),
+            "edges": edges,
+            "weight": weight,
+        }
 
     def Heuristic_Search(
         self,
@@ -1356,13 +1419,15 @@ def Heuristic_Search(
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
-
-        valve_enabled = self.config.get("release_valve_enabled", True)
-        valve_threshold = self.config.get("release_valve_threshold", 20)
-        swaps_since_clean = 0
+        decay = [1.0] * len(pi)
+        swap_burst = 0
+        swap_heavy_partitions = 0
 
         while F:
-            if valve_enabled and swaps_since_clean > valve_threshold:
+            if (
+                self.config.get("release_valve_enabled", True)
+                and swap_burst > self.config.get("release_valve_threshold", 20)
+            ):
                 valve_swaps, pi_bridged = self._release_valve(
                     F, pi, D, canonical_data
                 )
@@ -1370,9 +1435,11 @@ def Heuristic_Search(
                     partition_order.append(
                         construct_swap_circuit(valve_swaps, len(pi))
                     )
+                    self._apply_decay_for_swaps(valve_swaps, decay)
                     pi = np.asarray(pi_bridged)
-                swaps_since_clean = 0
-                continue
+                    swap_burst = 0
+                    continue
+                swap_burst = 0
 
             partition_candidates = self.obtain_partition_candidates(
             F,
@@ -1414,17 +1481,10 @@ def Heuristic_Search(
                     adj=self._adj,
                     local_cost_weight=self.config.get("local_cost_weight", 0.1),
                     swap_cost=self.config.get("swap_cost", 15.0),
-                    candidate_cache=candidate_cache,
-                    future_cost_mode=self.config.get(
-                        "future_cost_mode", "canonical"
-                    ),
-                    future_candidate_weight=self.config.get(
-                        "future_candidate_weight", 1.0
+                    path_tiebreak_weight=self.config.get(
+                        "path_tiebreak_weight", 0.2
                     ),
-                    future_candidate_top_k=self.config.get(
-                        "future_candidate_top_k", 0
-                    ),
-                    order_weight=self.config.get("order_weight", 0.0),
+                    decay=decay,
                 )
                 for partition_candidate in partition_candidates
             ]
@@ -1437,14 +1497,39 @@ def Heuristic_Search(
             resolved_count += 1
             pbar.update(1)
 
+            best_neighbor_info = self._build_neighbor_info(
+                min_partition_candidate.partition_idx,
+                F_snapshot,
+                E,
+                pi,
+                canonical_data,
+                weight=self.config.get("path_tiebreak_weight", 0.2),
+                W=E_W,
+                alpha=E_alpha,
+            )
             swap_order, pi = min_partition_candidate.transform_pi(
-                pi, D, self._swap_cache, adj=self._adj
+                pi,
+                D,
+                self._swap_cache,
+                adj=self._adj,
+                neighbor_info=best_neighbor_info,
             )
             if swap_order:
                 partition_order.append(construct_swap_circuit(swap_order, len(pi)))
-                swaps_since_clean += len(swap_order)
+                self._apply_decay_for_swaps(swap_order, decay)
+                swap_burst += len(swap_order)
+                swap_heavy_partitions += 1
+                if (
+                    self.config.get("decay_reset_interval", 5) > 0
+                    and swap_heavy_partitions
+                    >= self.config.get("decay_reset_interval", 5)
+                ):
+                    self._reset_decay(decay)
+                    swap_heavy_partitions = 0
             else:
-                swaps_since_clean = 0
+                swap_burst = 0
+                swap_heavy_partitions = 0
+                self._reset_decay(decay)
 
             partition_order.append(min_partition_candidate)
 
@@ -1534,8 +1619,23 @@ def _heuristic_search_layout_only(
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=reverse
         )
+        decay = [1.0] * len(pi)
+        swap_burst = 0
+        swap_heavy_partitions = 0
 
         while F:
+            if (
+                self.config.get("release_valve_enabled", True)
+                and swap_burst > self.config.get("release_valve_threshold", 20)
+            ):
+                valve_swaps, pi = self._release_valve(F, pi, D, canonical_data)
+                if valve_swaps:
+                    total_cost += swap_cnot_cost * len(valve_swaps)
+                    self._apply_decay_for_swaps(valve_swaps, decay)
+                    swap_burst = 0
+                    continue
+                swap_burst = 0
+
             partition_candidates = self.obtain_partition_candidates(
                 F,
                 optimized_partitions,
@@ -1576,17 +1676,10 @@ def _heuristic_search_layout_only(
                     adj=self._adj,
                     local_cost_weight=self.config.get("local_cost_weight", 0.1),
                     swap_cost=self.config.get("swap_cost", 15.0),
-                    candidate_cache=candidate_cache,
-                    future_cost_mode=self.config.get(
-                        "future_cost_mode", "canonical"
-                    ),
-                    future_candidate_weight=self.config.get(
-                        "future_candidate_weight", 1.0
+                    path_tiebreak_weight=self.config.get(
+                        "path_tiebreak_weight", 0.2
                     ),
-                    future_candidate_top_k=self.config.get(
-                        "future_candidate_top_k", 0
-                    ),
-                    order_weight=self.config.get("order_weight", 0.0),
+                    decay=decay,
                 )
                 for pc in partition_candidates
             ]
@@ -1597,14 +1690,40 @@ def _heuristic_search_layout_only(
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
+            best_neighbor_info = self._build_neighbor_info(
+                best.partition_idx,
+                F_snapshot,
+                E,
+                pi,
+                canonical_data,
+                weight=self.config.get("path_tiebreak_weight", 0.2),
+                W=E_W,
+                alpha=E_alpha,
+            )
             swaps, pi = best.transform_pi(
                 pi,
                 D,
                 self._swap_cache,
                 reverse=reverse,
                 adj=self._adj,
+                neighbor_info=best_neighbor_info,
             )
             total_cost += swap_cnot_cost * len(swaps) + best.cnot_count
+            if swaps:
+                self._apply_decay_for_swaps(swaps, decay)
+                swap_burst += len(swaps)
+                swap_heavy_partitions += 1
+                if (
+                    self.config.get("decay_reset_interval", 5) > 0
+                    and swap_heavy_partitions
+                    >= self.config.get("decay_reset_interval", 5)
+                ):
+                    self._reset_decay(decay)
+                    swap_heavy_partitions = 0
+            else:
+                swap_burst = 0
+                swap_heavy_partitions = 0
+                self._reset_decay(decay)
 
             for child in DAG[best.partition_idx]:
                 if not resolved_partitions[child] and child not in F:
@@ -1707,101 +1826,12 @@ def _build_canonical_neighbor_data(scoring_partitions, reverse=False):
             data[idx] = {'edges_u': eu, 'edges_v': ev, 'cnot': best_cnot}
         return data
 
-    @staticmethod
-    def _candidate_aware_future_cost(
-        partition_idx,
-        output_perm,
-        D,
-        candidate_cache,
-        reverse=False,
-        local_cost_weight=0.1,
-        swap_cost=15.0,
-        future_candidate_top_k=0,
-    ):
-        """Estimate a future partition by its best existing candidate.
-
-        This does not synthesize any extra candidates.  It only asks: from the
-        candidate output layout, how expensive would each already stored
-        candidate be to enter?
-        """
-        if candidate_cache is None:
-            return 0.0
-        if partition_idx < 0 or partition_idx >= len(candidate_cache):
-            return 0.0
-
-        candidates = candidate_cache[partition_idx]
-        if not candidates:
-            return 0.0
-
-        if future_candidate_top_k and future_candidate_top_k > 0:
-            candidates = sorted(
-                candidates,
-                key=lambda pc: (
-                    pc.estimate_swap_count(output_perm, D, reverse=reverse),
-                    pc.cnot_count,
-                ),
-            )[:future_candidate_top_k]
-
-        best = float("inf")
-        for candidate in candidates:
-            estimate = (
-                swap_cost
-                * candidate.estimate_swap_count(output_perm, D, reverse=reverse)
-                + local_cost_weight * candidate.cnot_count
-            )
-            if estimate < best:
-                best = estimate
-
-        return 0.0 if best == float("inf") else best
-
-    @staticmethod
-    def _output_layout_quality_cost(
-        output_perm_arr,
-        future_indices,
-        canonical_data,
-        D_arr,
-        alpha=0.9,
-        weighted_depths=None,
-    ):
-        """Penalize output layouts that leave future interaction edges far apart."""
-        if not future_indices or canonical_data is None:
-            return 0.0
-
-        total = 0.0
-        count = 0
-        for item in future_indices:
-            if isinstance(item, tuple):
-                partition_idx, depth = item
-            else:
-                partition_idx, depth = item, 0
-
-            entry = canonical_data.get(partition_idx)
-            if entry is None:
-                continue
-            eu = entry["edges_u"]
-            if eu is None:
-                continue
-
-            phys_u = output_perm_arr[eu]
-            phys_v = output_perm_arr[entry["edges_v"]]
-            weight = weighted_depths.get(partition_idx, 1.0) if weighted_depths else 1.0
-            if depth:
-                weight *= alpha ** depth
-            total += weight * D_arr[phys_u, phys_v].sum()
-            count += len(eu)
-
-        return total / count if count else 0.0
-
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
                                   canonical_data=None, adj=None,
                                   local_cost_weight=0.1, swap_cost=15.0,
-                                  candidate_cache=None,
-                                  future_cost_mode="canonical",
-                                  future_candidate_weight=1.0,
-                                  future_candidate_top_k=0,
-                                  order_weight=0.0):
+                                  path_tiebreak_weight=0.2, decay=None):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = swap_cost * |swaps|
@@ -1809,11 +1839,30 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
           + (1/|F'|) * average routing cost over F \\ {cand}
           + (W/|E|)  * alpha^d-decayed routing cost over E
         """
+        neighbor_info = qgd_Partition_Aware_Mapping._build_neighbor_info(
+            partition_candidate.partition_idx,
+            F,
+            E,
+            pi,
+            canonical_data,
+            weight=path_tiebreak_weight,
+            W=W,
+            alpha=alpha,
+        )
         swaps, output_perm = partition_candidate.transform_pi(
-            pi, D, swap_cache, reverse=reverse, adj=adj, neighbor_info=None,
+            pi,
+            D,
+            swap_cache,
+            reverse=reverse,
+            adj=adj,
+            neighbor_info=neighbor_info,
         )
         score = swap_cost * len(swaps)
         score += local_cost_weight * partition_candidate.cnot_count
+        if decay is not None and swaps:
+            score *= qgd_Partition_Aware_Mapping._decay_factor_for_swaps(
+                swaps, decay
+            )
 
         if canonical_data is None:
             return score
@@ -1828,29 +1877,16 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
         for partition_idx in F:
             if partition_idx == cand_idx:
                 continue
-            if future_cost_mode == "candidate_aware":
-                n_other += 1
-                f_sum += future_candidate_weight * qgd_Partition_Aware_Mapping._candidate_aware_future_cost(
-                    partition_idx,
-                    output_perm,
-                    D,
-                    candidate_cache,
-                    reverse=reverse,
-                    local_cost_weight=local_cost_weight,
-                    swap_cost=swap_cost,
-                    future_candidate_top_k=future_candidate_top_k,
-                )
-            else:
-                entry = canonical_data.get(partition_idx)
-                if entry is None:
-                    continue
-                n_other += 1
-                eu = entry['edges_u']
-                if eu is None:
-                    continue
-                phys_u = output_perm_arr[eu]
-                phys_v = output_perm_arr[entry['edges_v']]
-                f_sum += swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+            entry = canonical_data.get(partition_idx)
+            if entry is None:
+                continue
+            n_other += 1
+            eu = entry['edges_u']
+            if eu is None:
+                continue
+            phys_u = output_perm_arr[eu]
+            phys_v = output_perm_arr[entry['edges_v']]
+            f_sum += swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
         if n_other > 0:
             score += f_sum / n_other
 
@@ -1860,48 +1896,18 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             for partition_idx, depth in E:
                 if partition_idx == cand_idx:
                     continue
-                if future_cost_mode == "candidate_aware":
-                    d_cost = qgd_Partition_Aware_Mapping._candidate_aware_future_cost(
-                        partition_idx,
-                        output_perm,
-                        D,
-                        candidate_cache,
-                        reverse=reverse,
-                        local_cost_weight=local_cost_weight,
-                        swap_cost=swap_cost,
-                        future_candidate_top_k=future_candidate_top_k,
-                    )
-                    e_sum += (alpha ** depth) * future_candidate_weight * d_cost
-                else:
-                    entry = canonical_data.get(partition_idx)
-                    if entry is None:
-                        continue
-                    eu = entry['edges_u']
-                    if eu is None:
-                        continue
-                    phys_u = output_perm_arr[eu]
-                    phys_v = output_perm_arr[entry['edges_v']]
-                    d_cost = swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
-                    e_sum += (alpha ** depth) * d_cost
+                entry = canonical_data.get(partition_idx)
+                if entry is None:
+                    continue
+                eu = entry['edges_u']
+                if eu is None:
+                    continue
+                phys_u = output_perm_arr[eu]
+                phys_v = output_perm_arr[entry['edges_v']]
+                d_cost = swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+                e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
 
-        if order_weight:
-            future_indices = [idx for idx in F if idx != cand_idx]
-            score += order_weight * qgd_Partition_Aware_Mapping._output_layout_quality_cost(
-                output_perm_arr,
-                future_indices,
-                canonical_data,
-                D_arr,
-            )
-            if E:
-                score += order_weight * W * qgd_Partition_Aware_Mapping._output_layout_quality_cost(
-                    output_perm_arr,
-                    [(idx, depth) for idx, depth in E if idx != cand_idx],
-                    canonical_data,
-                    D_arr,
-                    alpha=alpha,
-                )
-
         return score
 
     # ------------------------------------------------------------------------
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 2aa16fa65..3b1d167d8 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -131,10 +131,11 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("sabre_iterations", &SabreConfig::sabre_iterations)
         .def_readwrite("n_layout_trials", &SabreConfig::n_layout_trials)
         .def_readwrite("random_seed", &SabreConfig::random_seed)
-        .def_readwrite("future_cost_mode", &SabreConfig::future_cost_mode)
-        .def_readwrite("future_candidate_weight", &SabreConfig::future_candidate_weight)
-        .def_readwrite("future_candidate_top_k", &SabreConfig::future_candidate_top_k)
-        .def_readwrite("order_weight", &SabreConfig::order_weight);
+        .def_readwrite("decay_delta", &SabreConfig::decay_delta)
+        .def_readwrite("decay_reset_interval", &SabreConfig::decay_reset_interval)
+        .def_readwrite("release_valve_enabled", &SabreConfig::release_valve_enabled)
+        .def_readwrite("release_valve_threshold", &SabreConfig::release_valve_threshold)
+        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From a674041184b705fa514b6d804d702ae271769282 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 01:13:42 +0200
Subject: [PATCH 145/232] speedup

---
 .../sabre_router/include/sabre_router.hpp     |  17 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 633 +++++++++++-------
 2 files changed, 414 insertions(+), 236 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index e2e739e42..983ad2581 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -239,6 +239,13 @@ class SabreRouter {
         const std::vector<std::vector<int>>& parents_graph
     ) const;
 
+    // Pre-resolved canonical entries for an F-step (avoids hash lookups per candidate)
+    struct ResolvedEntry {
+        int partition_idx;
+        const CanonicalEntry* entry; // may be null
+        double alpha; // 1.0 for F, alpha^depth for E
+    };
+
     // LightSABRE scoring (port of score_partition_candidate)
     double score_candidate(
         const CandidateData& cand,
@@ -248,7 +255,12 @@ class SabreRouter {
         bool reverse,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
         SwapCache* swap_cache,
-        const std::vector<double>* decay = nullptr
+        const std::vector<double>* decay = nullptr,
+        std::vector<std::pair<int,int>>* out_swaps = nullptr,
+        std::vector<int>* out_pi_new = nullptr,
+        const std::vector<ResolvedEntry>* resolved_F = nullptr,
+        const std::vector<ResolvedEntry>* resolved_E = nullptr,
+        const NeighborInfo* cached_neighbor_info = nullptr
     ) const;
 
     // Route and update layout for a candidate (port of transform_pi)
@@ -362,6 +374,9 @@ class SabreRouter {
     int num_partitions_;
     std::vector<double> D_; // flat N*N distance matrix (owned copy)
     std::vector<std::vector<int>> adj_;
+    // CSR view of adj_ for tight inner loops
+    std::vector<int> adj_offsets_;
+    std::vector<int> adj_flat_;
     std::vector<std::vector<int>> DAG_;
     std::vector<std::vector<int>> IDAG_;
     std::vector<std::vector<CandidateData>> candidate_cache_;
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index cd32c7275..f7fe6ae62 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -115,6 +115,18 @@ SabreRouter::SabreRouter(
     if (static_cast<int>(D_.size()) != N_ * N_) {
         throw std::invalid_argument("Distance matrix D must be N x N");
     }
+    // Build CSR view of adj_
+    adj_offsets_.resize(N_ + 1);
+    adj_offsets_[0] = 0;
+    for (int i = 0; i < N_; i++) {
+        adj_offsets_[i + 1] = adj_offsets_[i] + static_cast<int>(adj_[i].size());
+    }
+    adj_flat_.resize(adj_offsets_[N_]);
+    for (int i = 0; i < N_; i++) {
+        for (size_t j = 0; j < adj_[i].size(); j++) {
+            adj_flat_[adj_offsets_[i] + j] = adj_[i][j];
+        }
+    }
     for (auto& partition_candidates : candidate_cache_) {
         for (auto& cand : partition_candidates) {
             prepare_candidate(cand);
@@ -198,7 +210,8 @@ std::vector<int> SabreRouter::apply_swaps_to_pi(
     const std::vector<std::pair<int,int>>& swaps
 ) const {
     std::vector<int> result(pi);
-    std::vector<int> p2v(N_);
+    thread_local std::vector<int> p2v;
+    if (static_cast<int>(p2v.size()) < N_) p2v.assign(N_, 0);
     for (int q = 0; q < N_; q++) p2v[result[q]] = q;
 
     for (auto [P1, P2] : swaps) {
@@ -225,50 +238,63 @@ NeighborInfo SabreRouter::build_neighbor_info(
         return info;
     }
 
-    std::unordered_map<int, int> q_to_idx;
-    std::unordered_map<uint64_t, double> edge_weights;
-    std::unordered_map<uint64_t, std::pair<int, int>> edge_nodes;
+    // Per-call scratch via thread_local, reset by tracking touched entries
+    thread_local std::vector<int> q_to_idx;
+    thread_local std::vector<int> q_touched;
+    if (static_cast<int>(q_to_idx.size()) < N_) q_to_idx.assign(N_, -1);
+    q_touched.clear();
 
     auto ensure_qubit = [&](int q) -> int {
-        auto it = q_to_idx.find(q);
-        if (it != q_to_idx.end()) {
-            return it->second;
-        }
-        const int idx = static_cast<int>(info.neighbor_vqs.size());
-        q_to_idx.emplace(q, idx);
+        int idx = q_to_idx[q];
+        if (idx >= 0) return idx;
+        idx = static_cast<int>(info.neighbor_vqs.size());
+        q_to_idx[q] = idx;
+        q_touched.push_back(q);
         info.neighbor_vqs.push_back(q);
         info.initial_pos.push_back(pi[q]);
         return idx;
     };
 
-    auto add_partition_edges = [&](int partition_idx, double weight) {
-        if (partition_idx == exclude_partition_idx || weight <= 0.0) {
-            return;
+    // edges: parallel arrays keyed by (lo, hi) — small linear scan dedup
+    thread_local std::vector<int> ekey_lo;
+    thread_local std::vector<int> ekey_hi;
+    thread_local std::vector<int> eu_idx;
+    thread_local std::vector<int> ev_idx;
+    thread_local std::vector<double> ew;
+    ekey_lo.clear(); ekey_hi.clear();
+    eu_idx.clear(); ev_idx.clear(); ew.clear();
+
+    auto add_edge = [&](int u, int v, double weight) {
+        const int u_idx = ensure_qubit(u);
+        const int v_idx = ensure_qubit(v);
+        const int lo = std::min(u, v);
+        const int hi = std::max(u, v);
+        for (size_t i = 0; i < ekey_lo.size(); i++) {
+            if (ekey_lo[i] == lo && ekey_hi[i] == hi) {
+                ew[i] += weight;
+                return;
+            }
         }
+        ekey_lo.push_back(lo);
+        ekey_hi.push_back(hi);
+        eu_idx.push_back(u_idx);
+        ev_idx.push_back(v_idx);
+        ew.push_back(weight);
+    };
+
+    auto add_partition_edges = [&](int partition_idx, double weight) {
+        if (partition_idx == exclude_partition_idx || weight <= 0.0) return;
         auto it = canonical_data.find(partition_idx);
-        if (it == canonical_data.end()) {
-            return;
-        }
+        if (it == canonical_data.end()) return;
         const auto& entry = it->second;
         for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            const int u = entry.edges_u[i];
-            const int v = entry.edges_v[i];
-            ensure_qubit(u);
-            ensure_qubit(v);
-            const int lo = std::min(u, v);
-            const int hi = std::max(u, v);
-            const uint64_t key =
-                (static_cast<uint64_t>(static_cast<uint32_t>(lo)) << 32)
-                | static_cast<uint32_t>(hi);
-            edge_weights[key] += weight;
-            edge_nodes[key] = {u, v};
+            add_edge(entry.edges_u[i], entry.edges_v[i], weight);
         }
     };
 
     for (int partition_idx : F_snapshot) {
         add_partition_edges(partition_idx, 1.0);
     }
-
     for (auto [partition_idx, depth] : E) {
         const double alpha =
             (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
@@ -277,19 +303,14 @@ NeighborInfo SabreRouter::build_neighbor_info(
         add_partition_edges(partition_idx, config_.E_weight * alpha);
     }
 
-    info.edges.reserve(edge_weights.size());
-    for (const auto& [key, weight] : edge_weights) {
-        (void)key;
-        const auto nodes = edge_nodes.at(key);
-        info.edges.push_back(
-            NeighborEdge{
-                q_to_idx.at(nodes.first),
-                q_to_idx.at(nodes.second),
-                weight,
-            }
-        );
+    info.edges.reserve(ew.size());
+    for (size_t i = 0; i < ew.size(); i++) {
+        info.edges.push_back(NeighborEdge{eu_idx[i], ev_idx[i], ew[i]});
     }
 
+    // Reset q_to_idx via touched-list (avoids O(N) clear)
+    for (int q : q_touched) q_to_idx[q] = -1;
+
     return info;
 }
 
@@ -506,31 +527,40 @@ SabreRouter::find_constrained_swaps(
     SwapCache* swap_cache,
     const NeighborInfo* neighbor_info
 ) const {
-    int k = static_cast<int>(qbit_map_keys.size());
-    std::vector<int> target_positions(k);
-    std::vector<int> initial_positions(k);
-
-    for (int i = 0; i < k; i++) {
-        const int q = qbit_map_keys[i];
-        const int v = qbit_map_vals[i];
-        target_positions[i] = node_mapping_flat[P_route_inv[v]];
-        initial_positions[i] = pi[q];
+    const int k = static_cast<int>(qbit_map_keys.size());
+
+    // ---- Setup: target/initial positions, pow_N, h0 ----
+    thread_local std::vector<int> target_positions;
+    thread_local std::vector<int> initial_positions;
+    thread_local std::vector<int64_t> pow_N;
+    target_positions.resize(k);
+    initial_positions.resize(k);
+    pow_N.resize(k);
+    {
+        int64_t s = 1;
+        for (int i = 0; i < k; i++) { pow_N[i] = s; s *= N_; }
     }
 
-    // Check if already at target
     bool already_there = true;
+    double h0_sum = 0.0;
+    int64_t initial_packed = 0;
+    int64_t target_packed = 0;
     for (int i = 0; i < k; i++) {
-        if (initial_positions[i] != target_positions[i]) {
-            already_there = false;
-            break;
-        }
+        const int q = qbit_map_keys[i];
+        const int v = qbit_map_vals[i];
+        const int t = node_mapping_flat[P_route_inv[v]];
+        const int ip = pi[q];
+        target_positions[i] = t;
+        initial_positions[i] = ip;
+        if (ip != t) already_there = false;
+        h0_sum += dist(ip, t);
+        initial_packed += static_cast<int64_t>(ip) * pow_N[i];
+        target_packed  += static_cast<int64_t>(t)  * pow_N[i];
     }
     if (already_there) {
         return {{}, pi};
     }
 
-    int64_t initial_packed = pack_state(initial_positions, N_);
-    int64_t target_packed = pack_state(target_positions, N_);
     const SwapCacheKey cache_key{initial_packed, target_packed, k};
     const bool use_neighbor =
         neighbor_info != nullptr && neighbor_info->uses_tiebreak();
@@ -538,23 +568,12 @@ SabreRouter::find_constrained_swaps(
     if (swap_cache && !use_neighbor) {
         auto it = swap_cache->find(cache_key);
         if (it != swap_cache->end()) {
-            // Replay cached swaps on current pi
             auto result_pi = apply_swaps_to_pi(pi, it->second);
             return {it->second, result_pi};
         }
     }
 
-    // A* search over k-dimensional state space
-    // State: vector of physical positions for each partition qubit
-    // Heuristic: sum(D[pos_i][target_i]) / 2
-
-    // Compute initial heuristic
-    double h0 = 0.0;
-    for (int i = 0; i < k; i++) {
-        h0 += dist(initial_positions[i], target_positions[i]);
-    }
-    h0 /= 2.0;
-
+    // ---- Neighbor heuristic setup ----
     double total_edge_weight = 0.0;
     if (use_neighbor) {
         for (const auto& edge : neighbor_info->edges) {
@@ -562,146 +581,229 @@ SabreRouter::find_constrained_swaps(
         }
     }
     const double neighbor_norm = std::max(
-        1.0,
-        total_edge_weight * std::max(1.0, max_finite_distance_)
+        1.0, total_edge_weight * std::max(1.0, max_finite_distance_)
     );
-    auto neighbor_heuristic = [&](const std::vector<int>& neighbor_positions) {
-        if (!use_neighbor) {
-            return 0.0;
-        }
+    const double neighbor_scale =
+        use_neighbor ? (neighbor_info->weight / neighbor_norm) : 0.0;
+
+    auto compute_nb_total = [&](const std::vector<int>& pos_nb) {
         double total = 0.0;
         for (const auto& edge : neighbor_info->edges) {
-            total += edge.weight
-                * dist(
-                    neighbor_positions[edge.u_idx],
-                    neighbor_positions[edge.v_idx]
-                );
+            total += edge.weight * dist(pos_nb[edge.u_idx], pos_nb[edge.v_idx]);
         }
-        return (neighbor_info->weight * total) / neighbor_norm;
+        return total;
     };
 
-    std::vector<int> initial_neighbor_positions;
-    double nh0 = 0.0;
+    double initial_nb_total = 0.0;
     if (use_neighbor) {
-        initial_neighbor_positions = neighbor_info->initial_pos;
-        nh0 = neighbor_heuristic(initial_neighbor_positions);
+        initial_nb_total = compute_nb_total(neighbor_info->initial_pos);
     }
 
-    // Priority queue: (f_score, g_score, counter, packed_state, neighbor_state)
-    // Counter provides FIFO tie-breaking, matching Python's counter variable
-    using PQEntry = std::tuple<double, int, uint64_t, int64_t, std::vector<int>>;
-    std::priority_queue<PQEntry, std::vector<PQEntry>, std::greater<PQEntry>> pq;
-    uint64_t counter = 0;
+    // ---- Arena + open-addressed hash table (replaces visited+parent maps) ----
+    struct Node {
+        int64_t packed;
+        int parent_idx;
+        int g;
+        int sw_lo, sw_hi;
+        double h_sum;       // sum(dist(pos[i], target[i])) — twice the admissible h
+        double nb_total;    // sum(edge.weight * dist(...)) — pre-scale
+        int nb_arena_idx;   // -1 if !use_neighbor; else index into nb_arena
+    };
+    thread_local std::vector<Node> arena;
+    thread_local std::vector<int32_t> table;
+    thread_local std::vector<std::vector<int>> nb_arena;
+    arena.clear();
+    nb_arena.clear();
+    arena.reserve(1024);
+
+    // table size: power of 2, ~2x expected entries
+    size_t cap = 1024;
+    table.assign(cap, -1);
+
+    auto hash_packed = [](int64_t v) -> uint64_t {
+        uint64_t x = static_cast<uint64_t>(v);
+        x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
+        x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
+        x ^= x >> 33;
+        return x;
+    };
 
-    // Visited: packed_state -> best g_score
-    std::unordered_map<int64_t, int> visited;
-    // Parent: packed_state -> (parent_packed_state, swap)
-    std::unordered_map<int64_t, std::pair<int64_t, std::pair<int,int>>> parent;
-    visited.reserve(256);
-    parent.reserve(256);
+    auto table_grow = [&]() {
+        std::vector<int32_t> new_table(table.size() * 2, -1);
+        const size_t mask = new_table.size() - 1;
+        for (int32_t idx : table) {
+            if (idx < 0) continue;
+            size_t i = hash_packed(arena[idx].packed) & mask;
+            while (new_table[i] >= 0) i = (i + 1) & mask;
+            new_table[i] = idx;
+        }
+        table = std::move(new_table);
+    };
 
-    pq.push({h0 + nh0, 0, counter++, initial_packed, initial_neighbor_positions});
-    visited[initial_packed] = 0;
-    parent[initial_packed] = {-1, {-1, -1}};
+    // Returns slot index in `table`. *slot is -1 if empty.
+    auto table_slot = [&](int64_t packed) -> size_t {
+        const size_t mask = table.size() - 1;
+        size_t i = hash_packed(packed) & mask;
+        while (true) {
+            int32_t idx = table[i];
+            if (idx < 0) return i;
+            if (arena[idx].packed == packed) return i;
+            i = (i + 1) & mask;
+        }
+    };
 
-    std::vector<int> positions;
-    std::vector<int> new_positions(k);
-    positions.reserve(k);
+    // ---- Push initial node ----
+    {
+        Node n;
+        n.packed = initial_packed;
+        n.parent_idx = -1;
+        n.g = 0;
+        n.sw_lo = -1; n.sw_hi = -1;
+        n.h_sum = h0_sum;
+        n.nb_total = initial_nb_total;
+        n.nb_arena_idx = -1;
+        if (use_neighbor) {
+            n.nb_arena_idx = static_cast<int>(nb_arena.size());
+            nb_arena.push_back(neighbor_info->initial_pos);
+        }
+        arena.push_back(n);
+        table[table_slot(initial_packed)] = 0;
+    }
+
+    // PQ entry: (f, g, counter, arena_idx)
+    using PQEntry = std::tuple<double, int, uint64_t, int32_t>;
+    std::priority_queue<PQEntry, std::vector<PQEntry>, std::greater<PQEntry>> pq;
+    uint64_t counter = 0;
+    pq.push({0.5 * h0_sum + neighbor_scale * initial_nb_total, 0, counter++, 0});
+
+    thread_local std::vector<int> positions;
+    positions.resize(k);
 
     while (!pq.empty()) {
-        auto entry = pq.top();
+        auto [f, g_e, ctr, idx] = pq.top();
         pq.pop();
-        int g = std::get<1>(entry);
-        int64_t packed = std::get<3>(entry);
-        const std::vector<int>& neighbor_positions = std::get<4>(entry);
+        (void)f; (void)ctr;
+        const int g = g_e;
+        const int64_t packed = arena[idx].packed;
 
         if (packed == target_packed) {
-            // Reconstruct swap path
+            // Reconstruct path
             std::vector<std::pair<int,int>> path;
-            int64_t cur = packed;
-            while (parent[cur].first != -1) {
-                path.push_back(parent[cur].second);
-                cur = parent[cur].first;
+            int cur = idx;
+            while (arena[cur].parent_idx != -1) {
+                path.push_back({arena[cur].sw_lo, arena[cur].sw_hi});
+                cur = arena[cur].parent_idx;
             }
             std::reverse(path.begin(), path.end());
 
-            // Replay swaps on full pi
             auto result_pi = apply_swaps_to_pi(pi, path);
-
-            // Store in cache
             if (swap_cache && !use_neighbor) {
                 (*swap_cache)[cache_key] = path;
             }
-
             return {path, result_pi};
         }
 
-        // Skip if we've found a better path to this state
-        auto vis_it = visited.find(packed);
-        if (vis_it != visited.end() && vis_it->second < g) {
-            continue;
-        }
+        // Stale entry?
+        if (arena[idx].g < g) continue;
 
-        unpack_state_into(packed, k, N_, positions);
+        // Unpack positions for this state
+        {
+            int64_t p = packed;
+            for (int i = 0; i < k; i++) {
+                positions[i] = static_cast<int>(p % N_);
+                p /= N_;
+            }
+        }
+        const double cur_h_sum = arena[idx].h_sum;
+        const double cur_nb_total = arena[idx].nb_total;
+        const int cur_nb_arena_idx = arena[idx].nb_arena_idx;
 
-        // Try every SWAP that moves at least one partition qubit
+        // Expand: every SWAP that moves at least one partition qubit
         for (int i = 0; i < k; i++) {
-            int p = positions[i];
-            for (int nb : adj_[p]) {
-                std::copy(positions.begin(), positions.end(), new_positions.begin());
-                new_positions[i] = nb;
-                // If neighbor also holds a partition qubit, swap it
+            const int p = positions[i];
+            const int t_i = target_positions[i];
+            const int adj_lo = adj_offsets_[p];
+            const int adj_hi = adj_offsets_[p + 1];
+            for (int nb_idx = adj_lo; nb_idx < adj_hi; nb_idx++) {
+                const int nb = adj_flat_[nb_idx];
+                // Find j such that positions[j] == nb (if any)
+                int j_swap = -1;
                 for (int j = 0; j < k; j++) {
-                    if (positions[j] == nb) {
-                        new_positions[j] = p;
-                        break;
-                    }
+                    if (positions[j] == nb) { j_swap = j; break; }
                 }
 
-                int64_t new_packed = pack_state(new_positions, N_);
-                int new_g = g + 1;
+                // Incremental packed
+                int64_t new_packed = packed + static_cast<int64_t>(nb - p) * pow_N[i];
+                if (j_swap >= 0) {
+                    new_packed += static_cast<int64_t>(p - nb) * pow_N[j_swap];
+                }
+
+                // Incremental h_sum
+                double new_h_sum = cur_h_sum
+                    - dist(p, t_i) + dist(nb, t_i);
+                if (j_swap >= 0) {
+                    const int t_j = target_positions[j_swap];
+                    new_h_sum += -dist(nb, t_j) + dist(p, t_j);
+                }
 
-                auto new_vis = visited.find(new_packed);
-                if (new_vis != visited.end() && new_vis->second <= new_g) {
+                const int new_g = g + 1;
+                const size_t slot = table_slot(new_packed);
+                const int32_t existing = table[slot];
+                if (existing >= 0 && arena[existing].g <= new_g) {
                     continue;
                 }
 
-                std::vector<int> new_neighbor_positions = neighbor_positions;
-                double new_nh = 0.0;
+                // Neighbor heuristic: simple recompute (cheaper than incremental for small edge counts)
+                double new_nb_total = cur_nb_total;
+                int new_nb_arena_idx = -1;
                 if (use_neighbor) {
-                    std::unordered_map<int, int> phys_to_neighbor_idx;
-                    phys_to_neighbor_idx.reserve(new_neighbor_positions.size());
-                    for (int idx = 0; idx < static_cast<int>(new_neighbor_positions.size()); idx++) {
-                        phys_to_neighbor_idx.emplace(new_neighbor_positions[idx], idx);
+                    std::vector<int> new_pos_nb = nb_arena[cur_nb_arena_idx];
+                    int idx_nb = -1, idx_p = -1;
+                    for (size_t z = 0; z < new_pos_nb.size(); z++) {
+                        const int phys = new_pos_nb[z];
+                        if (phys == nb) idx_nb = static_cast<int>(z);
+                        else if (phys == p) idx_p = static_cast<int>(z);
+                        if (idx_nb >= 0 && idx_p >= 0) break;
                     }
-
-                    auto it_nb = phys_to_neighbor_idx.find(nb);
-                    if (it_nb != phys_to_neighbor_idx.end()) {
-                        new_neighbor_positions[it_nb->second] = p;
-                    }
-                    auto it_p = phys_to_neighbor_idx.find(p);
-                    if (it_p != phys_to_neighbor_idx.end()) {
-                        new_neighbor_positions[it_p->second] = nb;
+                    if (idx_nb >= 0 || idx_p >= 0) {
+                        if (idx_nb >= 0) new_pos_nb[idx_nb] = p;
+                        if (idx_p >= 0)  new_pos_nb[idx_p]  = nb;
+                        new_nb_total = compute_nb_total(new_pos_nb);
+                        new_nb_arena_idx = static_cast<int>(nb_arena.size());
+                        nb_arena.push_back(std::move(new_pos_nb));
+                    } else {
+                        new_nb_arena_idx = cur_nb_arena_idx;
                     }
-                    new_nh = neighbor_heuristic(new_neighbor_positions);
                 }
 
-                // Compute heuristic
-                double h = 0.0;
-                for (int j = 0; j < k; j++) {
-                    h += dist(new_positions[j], target_positions[j]);
+                // Insert/update node
+                Node n;
+                n.packed = new_packed;
+                n.parent_idx = idx;
+                n.g = new_g;
+                const int lo = std::min(p, nb);
+                const int hi = std::max(p, nb);
+                n.sw_lo = lo; n.sw_hi = hi;
+                n.h_sum = new_h_sum;
+                n.nb_total = new_nb_total;
+                n.nb_arena_idx = new_nb_arena_idx;
+
+                int32_t new_idx = static_cast<int32_t>(arena.size());
+                arena.push_back(n);
+
+                // Re-find slot if arena grew (table didn't, but slot is still valid
+                // since we didn't grow `table`); just write
+                table[slot] = new_idx;
+
+                // Grow table if load factor too high (> 0.5)
+                if (arena.size() * 2 > table.size()) {
+                    table_grow();
                 }
-                h /= 2.0;
-
-                visited[new_packed] = new_g;
-                parent[new_packed] = {packed, {std::min(p, nb), std::max(p, nb)}};
-                pq.push({
-                    new_g + h + new_nh,
-                    new_g,
-                    counter++,
-                    new_packed,
-                    std::move(new_neighbor_positions),
-                });
+
+                const double f_new = static_cast<double>(new_g)
+                                   + 0.5 * new_h_sum
+                                   + neighbor_scale * new_nb_total;
+                pq.push({f_new, new_g, counter++, new_idx});
             }
         }
     }
@@ -887,17 +989,22 @@ double SabreRouter::score_candidate(
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
     SwapCache* swap_cache,
-    const std::vector<double>* decay
+    const std::vector<double>* decay,
+    std::vector<std::pair<int,int>>* out_swaps,
+    std::vector<int>* out_pi_new,
+    const std::vector<ResolvedEntry>* resolved_F,
+    const std::vector<ResolvedEntry>* resolved_E,
+    const NeighborInfo* cached_neighbor_info
 ) const {
-    const auto neighbor_info = build_neighbor_info(
-        cand.partition_idx,
-        F_snapshot,
-        E,
-        pi,
-        canonical_data
-    );
-    const NeighborInfo* neighbor_ptr =
-        neighbor_info.uses_tiebreak() ? &neighbor_info : nullptr;
+    NeighborInfo local_neighbor_info;
+    const NeighborInfo* neighbor_ptr;
+    if (cached_neighbor_info) {
+        neighbor_ptr = cached_neighbor_info->uses_tiebreak() ? cached_neighbor_info : nullptr;
+    } else {
+        local_neighbor_info = build_neighbor_info(
+            cand.partition_idx, F_snapshot, E, pi, canonical_data);
+        neighbor_ptr = local_neighbor_info.uses_tiebreak() ? &local_neighbor_info : nullptr;
+    }
     auto [swaps, output_perm] = transform_pi(
         cand,
         pi,
@@ -913,22 +1020,35 @@ double SabreRouter::score_candidate(
     }
 
     // F cost: average routing cost over F \ {cand}
-    int cand_idx = cand.partition_idx;
+    const int cand_idx = cand.partition_idx;
     int n_other = 0;
     double f_sum = 0.0;
-    for (int p_idx : F_snapshot) {
-        if (p_idx == cand_idx) continue;
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        const auto& entry = it->second;
-        n_other++;
-        if (entry.edges_u.empty()) continue;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            int u = entry.edges_u[i];
-            int v = entry.edges_v[i];
-            double d = dist(output_perm[u], output_perm[v]);
-            double cost = d - 1.0;
-            if (cost > 0.0) f_sum += config_.swap_cost * cost;
+    if (resolved_F) {
+        for (const auto& re : *resolved_F) {
+            if (re.partition_idx == cand_idx) continue;
+            if (!re.entry) continue;
+            n_other++;
+            const auto& entry = *re.entry;
+            for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                const double d = dist(output_perm[entry.edges_u[i]],
+                                      output_perm[entry.edges_v[i]]);
+                const double cost = d - 1.0;
+                if (cost > 0.0) f_sum += config_.swap_cost * cost;
+            }
+        }
+    } else {
+        for (int p_idx : F_snapshot) {
+            if (p_idx == cand_idx) continue;
+            auto it = canonical_data.find(p_idx);
+            if (it == canonical_data.end()) continue;
+            const auto& entry = it->second;
+            n_other++;
+            for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                const double d = dist(output_perm[entry.edges_u[i]],
+                                      output_perm[entry.edges_v[i]]);
+                const double cost = d - 1.0;
+                if (cost > 0.0) f_sum += config_.swap_cost * cost;
+            }
         }
     }
     if (n_other > 0) score += f_sum / static_cast<double>(n_other);
@@ -936,28 +1056,44 @@ double SabreRouter::score_candidate(
     // E cost: alpha^depth-decayed lookahead
     if (!E.empty()) {
         double e_sum = 0.0;
-        for (auto [p_idx, depth] : E) {
-            if (p_idx == cand_idx) continue;
-            const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
-                ? alpha_weights_[depth]
-                : std::pow(config_.E_alpha, depth);
-            auto it = canonical_data.find(p_idx);
-            if (it == canonical_data.end()) continue;
-            const auto& entry = it->second;
-            if (entry.edges_u.empty()) continue;
-            double d_cost = 0.0;
-            for (size_t i = 0; i < entry.edges_u.size(); i++) {
-                int u = entry.edges_u[i];
-                int v = entry.edges_v[i];
-                double d = dist(output_perm[u], output_perm[v]);
-                double cost = d - 1.0;
-                if (cost > 0.0) d_cost += config_.swap_cost * cost;
+        if (resolved_E) {
+            for (const auto& re : *resolved_E) {
+                if (re.partition_idx == cand_idx) continue;
+                if (!re.entry) continue;
+                const auto& entry = *re.entry;
+                double d_cost = 0.0;
+                for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                    const double d = dist(output_perm[entry.edges_u[i]],
+                                          output_perm[entry.edges_v[i]]);
+                    const double cost = d - 1.0;
+                    if (cost > 0.0) d_cost += config_.swap_cost * cost;
+                }
+                e_sum += re.alpha * d_cost;
+            }
+        } else {
+            for (auto [p_idx, depth] : E) {
+                if (p_idx == cand_idx) continue;
+                const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+                    ? alpha_weights_[depth]
+                    : std::pow(config_.E_alpha, depth);
+                auto it = canonical_data.find(p_idx);
+                if (it == canonical_data.end()) continue;
+                const auto& entry = it->second;
+                double d_cost = 0.0;
+                for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                    const double d = dist(output_perm[entry.edges_u[i]],
+                                          output_perm[entry.edges_v[i]]);
+                    const double cost = d - 1.0;
+                    if (cost > 0.0) d_cost += config_.swap_cost * cost;
+                }
+                e_sum += alpha * d_cost;
             }
-            e_sum += alpha * d_cost;
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
     }
 
+    if (out_swaps) *out_swaps = std::move(swaps);
+    if (out_pi_new) *out_pi_new = std::move(output_perm);
     return score;
 }
 
@@ -1145,47 +1281,74 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         // Generate extended set
         auto E = generate_extended_set(F, resolved, cg, pg);
 
-        // Score all candidates
-        std::vector<double> scores;
-        scores.reserve(candidates.size());
-        for (const auto* cand : candidates) {
-            scores.push_back(score_candidate(
-                *cand,
-                F,
-                pi,
-                E,
-                reverse,
-                canonical_data,
-                &swap_cache,
-                &decay
-            ));
+        // Pre-resolve canonical entries for F and E once per F-step
+        std::vector<ResolvedEntry> resolved_F;
+        resolved_F.reserve(F.size());
+        for (int p_idx : F) {
+            auto it = canonical_data.find(p_idx);
+            const CanonicalEntry* ent = (it != canonical_data.end()) ? &it->second : nullptr;
+            if (ent && ent->edges_u.empty()) ent = nullptr;
+            resolved_F.push_back({p_idx, ent, 1.0});
+        }
+        std::vector<ResolvedEntry> resolved_E;
+        resolved_E.reserve(E.size());
+        for (auto [p_idx, depth] : E) {
+            auto it = canonical_data.find(p_idx);
+            const CanonicalEntry* ent = (it != canonical_data.end()) ? &it->second : nullptr;
+            if (ent && ent->edges_u.empty()) ent = nullptr;
+            const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+                ? alpha_weights_[depth]
+                : std::pow(config_.E_alpha, depth);
+            resolved_E.push_back({p_idx, ent, alpha});
+        }
+
+        // Group candidates by partition_idx so build_neighbor_info is shared
+        std::vector<size_t> order(candidates.size());
+        std::iota(order.begin(), order.end(), 0);
+        std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
+            return candidates[a]->partition_idx < candidates[b]->partition_idx;
+        });
+
+        // Score all candidates and cache each one's transform output
+        std::vector<double> scores(candidates.size());
+        std::vector<std::vector<std::pair<int,int>>> cached_swaps(candidates.size());
+        std::vector<std::vector<int>> cached_pi(candidates.size());
+        int prev_partition_idx = -1;
+        NeighborInfo cached_ni;
+        for (size_t k_ord = 0; k_ord < order.size(); k_ord++) {
+            const size_t ci = order[k_ord];
+            const int p_idx = candidates[ci]->partition_idx;
+            if (p_idx != prev_partition_idx) {
+                cached_ni = build_neighbor_info(p_idx, F, E, pi, canonical_data);
+                prev_partition_idx = p_idx;
+            }
+            scores[ci] = score_candidate(
+                *candidates[ci],
+                F, pi, E, reverse, canonical_data,
+                &swap_cache, &decay,
+                &cached_swaps[ci], &cached_pi[ci],
+                &resolved_F, &resolved_E,
+                &cached_ni
+            );
         }
 
         // Select best
         const auto& best = select_best_candidate(candidates, scores, rng);
+        // Find selected index to retrieve cached transform
+        size_t best_ci = 0;
+        for (size_t ci = 0; ci < candidates.size(); ci++) {
+            if (candidates[ci] == &best) { best_ci = ci; break; }
+        }
 
         // Remove from F and mark resolved
         F.erase(std::remove(F.begin(), F.end(), best.partition_idx), F.end());
         in_F[best.partition_idx] = 0;
         resolved[best.partition_idx] = 1;
 
-        // Apply transform
-        const auto neighbor_info = build_neighbor_info(
-            best.partition_idx,
-            F,
-            E,
-            pi,
-            canonical_data
-        );
-        const NeighborInfo* neighbor_ptr =
-            neighbor_info.uses_tiebreak() ? &neighbor_info : nullptr;
-        auto [swaps, pi_new] = transform_pi(
-            best,
-            pi,
-            reverse,
-            &swap_cache,
-            neighbor_ptr
-        );
+        // Reuse cached transform from scoring (F_snapshot \ {best} == F_after_erase
+        // because exclude_partition_idx == best.partition_idx in both cases)
+        std::vector<std::pair<int,int>> swaps = std::move(cached_swaps[best_ci]);
+        std::vector<int> pi_new = std::move(cached_pi[best_ci]);
         total_cost += config_.trial_swap_cnot_cost * static_cast<int>(swaps.size())
                       + best.cnot_count;
         pi = std::move(pi_new);

From 597f15e855e8ca3266c731e5072cc4ed7706d7dc Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 10:15:05 +0200
Subject: [PATCH 146/232] better seeding

---
 .../sabre_router/include/sabre_router.hpp     | 15 ++++
 .../src-cpp/sabre_router/sabre_router.cpp     | 90 +++++++++++++++++--
 squander/synthesis/PartAM.py                  | 58 ++++++++++--
 3 files changed, 150 insertions(+), 13 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 983ad2581..486aa5440 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -341,6 +341,21 @@ class SabreRouter {
     // Random permutation of [0..N-1]
     std::vector<int> random_permutation(int n, std::mt19937& rng) const;
 
+    // Apply a small random walk on topology edges to diversify a seeded layout.
+    std::vector<int> perturb_layout(
+        const std::vector<int>& base,
+        int num_swaps,
+        std::mt19937& rng
+    ) const;
+
+    // Stratified initial-layout sampling with the same total trial budget.
+    std::vector<int> sample_initial_layout(
+        int trial_idx,
+        int n_trials,
+        const std::vector<int>& seeded_pi,
+        std::mt19937& rng
+    ) const;
+
     // Build P_route_inv: the inverse permutation used for routing
     std::vector<int> build_route_inv(const std::vector<int>& P, bool reverse) const;
 
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index f7fe6ae62..88dd693a4 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -167,6 +167,85 @@ std::vector<int> SabreRouter::random_permutation(int n, std::mt19937& rng) const
     return perm;
 }
 
+std::vector<int> SabreRouter::perturb_layout(
+    const std::vector<int>& base,
+    int num_swaps,
+    std::mt19937& rng
+) const {
+    if (num_swaps <= 0 || adj_.empty()) {
+        return base;
+    }
+
+    std::vector<std::pair<int, int>> swaps;
+    swaps.reserve(num_swaps);
+    std::uniform_int_distribution<int> phys_dist(0, N_ - 1);
+
+    for (int step = 0; step < num_swaps; step++) {
+        int phys = phys_dist(rng);
+        int retries = 0;
+        while (adj_[phys].empty() && retries < N_) {
+            phys = (phys + 1) % N_;
+            retries++;
+        }
+        if (adj_[phys].empty()) {
+            break;
+        }
+        std::uniform_int_distribution<int> nb_dist(
+            0, static_cast<int>(adj_[phys].size()) - 1
+        );
+        int nb = adj_[phys][nb_dist(rng)];
+        swaps.push_back({std::min(phys, nb), std::max(phys, nb)});
+    }
+
+    if (swaps.empty()) {
+        return base;
+    }
+
+    return apply_swaps_to_pi(base, swaps);
+}
+
+std::vector<int> SabreRouter::sample_initial_layout(
+    int trial_idx,
+    int n_trials,
+    const std::vector<int>& seeded_pi,
+    std::mt19937& rng
+) const {
+    if (n_trials <= 1) {
+        return seeded_pi;
+    }
+
+    std::vector<int> mirrored_pi(N_);
+    for (int q = 0; q < N_; q++) {
+        mirrored_pi[q] = (N_ - 1) - seeded_pi[q];
+    }
+
+    if (trial_idx == 0) {
+        return seeded_pi;
+    }
+    if (trial_idx == 1) {
+        return mirrored_pi;
+    }
+
+    const int local_cutoff = std::max(
+        3, static_cast<int>(std::ceil(n_trials * 0.6))
+    );
+    if (trial_idx < local_cutoff) {
+        const int local_idx = trial_idx - 2;
+        const int band_idx = local_idx / 2;
+        const int local_budget = std::max(1, local_cutoff - 2);
+        const double phase = static_cast<double>(band_idx)
+            / std::max(1, local_budget / 2);
+        const int num_swaps = (phase < 0.5)
+            ? (1 + (band_idx % 3))
+            : (4 + (band_idx % 5));
+        const std::vector<int>& base =
+            (local_idx % 2 == 0) ? seeded_pi : mirrored_pi;
+        return perturb_layout(base, num_swaps, rng);
+    }
+
+    return random_permutation(N_, rng);
+}
+
 // ---------------------------------------------------------------------------
 // Helper: build P_route_inv
 // ---------------------------------------------------------------------------
@@ -1429,14 +1508,9 @@ TrialResult SabreRouter::run_trial(
     std::mt19937 rng_gen(config_.random_seed + trial_idx);
     std::mt19937* rng = (n_trials > 1) ? &rng_gen : nullptr;
 
-    // vf2_cutoff: first 5% of trials use seeded layout
-    int vf2_cutoff = std::max(1, static_cast<int>(n_trials * 0.05));
-    std::vector<int> pi;
-    if (trial_idx < vf2_cutoff) {
-        pi = seeded_pi;
-    } else {
-        pi = random_permutation(N_, rng_gen);
-    }
+    std::vector<int> pi = sample_initial_layout(
+        trial_idx, n_trials, seeded_pi, rng_gen
+    );
 
     auto F_rev = get_final_layer();
     auto F_fwd = get_initial_layer();
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 562ea3f67..dfbc0c344 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -772,11 +772,7 @@ def _run_single_layout_trial(
             if n_trials > 1
             else None
         )
-        vf2_cutoff = max(1, int(n_trials * 0.05))
-        if trial_idx < vf2_cutoff:
-            pi = seeded_pi.copy()
-        else:
-            pi = rng.permutation(N)
+        pi = self._sample_initial_layout(trial_idx, n_trials, seeded_pi, rng)
 
         for iteration in range(n_iterations):
             F_rev = self.get_final_layer(DAG, N, layout_partitions)
@@ -1233,6 +1229,58 @@ def _apply_swaps_to_pi(pi, swaps):
             pi_new[q1], pi_new[q2] = P2, P1
         return pi_new
 
+    def _perturb_layout(self, base_pi, num_swaps, rng):
+        if num_swaps <= 0 or rng is None or not self._adj:
+            return np.asarray(base_pi, dtype=np.int64).copy()
+
+        swaps = []
+        N = len(base_pi)
+        for _ in range(num_swaps):
+            phys = int(rng.randint(N))
+            retries = 0
+            while not self._adj[phys] and retries < N:
+                phys = (phys + 1) % N
+                retries += 1
+            if not self._adj[phys]:
+                break
+            nb = int(self._adj[phys][rng.randint(len(self._adj[phys]))])
+            swaps.append((min(phys, nb), max(phys, nb)))
+
+        if not swaps:
+            return np.asarray(base_pi, dtype=np.int64).copy()
+
+        return np.asarray(
+            self._apply_swaps_to_pi(base_pi, swaps), dtype=np.int64
+        )
+
+    def _sample_initial_layout(self, trial_idx, n_trials, seeded_pi, rng):
+        seeded_pi = np.asarray(seeded_pi, dtype=np.int64)
+        if n_trials <= 1 or rng is None:
+            return seeded_pi.copy()
+
+        mirrored_pi = (len(seeded_pi) - 1) - seeded_pi
+
+        if trial_idx == 0:
+            return seeded_pi.copy()
+        if trial_idx == 1:
+            return mirrored_pi.copy()
+
+        local_cutoff = max(3, int(np.ceil(n_trials * 0.6)))
+        if trial_idx < local_cutoff:
+            local_idx = trial_idx - 2
+            band_idx = local_idx // 2
+            local_budget = max(1, local_cutoff - 2)
+            phase = band_idx / max(1, local_budget // 2)
+            num_swaps = (
+                1 + (band_idx % 3)
+                if phase < 0.5
+                else 4 + (band_idx % 5)
+            )
+            base = seeded_pi if local_idx % 2 == 0 else mirrored_pi
+            return self._perturb_layout(base, num_swaps, rng)
+
+        return rng.permutation(len(seeded_pi))
+
     def _bfs_shortest_path(self, src, dst):
         if src == dst:
             return [src]

From 1adea7eda530df89c61090f9e97d77728657dcfb Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 10:57:59 +0200
Subject: [PATCH 147/232] Routing changes

---
 .../sabre_router/include/sabre_router.hpp     |  27 ++
 .../src-cpp/sabre_router/sabre_router.cpp     | 252 +++++++++----
 squander/synthesis/PartAM.py                  | 342 ++++++++++++++----
 squander/synthesis/bindings.cpp               |  63 +++-
 4 files changed, 531 insertions(+), 153 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 486aa5440..bdd002875 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -17,6 +17,7 @@ Ported from squander/synthesis/PartAM.py and PartAM_utils.py.
 #include <optional>
 #include <queue>
 #include <random>
+#include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
@@ -67,6 +68,12 @@ struct CanonicalEntry {
     std::vector<int> edges_u; // virtual qubit indices
     std::vector<int> edges_v;
     int cnot;
+    struct FutureVariant {
+        std::vector<int> edges_u;
+        std::vector<int> edges_v;
+        int cnot = 0;
+    };
+    std::vector<FutureVariant> variants;
 };
 
 struct LayoutPartInfo {
@@ -92,6 +99,9 @@ struct SabreConfig {
     bool release_valve_enabled = true;
     int release_valve_threshold = 20;
     double path_tiebreak_weight = 0.2;
+    std::string future_cost_mode = "candidate_min";
+    int future_candidate_top_k = 4;
+    double future_candidate_weight = 1.0;
 };
 
 struct TrialResult {
@@ -383,6 +393,23 @@ class SabreRouter {
         const std::unordered_map<int, CanonicalEntry>& canonical_data
     ) const;
 
+    double variant_routing_cost(
+        const CanonicalEntry::FutureVariant& variant,
+        const std::vector<int>& pi
+    ) const;
+
+    double entry_future_cost(
+        const CanonicalEntry& entry,
+        const std::vector<int>& pi
+    ) const;
+
+    double future_partition_cost(
+        int partition_idx,
+        const CanonicalEntry* entry,
+        const std::vector<int>& pi,
+        bool reverse
+    ) const;
+
     // Immutable data members
     SabreConfig config_;
     int N_; // number of physical qubits
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 88dd693a4..154618349 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -366,8 +366,32 @@ NeighborInfo SabreRouter::build_neighbor_info(
         auto it = canonical_data.find(partition_idx);
         if (it == canonical_data.end()) return;
         const auto& entry = it->second;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            add_edge(entry.edges_u[i], entry.edges_v[i], weight);
+        std::vector<const CanonicalEntry::FutureVariant*> active_variants;
+        if (!entry.variants.empty()) {
+            const size_t limit = (config_.future_cost_mode == "canonical")
+                ? std::min<size_t>(1, entry.variants.size())
+                : entry.variants.size();
+            active_variants.reserve(limit);
+            for (size_t i = 0; i < limit; i++) {
+                if (!entry.variants[i].edges_u.empty()) {
+                    active_variants.push_back(&entry.variants[i]);
+                }
+            }
+        } else if (!entry.edges_u.empty()) {
+            CanonicalEntry::FutureVariant primary;
+            primary.edges_u = entry.edges_u;
+            primary.edges_v = entry.edges_v;
+            primary.cnot = entry.cnot;
+            active_variants.push_back(&primary);
+        }
+        if (active_variants.empty()) {
+            return;
+        }
+        const double variant_weight = weight / static_cast<double>(active_variants.size());
+        for (const auto* variant : active_variants) {
+            for (size_t i = 0; i < variant->edges_u.size(); i++) {
+                add_edge(variant->edges_u[i], variant->edges_v[i], variant_weight);
+            }
         }
     };
 
@@ -464,7 +488,8 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
     const std::vector<int>& pi,
     const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
-    double best_worst_dist = std::numeric_limits<double>::infinity();
+    double best_worst_dist = -std::numeric_limits<double>::infinity();
+    int best_partition_idx = -1;
     int best_u = -1;
     int best_v = -1;
 
@@ -474,30 +499,71 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
             continue;
         }
         const auto& entry = it->second;
-        if (entry.edges_u.empty()) {
-            continue;
-        }
-
+        double chosen_route_cost = std::numeric_limits<double>::infinity();
         double worst_dist = 0.0;
         int worst_u = -1;
         int worst_v = -1;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            const int u = entry.edges_u[i];
-            const int v = entry.edges_v[i];
-            const double d = dist(pi[u], pi[v]);
-            if (d > worst_dist) {
-                worst_dist = d;
-                worst_u = u;
-                worst_v = v;
+
+        auto consider_variant = [&](const CanonicalEntry::FutureVariant& variant) {
+            if (variant.edges_u.empty()) {
+                return;
             }
+            double route_cost = 0.0;
+            double variant_worst_dist = 0.0;
+            int variant_worst_u = -1;
+            int variant_worst_v = -1;
+            for (size_t i = 0; i < variant.edges_u.size(); i++) {
+                const int u = variant.edges_u[i];
+                const int v = variant.edges_v[i];
+                const double d = dist(pi[u], pi[v]);
+                const double cost = d - 1.0;
+                if (cost > 0.0) {
+                    route_cost += config_.swap_cost * cost;
+                }
+                if (d > variant_worst_dist) {
+                    variant_worst_dist = d;
+                    variant_worst_u = u;
+                    variant_worst_v = v;
+                }
+            }
+            if (
+                route_cost < chosen_route_cost
+                || (route_cost == chosen_route_cost
+                    && variant_worst_dist < worst_dist)
+            ) {
+                chosen_route_cost = route_cost;
+                worst_dist = variant_worst_dist;
+                worst_u = variant_worst_u;
+                worst_v = variant_worst_v;
+            }
+        };
+
+        if (!entry.variants.empty()) {
+            const size_t limit = (config_.future_cost_mode == "canonical")
+                ? std::min<size_t>(1, entry.variants.size())
+                : entry.variants.size();
+            for (size_t i = 0; i < limit; i++) {
+                consider_variant(entry.variants[i]);
+            }
+        } else {
+            CanonicalEntry::FutureVariant primary;
+            primary.edges_u = entry.edges_u;
+            primary.edges_v = entry.edges_v;
+            primary.cnot = entry.cnot;
+            consider_variant(primary);
         }
 
         if (worst_dist <= 1.0 || worst_u < 0) {
             continue;
         }
 
-        if (worst_dist < best_worst_dist) {
+        if (
+            worst_dist > best_worst_dist
+            || (worst_dist == best_worst_dist
+                && (best_partition_idx < 0 || partition_idx < best_partition_idx))
+        ) {
             best_worst_dist = worst_dist;
+            best_partition_idx = partition_idx;
             best_u = worst_u;
             best_v = worst_v;
         }
@@ -1002,6 +1068,85 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
 // Routing cost helpers
 // ---------------------------------------------------------------------------
 
+double SabreRouter::variant_routing_cost(
+    const CanonicalEntry::FutureVariant& variant,
+    const std::vector<int>& pi
+) const {
+    double total = 0.0;
+    for (size_t i = 0; i < variant.edges_u.size(); i++) {
+        const double d = dist(pi[variant.edges_u[i]], pi[variant.edges_v[i]]);
+        const double cost = d - 1.0;
+        if (cost > 0.0) {
+            total += config_.swap_cost * cost;
+        }
+    }
+    return total;
+}
+
+double SabreRouter::entry_future_cost(
+    const CanonicalEntry& entry,
+    const std::vector<int>& pi
+) const {
+    if (!entry.variants.empty()) {
+        const size_t limit = (config_.future_cost_mode == "canonical")
+            ? std::min<size_t>(1, entry.variants.size())
+            : entry.variants.size();
+        double best = std::numeric_limits<double>::infinity();
+        for (size_t i = 0; i < limit; i++) {
+            const auto& variant = entry.variants[i];
+            double cost = variant_routing_cost(variant, pi);
+            if (config_.future_cost_mode != "canonical") {
+                cost += config_.future_candidate_weight
+                    * config_.local_cost_weight
+                    * static_cast<double>(variant.cnot);
+            }
+            if (cost < best) {
+                best = cost;
+            }
+        }
+        return std::isfinite(best) ? best : 0.0;
+    }
+
+    CanonicalEntry::FutureVariant primary;
+    primary.edges_u = entry.edges_u;
+    primary.edges_v = entry.edges_v;
+    primary.cnot = entry.cnot;
+    return variant_routing_cost(primary, pi);
+}
+
+double SabreRouter::future_partition_cost(
+    int partition_idx,
+    const CanonicalEntry* entry,
+    const std::vector<int>& pi,
+    bool reverse
+) const {
+    if (
+        config_.future_cost_mode == "candidate_min"
+        && partition_idx >= 0
+        && partition_idx < static_cast<int>(candidate_cache_.size())
+        && !candidate_cache_[partition_idx].empty()
+    ) {
+        double best = std::numeric_limits<double>::infinity();
+        for (const auto& cand : candidate_cache_[partition_idx]) {
+            double cost = config_.swap_cost
+                * static_cast<double>(estimate_swap_count(cand, pi, reverse));
+            cost += config_.future_candidate_weight
+                * config_.local_cost_weight
+                * static_cast<double>(cand.cnot_count);
+            if (cost < best) {
+                best = cost;
+            }
+        }
+        if (std::isfinite(best)) {
+            return best;
+        }
+    }
+    if (entry == nullptr) {
+        return 0.0;
+    }
+    return entry_future_cost(*entry, pi);
+}
+
 double SabreRouter::compute_routing_cost(
     const std::vector<int>& pi,
     int exclude_partition_idx,
@@ -1013,15 +1158,7 @@ double SabreRouter::compute_routing_cost(
         if (p_idx == exclude_partition_idx) continue;
         auto it = canonical_data.find(p_idx);
         if (it == canonical_data.end()) continue;
-        const auto& entry = it->second;
-        if (entry.edges_u.empty()) continue;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            int u = entry.edges_u[i];
-            int v = entry.edges_v[i];
-            double d = dist(pi[u], pi[v]);
-            double cost = d - 1.0;
-            if (cost > 0.0) total += config_.swap_cost * cost;
-        }
+        total += entry_future_cost(it->second, pi);
     }
     return total;
 }
@@ -1038,16 +1175,7 @@ double SabreRouter::compute_lookahead_cost(
         if (p_idx == exclude_partition_idx) continue;
         auto it = canonical_data.find(p_idx);
         if (it == canonical_data.end()) continue;
-        const auto& entry = it->second;
-        if (entry.edges_u.empty()) continue;
-        double d_cost = 0.0;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            int u = entry.edges_u[i];
-            int v = entry.edges_v[i];
-            double d = dist(pi[u], pi[v]);
-            double cost = d - 1.0;
-            if (cost > 0.0) d_cost += config_.swap_cost * cost;
-        }
+        const double d_cost = entry_future_cost(it->second, pi);
         const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
             ? alpha_weights_[depth]
             : std::pow(config_.E_alpha, depth);
@@ -1105,29 +1233,21 @@ double SabreRouter::score_candidate(
     if (resolved_F) {
         for (const auto& re : *resolved_F) {
             if (re.partition_idx == cand_idx) continue;
-            if (!re.entry) continue;
             n_other++;
-            const auto& entry = *re.entry;
-            for (size_t i = 0; i < entry.edges_u.size(); i++) {
-                const double d = dist(output_perm[entry.edges_u[i]],
-                                      output_perm[entry.edges_v[i]]);
-                const double cost = d - 1.0;
-                if (cost > 0.0) f_sum += config_.swap_cost * cost;
-            }
+            f_sum += future_partition_cost(
+                re.partition_idx, re.entry, output_perm, reverse
+            );
         }
     } else {
         for (int p_idx : F_snapshot) {
             if (p_idx == cand_idx) continue;
             auto it = canonical_data.find(p_idx);
-            if (it == canonical_data.end()) continue;
-            const auto& entry = it->second;
             n_other++;
-            for (size_t i = 0; i < entry.edges_u.size(); i++) {
-                const double d = dist(output_perm[entry.edges_u[i]],
-                                      output_perm[entry.edges_v[i]]);
-                const double cost = d - 1.0;
-                if (cost > 0.0) f_sum += config_.swap_cost * cost;
-            }
+            const CanonicalEntry* entry =
+                (it != canonical_data.end()) ? &it->second : nullptr;
+            f_sum += future_partition_cost(
+                p_idx, entry, output_perm, reverse
+            );
         }
     }
     if (n_other > 0) score += f_sum / static_cast<double>(n_other);
@@ -1138,16 +1258,9 @@ double SabreRouter::score_candidate(
         if (resolved_E) {
             for (const auto& re : *resolved_E) {
                 if (re.partition_idx == cand_idx) continue;
-                if (!re.entry) continue;
-                const auto& entry = *re.entry;
-                double d_cost = 0.0;
-                for (size_t i = 0; i < entry.edges_u.size(); i++) {
-                    const double d = dist(output_perm[entry.edges_u[i]],
-                                          output_perm[entry.edges_v[i]]);
-                    const double cost = d - 1.0;
-                    if (cost > 0.0) d_cost += config_.swap_cost * cost;
-                }
-                e_sum += re.alpha * d_cost;
+                e_sum += re.alpha * future_partition_cost(
+                    re.partition_idx, re.entry, output_perm, reverse
+                );
             }
         } else {
             for (auto [p_idx, depth] : E) {
@@ -1156,16 +1269,11 @@ double SabreRouter::score_candidate(
                     ? alpha_weights_[depth]
                     : std::pow(config_.E_alpha, depth);
                 auto it = canonical_data.find(p_idx);
-                if (it == canonical_data.end()) continue;
-                const auto& entry = it->second;
-                double d_cost = 0.0;
-                for (size_t i = 0; i < entry.edges_u.size(); i++) {
-                    const double d = dist(output_perm[entry.edges_u[i]],
-                                          output_perm[entry.edges_v[i]]);
-                    const double cost = d - 1.0;
-                    if (cost > 0.0) d_cost += config_.swap_cost * cost;
-                }
-                e_sum += alpha * d_cost;
+                const CanonicalEntry* entry =
+                    (it != canonical_data.end()) ? &it->second : nullptr;
+                e_sum += alpha * future_partition_cost(
+                    p_idx, entry, output_perm, reverse
+                );
             }
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
@@ -1366,7 +1474,6 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         for (int p_idx : F) {
             auto it = canonical_data.find(p_idx);
             const CanonicalEntry* ent = (it != canonical_data.end()) ? &it->second : nullptr;
-            if (ent && ent->edges_u.empty()) ent = nullptr;
             resolved_F.push_back({p_idx, ent, 1.0});
         }
         std::vector<ResolvedEntry> resolved_E;
@@ -1374,7 +1481,6 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         for (auto [p_idx, depth] : E) {
             auto it = canonical_data.find(p_idx);
             const CanonicalEntry* ent = (it != canonical_data.end()) ? &it->second : nullptr;
-            if (ent && ent->edges_u.empty()) ent = nullptr;
             const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
                 ? alpha_weights_[depth]
                 : std::pow(config_.E_alpha, depth);
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index dfbc0c344..c91f483d5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -156,6 +156,9 @@ def __init__(self, config):
         self.config.setdefault('release_valve_enabled', True)
         self.config.setdefault('release_valve_threshold', 20)
         self.config.setdefault('path_tiebreak_weight', 0.2)
+        self.config.setdefault('future_cost_mode', 'candidate_min')
+        self.config.setdefault('future_candidate_top_k', 4)
+        self.config.setdefault('future_candidate_weight', 1.0)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -928,6 +931,15 @@ def _run_layout_trials_cpp(
         cfg.path_tiebreak_weight = self.config.get(
             'path_tiebreak_weight', 0.2
         )
+        cfg.future_cost_mode = self.config.get(
+            'future_cost_mode', 'candidate_min'
+        )
+        cfg.future_candidate_top_k = self.config.get(
+            'future_candidate_top_k', 4
+        )
+        cfg.future_candidate_weight = self.config.get(
+            'future_candidate_weight', 1.0
+        )
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -1301,24 +1313,149 @@ def _bfs_shortest_path(self, src, dst):
                 q.append(nb)
         return []
 
-    def _release_valve(self, F, pi, D, canonical_data):
+    @staticmethod
+    def _entry_variants(entry, future_cost_mode="canonical"):
+        variants = entry.get("variants")
+        if variants:
+            if future_cost_mode == "canonical":
+                return variants[:1]
+            return variants
+        return (entry,)
+
+    @staticmethod
+    def _variant_routing_cost(variant, output_perm_arr, D_arr, swap_cost):
+        eu = variant["edges_u"]
+        if eu is None:
+            return 0.0
+        phys_u = output_perm_arr[eu]
+        phys_v = output_perm_arr[variant["edges_v"]]
+        return float(swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum())
+
+    @staticmethod
+    def _entry_future_cost(
+        entry,
+        output_perm_arr,
+        D_arr,
+        swap_cost,
+        local_cost_weight,
+        future_cost_mode="canonical",
+        future_candidate_weight=1.0,
+    ):
+        variants = qgd_Partition_Aware_Mapping._entry_variants(
+            entry, future_cost_mode=future_cost_mode
+        )
+        best = None
+        for variant in variants:
+            route_cost = qgd_Partition_Aware_Mapping._variant_routing_cost(
+                variant, output_perm_arr, D_arr, swap_cost
+            )
+            if future_cost_mode == "canonical":
+                variant_cost = route_cost
+            else:
+                variant_cost = route_cost + (
+                    future_candidate_weight
+                    * local_cost_weight
+                    * variant["cnot"]
+                )
+            if best is None or variant_cost < best:
+                best = variant_cost
+        return 0.0 if best is None else best
+
+    @staticmethod
+    def _future_partition_cost(
+        partition_idx,
+        entry,
+        output_perm_arr,
+        D_arr,
+        swap_cost,
+        local_cost_weight,
+        reverse=False,
+        candidate_cache=None,
+        future_cost_mode="canonical",
+        future_candidate_weight=1.0,
+    ):
+        if future_cost_mode == "candidate_min" and candidate_cache is not None:
+            candidates = candidate_cache[partition_idx]
+            if candidates:
+                best = None
+                for candidate in candidates:
+                    candidate_cost = (
+                        swap_cost
+                        * candidate.estimate_swap_count(
+                            output_perm_arr, D_arr, reverse=reverse
+                        )
+                        + future_candidate_weight
+                        * local_cost_weight
+                        * candidate.cnot_count
+                    )
+                    if best is None or candidate_cost < best:
+                        best = candidate_cost
+                if best is not None:
+                    return float(best)
+        if entry is None:
+            return 0.0
+        return qgd_Partition_Aware_Mapping._entry_future_cost(
+            entry,
+            output_perm_arr,
+            D_arr,
+            swap_cost,
+            local_cost_weight,
+            future_cost_mode=future_cost_mode,
+            future_candidate_weight=future_candidate_weight,
+        )
+
+    @staticmethod
+    def _best_release_variant(entry, pi_arr, D_arr, future_cost_mode="canonical"):
+        best = None
+        for variant in qgd_Partition_Aware_Mapping._entry_variants(
+            entry, future_cost_mode=future_cost_mode
+        ):
+            eu = variant["edges_u"]
+            if eu is None:
+                continue
+            ev = variant["edges_v"]
+            phys_u = pi_arr[eu]
+            phys_v = pi_arr[ev]
+            dists = D_arr[phys_u, phys_v]
+            if dists.size == 0:
+                continue
+            route_sum = float(np.maximum(0, dists - 1).sum())
+            worst_idx = int(np.argmax(dists))
+            worst_d = float(dists[worst_idx])
+            worst_pair = (int(eu[worst_idx]), int(ev[worst_idx]))
+            if (
+                best is None
+                or route_sum < best[0]
+                or (route_sum == best[0] and worst_d < best[1])
+                or (
+                    route_sum == best[0]
+                    and worst_d == best[1]
+                    and worst_pair < best[2]
+                )
+            ):
+                best = (route_sum, worst_d, worst_pair)
+        return best
+
+    def _release_valve(self, F, pi, D, canonical_data, future_cost_mode="canonical"):
+        pi_arr = np.asarray(pi, dtype=np.intp)
+        D_arr = np.asarray(D)
         best = None
         for p_idx in F:
             entry = canonical_data.get(p_idx)
-            if entry is None or entry["edges_u"] is None:
+            if entry is None:
+                continue
+            best_variant = self._best_release_variant(
+                entry,
+                pi_arr,
+                D_arr,
+                future_cost_mode=future_cost_mode,
+            )
+            if best_variant is None:
                 continue
-            eu, ev = entry["edges_u"], entry["edges_v"]
-            worst_d = 0
-            worst_pair = None
-            for i in range(len(eu)):
-                u, v = int(eu[i]), int(ev[i])
-                d = D[int(pi[u])][int(pi[v])]
-                if d > worst_d:
-                    worst_d = d
-                    worst_pair = (u, v)
+            _, worst_d, worst_pair = best_variant
             if worst_d <= 1 or worst_pair is None:
                 continue
-            if best is None or worst_d < best[0] or (
+            if best is None or worst_d > best[0] or (
                 worst_d == best[0] and p_idx < best[1]
             ):
                 best = (worst_d, p_idx, worst_pair[0], worst_pair[1])
@@ -1351,6 +1488,7 @@ def _build_neighbor_info(
         weight=0.2,
         W=0.5,
         alpha=0.9,
+        future_cost_mode="canonical",
     ):
         if canonical_data is None or weight <= 0:
             return None
@@ -1362,15 +1500,28 @@ def add_edges(target_idx, edge_weight):
             if target_idx == partition_idx or edge_weight <= 0:
                 return
             entry = canonical_data.get(target_idx)
-            if entry is None or entry["edges_u"] is None:
+            if entry is None:
                 return
-            for u, v in zip(entry["edges_u"], entry["edges_v"]):
-                u = int(u)
-                v = int(v)
-                qubits.add(u)
-                qubits.add(v)
-                key = (u, v) if u <= v else (v, u)
-                edge_weights[key] = edge_weights.get(key, 0.0) + edge_weight
+            variants = [
+                variant
+                for variant in qgd_Partition_Aware_Mapping._entry_variants(
+                    entry, future_cost_mode=future_cost_mode
+                )
+                if variant["edges_u"] is not None
+            ]
+            if not variants:
+                return
+            variant_weight = edge_weight / len(variants)
+            for variant in variants:
+                for u, v in zip(variant["edges_u"], variant["edges_v"]):
+                    u = int(u)
+                    v = int(v)
+                    qubits.add(u)
+                    qubits.add(v)
+                    key = (u, v) if u <= v else (v, u)
+                    edge_weights[key] = (
+                        edge_weights.get(key, 0.0) + variant_weight
+                    )
 
         for future_idx in F:
             add_edges(future_idx, 1.0)
@@ -1463,6 +1614,10 @@ def Heuristic_Search(
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
         E_alpha = self.config.get("E_alpha", 0.9)
+        future_cost_mode = self.config.get("future_cost_mode", "candidate_min")
+        future_candidate_weight = self.config.get(
+            "future_candidate_weight", 1.0
+        )
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
@@ -1477,7 +1632,11 @@ def Heuristic_Search(
                 and swap_burst > self.config.get("release_valve_threshold", 20)
             ):
                 valve_swaps, pi_bridged = self._release_valve(
-                    F, pi, D, canonical_data
+                    F,
+                    pi,
+                    D,
+                    canonical_data,
+                    future_cost_mode=future_cost_mode,
                 )
                 if valve_swaps:
                     partition_order.append(
@@ -1533,6 +1692,9 @@ def Heuristic_Search(
                         "path_tiebreak_weight", 0.2
                     ),
                     decay=decay,
+                    future_cost_mode=future_cost_mode,
+                    future_candidate_weight=future_candidate_weight,
+                    candidate_cache=candidate_cache,
                 )
                 for partition_candidate in partition_candidates
             ]
@@ -1554,6 +1716,7 @@ def Heuristic_Search(
                 weight=self.config.get("path_tiebreak_weight", 0.2),
                 W=E_W,
                 alpha=E_alpha,
+                future_cost_mode=future_cost_mode,
             )
             swap_order, pi = min_partition_candidate.transform_pi(
                 pi,
@@ -1663,6 +1826,10 @@ def _heuristic_search_layout_only(
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
         E_alpha = self.config.get("E_alpha", 0.9)
+        future_cost_mode = self.config.get("future_cost_mode", "candidate_min")
+        future_candidate_weight = self.config.get(
+            "future_candidate_weight", 1.0
+        )
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=reverse
@@ -1676,7 +1843,13 @@ def _heuristic_search_layout_only(
                 self.config.get("release_valve_enabled", True)
                 and swap_burst > self.config.get("release_valve_threshold", 20)
             ):
-                valve_swaps, pi = self._release_valve(F, pi, D, canonical_data)
+                valve_swaps, pi = self._release_valve(
+                    F,
+                    pi,
+                    D,
+                    canonical_data,
+                    future_cost_mode=future_cost_mode,
+                )
                 if valve_swaps:
                     total_cost += swap_cnot_cost * len(valve_swaps)
                     self._apply_decay_for_swaps(valve_swaps, decay)
@@ -1728,6 +1901,9 @@ def _heuristic_search_layout_only(
                         "path_tiebreak_weight", 0.2
                     ),
                     decay=decay,
+                    future_cost_mode=future_cost_mode,
+                    future_candidate_weight=future_candidate_weight,
+                    candidate_cache=candidate_cache,
                 )
                 for pc in partition_candidates
             ]
@@ -1747,6 +1923,7 @@ def _heuristic_search_layout_only(
                 weight=self.config.get("path_tiebreak_weight", 0.2),
                 W=E_W,
                 alpha=E_alpha,
+                future_cost_mode=future_cost_mode,
             )
             swaps, pi = best.transform_pi(
                 pi,
@@ -1833,45 +2010,68 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # Scoring
     # ------------------------------------------------------------------------
 
-    @staticmethod
-    def _build_canonical_neighbor_data(scoring_partitions, reverse=False):
-        """Per partition, keep only the virtual-qubit edges of the lowest-CNOT
-        (mini_topology, P_i, P_o) combo — LightSABRE-style: assume each F/E
-        partition will be scheduled with its best combo.
-
-        Returns dict {partition_idx: {'edges_u': np.intp[n_edges],
-                                       'edges_v': np.intp[n_edges],
-                                       'cnot': int}}.
-        Partitions with no mini-topology edges have edges_u = edges_v = None.
+    def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
+        """Build compact future-routing surrogates per partition.
+
+        The first stored variant is the old "canonical" lowest-CNOT surrogate.
+        Additional variants keep distinct future edge patterns alive so the
+        router can score a future partition by its best still-available option.
         """
+        future_cost_mode = self.config.get("future_cost_mode", "candidate_min")
+        top_k = 1 if future_cost_mode == "canonical" else max(
+            1, int(self.config.get("future_candidate_top_k", 4))
+        )
         data = {}
         for idx, partition in enumerate(scoring_partitions):
             if partition is None:
                 continue
             qbit_map_inv = {v: q for q, v in partition.qubit_map.items()}
-            best_cnot = None
-            best_edges = None
+            variant_map = {}
             for tdx, mini_topology in enumerate(partition.mini_topologies):
                 for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
                     cnot = len(partition.circuit_structures[tdx][pdx])
-                    if best_cnot is not None and cnot >= best_cnot:
-                        continue
                     P_route = P_o if reverse else P_i
                     if mini_topology:
-                        edges = [(qbit_map_inv[P_route[u]], qbit_map_inv[P_route[v]])
-                                 for u, v in mini_topology]
+                        edge_key = tuple(
+                            sorted(
+                                tuple(
+                                    sorted(
+                                        (
+                                            qbit_map_inv[P_route[u]],
+                                            qbit_map_inv[P_route[v]],
+                                        )
+                                    )
+                                )
+                                for u, v in mini_topology
+                            )
+                        )
                     else:
-                        edges = []
-                    best_cnot = cnot
-                    best_edges = edges
-            if best_cnot is None:
+                        edge_key = tuple()
+                    prev_cnot = variant_map.get(edge_key)
+                    if prev_cnot is None or cnot < prev_cnot:
+                        variant_map[edge_key] = cnot
+            if not variant_map:
                 continue
-            if best_edges:
-                eu = np.array([e[0] for e in best_edges], dtype=np.intp)
-                ev = np.array([e[1] for e in best_edges], dtype=np.intp)
-            else:
-                eu = ev = None
-            data[idx] = {'edges_u': eu, 'edges_v': ev, 'cnot': best_cnot}
+            variants = []
+            for edge_key, cnot in sorted(
+                variant_map.items(),
+                key=lambda item: (item[1], len(item[0]), item[0]),
+            )[:top_k]:
+                if edge_key:
+                    eu = np.array([e[0] for e in edge_key], dtype=np.intp)
+                    ev = np.array([e[1] for e in edge_key], dtype=np.intp)
+                else:
+                    eu = ev = None
+                variants.append(
+                    {"edges_u": eu, "edges_v": ev, "cnot": cnot}
+                )
+            primary = variants[0]
+            data[idx] = {
+                "edges_u": primary["edges_u"],
+                "edges_v": primary["edges_v"],
+                "cnot": primary["cnot"],
+                "variants": tuple(variants),
+            }
         return data
 
     @staticmethod
@@ -1879,7 +2079,10 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
                                   canonical_data=None, adj=None,
                                   local_cost_weight=0.1, swap_cost=15.0,
-                                  path_tiebreak_weight=0.2, decay=None):
+                                  path_tiebreak_weight=0.2, decay=None,
+                                  future_cost_mode="canonical",
+                                  future_candidate_weight=1.0,
+                                  candidate_cache=None):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = swap_cost * |swaps|
@@ -1896,6 +2099,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             weight=path_tiebreak_weight,
             W=W,
             alpha=alpha,
+            future_cost_mode=future_cost_mode,
         )
         swaps, output_perm = partition_candidate.transform_pi(
             pi,
@@ -1926,15 +2130,19 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             if partition_idx == cand_idx:
                 continue
             entry = canonical_data.get(partition_idx)
-            if entry is None:
-                continue
             n_other += 1
-            eu = entry['edges_u']
-            if eu is None:
-                continue
-            phys_u = output_perm_arr[eu]
-            phys_v = output_perm_arr[entry['edges_v']]
-            f_sum += swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+            f_sum += qgd_Partition_Aware_Mapping._future_partition_cost(
+                partition_idx,
+                entry,
+                output_perm_arr,
+                D_arr,
+                swap_cost,
+                local_cost_weight,
+                reverse=reverse,
+                candidate_cache=candidate_cache,
+                future_cost_mode=future_cost_mode,
+                future_candidate_weight=future_candidate_weight,
+            )
         if n_other > 0:
             score += f_sum / n_other
 
@@ -1945,14 +2153,18 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 if partition_idx == cand_idx:
                     continue
                 entry = canonical_data.get(partition_idx)
-                if entry is None:
-                    continue
-                eu = entry['edges_u']
-                if eu is None:
-                    continue
-                phys_u = output_perm_arr[eu]
-                phys_v = output_perm_arr[entry['edges_v']]
-                d_cost = swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()
+                d_cost = qgd_Partition_Aware_Mapping._future_partition_cost(
+                    partition_idx,
+                    entry,
+                    output_perm_arr,
+                    D_arr,
+                    swap_cost,
+                    local_cost_weight,
+                    reverse=reverse,
+                    candidate_cache=candidate_cache,
+                    future_cost_mode=future_cost_mode,
+                    future_candidate_weight=future_candidate_weight,
+                )
                 e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
 
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 3b1d167d8..63e785002 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -61,6 +61,32 @@ static CandidateData extract_candidate(py::handle pc) {
 // Helper: extract canonical_data dict -> unordered_map
 // ---------------------------------------------------------------------------
 
+static std::vector<int> extract_int_array(py::handle obj) {
+    std::vector<int> result;
+    auto arr = py::array_t<int, py::array::c_style | py::array::forcecast>::ensure(obj);
+    if (!arr) {
+        return result;
+    }
+    auto acc = arr.unchecked<1>();
+    result.resize(acc.shape(0));
+    for (ssize_t i = 0; i < acc.shape(0); i++) {
+        result[i] = acc(i);
+    }
+    return result;
+}
+
+static CanonicalEntry::FutureVariant extract_future_variant(py::dict d) {
+    CanonicalEntry::FutureVariant variant;
+    if (d.contains("edges_u") && !d["edges_u"].is_none()) {
+        variant.edges_u = extract_int_array(d["edges_u"]);
+    }
+    if (d.contains("edges_v") && !d["edges_v"].is_none()) {
+        variant.edges_v = extract_int_array(d["edges_v"]);
+    }
+    variant.cnot = d["cnot"].cast<int>();
+    return variant;
+}
+
 static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict cd) {
     std::unordered_map<int, CanonicalEntry> result;
     for (auto [key, val] : cd) {
@@ -69,24 +95,28 @@ static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict c
         // val is a dict with 'edges_u', 'edges_v', 'cnot'
         py::dict d = py::reinterpret_borrow<py::dict>(val);
         if (d.contains("edges_u") && !d["edges_u"].is_none()) {
-            // Python builds these arrays as np.intp; forcecast keeps the C++
-            // router from silently dropping canonical lookahead edges.
-            auto buf_u = py::array_t<int, py::array::c_style | py::array::forcecast>::ensure(d["edges_u"]);
-            if (buf_u) {
-                auto acc = buf_u.unchecked<1>();
-                entry.edges_u.resize(acc.shape(0));
-                for (ssize_t i = 0; i < acc.shape(0); i++) entry.edges_u[i] = acc(i);
-            }
+            entry.edges_u = extract_int_array(d["edges_u"]);
         }
         if (d.contains("edges_v") && !d["edges_v"].is_none()) {
-            auto buf_v = py::array_t<int, py::array::c_style | py::array::forcecast>::ensure(d["edges_v"]);
-            if (buf_v) {
-                auto acc = buf_v.unchecked<1>();
-                entry.edges_v.resize(acc.shape(0));
-                for (ssize_t i = 0; i < acc.shape(0); i++) entry.edges_v[i] = acc(i);
-            }
+            entry.edges_v = extract_int_array(d["edges_v"]);
         }
         entry.cnot = d["cnot"].cast<int>();
+
+        if (d.contains("variants") && !d["variants"].is_none()) {
+            py::iterable variants = py::reinterpret_borrow<py::iterable>(d["variants"]);
+            for (auto item : variants) {
+                entry.variants.push_back(
+                    extract_future_variant(py::reinterpret_borrow<py::dict>(item))
+                );
+            }
+        }
+        if (entry.variants.empty()) {
+            CanonicalEntry::FutureVariant primary;
+            primary.edges_u = entry.edges_u;
+            primary.edges_v = entry.edges_v;
+            primary.cnot = entry.cnot;
+            entry.variants.push_back(std::move(primary));
+        }
         result[pidx] = std::move(entry);
     }
     return result;
@@ -135,7 +165,10 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("decay_reset_interval", &SabreConfig::decay_reset_interval)
         .def_readwrite("release_valve_enabled", &SabreConfig::release_valve_enabled)
         .def_readwrite("release_valve_threshold", &SabreConfig::release_valve_threshold)
-        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight);
+        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
+        .def_readwrite("future_cost_mode", &SabreConfig::future_cost_mode)
+        .def_readwrite("future_candidate_top_k", &SabreConfig::future_candidate_top_k)
+        .def_readwrite("future_candidate_weight", &SabreConfig::future_candidate_weight);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From 3e18e4a5fcdb8b4927fe2825e5c781b9bd63a7c4 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 11:09:31 +0200
Subject: [PATCH 148/232] Fix oopsie

---
 .../sabre_router/include/sabre_router.hpp     |  9 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 77 ++++------------
 squander/synthesis/PartAM.py                  | 90 +++++--------------
 3 files changed, 41 insertions(+), 135 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index bdd002875..dcb762d75 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -99,7 +99,7 @@ struct SabreConfig {
     bool release_valve_enabled = true;
     int release_valve_threshold = 20;
     double path_tiebreak_weight = 0.2;
-    std::string future_cost_mode = "candidate_min";
+    std::string future_cost_mode = "canonical";
     int future_candidate_top_k = 4;
     double future_candidate_weight = 1.0;
 };
@@ -403,13 +403,6 @@ class SabreRouter {
         const std::vector<int>& pi
     ) const;
 
-    double future_partition_cost(
-        int partition_idx,
-        const CanonicalEntry* entry,
-        const std::vector<int>& pi,
-        bool reverse
-    ) const;
-
     // Immutable data members
     SabreConfig config_;
     int N_; // number of physical qubits
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 154618349..5318a3a03 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -368,9 +368,9 @@ NeighborInfo SabreRouter::build_neighbor_info(
         const auto& entry = it->second;
         std::vector<const CanonicalEntry::FutureVariant*> active_variants;
         if (!entry.variants.empty()) {
-            const size_t limit = (config_.future_cost_mode == "canonical")
-                ? std::min<size_t>(1, entry.variants.size())
-                : entry.variants.size();
+            const size_t limit = (config_.future_cost_mode == "topk_min")
+                ? entry.variants.size()
+                : std::min<size_t>(1, entry.variants.size());
             active_variants.reserve(limit);
             for (size_t i = 0; i < limit; i++) {
                 if (!entry.variants[i].edges_u.empty()) {
@@ -539,9 +539,9 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
         };
 
         if (!entry.variants.empty()) {
-            const size_t limit = (config_.future_cost_mode == "canonical")
-                ? std::min<size_t>(1, entry.variants.size())
-                : entry.variants.size();
+            const size_t limit = (config_.future_cost_mode == "topk_min")
+                ? entry.variants.size()
+                : std::min<size_t>(1, entry.variants.size());
             for (size_t i = 0; i < limit; i++) {
                 consider_variant(entry.variants[i]);
             }
@@ -1088,14 +1088,14 @@ double SabreRouter::entry_future_cost(
     const std::vector<int>& pi
 ) const {
     if (!entry.variants.empty()) {
-        const size_t limit = (config_.future_cost_mode == "canonical")
-            ? std::min<size_t>(1, entry.variants.size())
-            : entry.variants.size();
+        const size_t limit = (config_.future_cost_mode == "topk_min")
+            ? entry.variants.size()
+            : std::min<size_t>(1, entry.variants.size());
         double best = std::numeric_limits<double>::infinity();
         for (size_t i = 0; i < limit; i++) {
             const auto& variant = entry.variants[i];
             double cost = variant_routing_cost(variant, pi);
-            if (config_.future_cost_mode != "canonical") {
+            if (config_.future_cost_mode == "topk_min") {
                 cost += config_.future_candidate_weight
                     * config_.local_cost_weight
                     * static_cast<double>(variant.cnot);
@@ -1114,39 +1114,6 @@ double SabreRouter::entry_future_cost(
     return variant_routing_cost(primary, pi);
 }
 
-double SabreRouter::future_partition_cost(
-    int partition_idx,
-    const CanonicalEntry* entry,
-    const std::vector<int>& pi,
-    bool reverse
-) const {
-    if (
-        config_.future_cost_mode == "candidate_min"
-        && partition_idx >= 0
-        && partition_idx < static_cast<int>(candidate_cache_.size())
-        && !candidate_cache_[partition_idx].empty()
-    ) {
-        double best = std::numeric_limits<double>::infinity();
-        for (const auto& cand : candidate_cache_[partition_idx]) {
-            double cost = config_.swap_cost
-                * static_cast<double>(estimate_swap_count(cand, pi, reverse));
-            cost += config_.future_candidate_weight
-                * config_.local_cost_weight
-                * static_cast<double>(cand.cnot_count);
-            if (cost < best) {
-                best = cost;
-            }
-        }
-        if (std::isfinite(best)) {
-            return best;
-        }
-    }
-    if (entry == nullptr) {
-        return 0.0;
-    }
-    return entry_future_cost(*entry, pi);
-}
-
 double SabreRouter::compute_routing_cost(
     const std::vector<int>& pi,
     int exclude_partition_idx,
@@ -1233,21 +1200,17 @@ double SabreRouter::score_candidate(
     if (resolved_F) {
         for (const auto& re : *resolved_F) {
             if (re.partition_idx == cand_idx) continue;
+            if (!re.entry) continue;
             n_other++;
-            f_sum += future_partition_cost(
-                re.partition_idx, re.entry, output_perm, reverse
-            );
+            f_sum += entry_future_cost(*re.entry, output_perm);
         }
     } else {
         for (int p_idx : F_snapshot) {
             if (p_idx == cand_idx) continue;
             auto it = canonical_data.find(p_idx);
+            if (it == canonical_data.end()) continue;
             n_other++;
-            const CanonicalEntry* entry =
-                (it != canonical_data.end()) ? &it->second : nullptr;
-            f_sum += future_partition_cost(
-                p_idx, entry, output_perm, reverse
-            );
+            f_sum += entry_future_cost(it->second, output_perm);
         }
     }
     if (n_other > 0) score += f_sum / static_cast<double>(n_other);
@@ -1258,9 +1221,8 @@ double SabreRouter::score_candidate(
         if (resolved_E) {
             for (const auto& re : *resolved_E) {
                 if (re.partition_idx == cand_idx) continue;
-                e_sum += re.alpha * future_partition_cost(
-                    re.partition_idx, re.entry, output_perm, reverse
-                );
+                if (!re.entry) continue;
+                e_sum += re.alpha * entry_future_cost(*re.entry, output_perm);
             }
         } else {
             for (auto [p_idx, depth] : E) {
@@ -1269,11 +1231,8 @@ double SabreRouter::score_candidate(
                     ? alpha_weights_[depth]
                     : std::pow(config_.E_alpha, depth);
                 auto it = canonical_data.find(p_idx);
-                const CanonicalEntry* entry =
-                    (it != canonical_data.end()) ? &it->second : nullptr;
-                e_sum += alpha * future_partition_cost(
-                    p_idx, entry, output_perm, reverse
-                );
+                if (it == canonical_data.end()) continue;
+                e_sum += alpha * entry_future_cost(it->second, output_perm);
             }
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index c91f483d5..1de5ff12e 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -156,7 +156,7 @@ def __init__(self, config):
         self.config.setdefault('release_valve_enabled', True)
         self.config.setdefault('release_valve_threshold', 20)
         self.config.setdefault('path_tiebreak_weight', 0.2)
-        self.config.setdefault('future_cost_mode', 'candidate_min')
+        self.config.setdefault('future_cost_mode', 'canonical')
         self.config.setdefault('future_candidate_top_k', 4)
         self.config.setdefault('future_candidate_weight', 1.0)
         strategy = self.config['strategy']
@@ -932,7 +932,7 @@ def _run_layout_trials_cpp(
             'path_tiebreak_weight', 0.2
         )
         cfg.future_cost_mode = self.config.get(
-            'future_cost_mode', 'candidate_min'
+            'future_cost_mode', 'canonical'
         )
         cfg.future_candidate_top_k = self.config.get(
             'future_candidate_top_k', 4
@@ -1317,9 +1317,9 @@ def _bfs_shortest_path(self, src, dst):
     def _entry_variants(entry, future_cost_mode="canonical"):
         variants = entry.get("variants")
         if variants:
-            if future_cost_mode == "canonical":
-                return variants[:1]
-            return variants
+            if future_cost_mode == "topk_min":
+                return variants
+            return variants[:1]
         return (entry,)
 
     @staticmethod
@@ -1349,61 +1349,18 @@ def _entry_future_cost(
             route_cost = qgd_Partition_Aware_Mapping._variant_routing_cost(
                 variant, output_perm_arr, D_arr, swap_cost
             )
-            if future_cost_mode == "canonical":
-                variant_cost = route_cost
-            else:
+            if future_cost_mode == "topk_min":
                 variant_cost = route_cost + (
                     future_candidate_weight
                     * local_cost_weight
                     * variant["cnot"]
                 )
+            else:
+                variant_cost = route_cost
             if best is None or variant_cost < best:
                 best = variant_cost
         return 0.0 if best is None else best
 
-    @staticmethod
-    def _future_partition_cost(
-        partition_idx,
-        entry,
-        output_perm_arr,
-        D_arr,
-        swap_cost,
-        local_cost_weight,
-        reverse=False,
-        candidate_cache=None,
-        future_cost_mode="canonical",
-        future_candidate_weight=1.0,
-    ):
-        if future_cost_mode == "candidate_min" and candidate_cache is not None:
-            candidates = candidate_cache[partition_idx]
-            if candidates:
-                best = None
-                for candidate in candidates:
-                    candidate_cost = (
-                        swap_cost
-                        * candidate.estimate_swap_count(
-                            output_perm_arr, D_arr, reverse=reverse
-                        )
-                        + future_candidate_weight
-                        * local_cost_weight
-                        * candidate.cnot_count
-                    )
-                    if best is None or candidate_cost < best:
-                        best = candidate_cost
-                if best is not None:
-                    return float(best)
-        if entry is None:
-            return 0.0
-        return qgd_Partition_Aware_Mapping._entry_future_cost(
-            entry,
-            output_perm_arr,
-            D_arr,
-            swap_cost,
-            local_cost_weight,
-            future_cost_mode=future_cost_mode,
-            future_candidate_weight=future_candidate_weight,
-        )
-
     @staticmethod
     def _best_release_variant(entry, pi_arr, D_arr, future_cost_mode="canonical"):
         best = None
@@ -1614,7 +1571,7 @@ def Heuristic_Search(
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
         E_alpha = self.config.get("E_alpha", 0.9)
-        future_cost_mode = self.config.get("future_cost_mode", "candidate_min")
+        future_cost_mode = self.config.get("future_cost_mode", "canonical")
         future_candidate_weight = self.config.get(
             "future_candidate_weight", 1.0
         )
@@ -1694,7 +1651,6 @@ def Heuristic_Search(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
-                    candidate_cache=candidate_cache,
                 )
                 for partition_candidate in partition_candidates
             ]
@@ -1826,7 +1782,7 @@ def _heuristic_search_layout_only(
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
         E_alpha = self.config.get("E_alpha", 0.9)
-        future_cost_mode = self.config.get("future_cost_mode", "candidate_min")
+        future_cost_mode = self.config.get("future_cost_mode", "canonical")
         future_candidate_weight = self.config.get(
             "future_candidate_weight", 1.0
         )
@@ -1903,7 +1859,6 @@ def _heuristic_search_layout_only(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
-                    candidate_cache=candidate_cache,
                 )
                 for pc in partition_candidates
             ]
@@ -2017,9 +1972,11 @@ def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
         Additional variants keep distinct future edge patterns alive so the
         router can score a future partition by its best still-available option.
         """
-        future_cost_mode = self.config.get("future_cost_mode", "candidate_min")
-        top_k = 1 if future_cost_mode == "canonical" else max(
-            1, int(self.config.get("future_candidate_top_k", 4))
+        future_cost_mode = self.config.get("future_cost_mode", "canonical")
+        top_k = (
+            max(1, int(self.config.get("future_candidate_top_k", 4)))
+            if future_cost_mode == "topk_min"
+            else 1
         )
         data = {}
         for idx, partition in enumerate(scoring_partitions):
@@ -2081,8 +2038,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   local_cost_weight=0.1, swap_cost=15.0,
                                   path_tiebreak_weight=0.2, decay=None,
                                   future_cost_mode="canonical",
-                                  future_candidate_weight=1.0,
-                                  candidate_cache=None):
+                                  future_candidate_weight=1.0):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = swap_cost * |swaps|
@@ -2130,16 +2086,15 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             if partition_idx == cand_idx:
                 continue
             entry = canonical_data.get(partition_idx)
+            if entry is None:
+                continue
             n_other += 1
-            f_sum += qgd_Partition_Aware_Mapping._future_partition_cost(
-                partition_idx,
+            f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
                 entry,
                 output_perm_arr,
                 D_arr,
                 swap_cost,
                 local_cost_weight,
-                reverse=reverse,
-                candidate_cache=candidate_cache,
                 future_cost_mode=future_cost_mode,
                 future_candidate_weight=future_candidate_weight,
             )
@@ -2153,15 +2108,14 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 if partition_idx == cand_idx:
                     continue
                 entry = canonical_data.get(partition_idx)
-                d_cost = qgd_Partition_Aware_Mapping._future_partition_cost(
-                    partition_idx,
+                if entry is None:
+                    continue
+                d_cost = qgd_Partition_Aware_Mapping._entry_future_cost(
                     entry,
                     output_perm_arr,
                     D_arr,
                     swap_cost,
                     local_cost_weight,
-                    reverse=reverse,
-                    candidate_cache=candidate_cache,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
                 )

From ac798ba95b398c87731cb0164aabd81c93d8f35b Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 11:22:11 +0200
Subject: [PATCH 149/232] Refine PartAM routing heuristics

---
 .../sabre_router/include/sabre_router.hpp     |   7 +
 .../src-cpp/sabre_router/sabre_router.cpp     | 115 +++++++++++++-
 squander/synthesis/PartAM.py                  | 145 +++++++++++++++---
 3 files changed, 237 insertions(+), 30 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index dcb762d75..b67a6ed3b 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -403,6 +403,13 @@ class SabreRouter {
         const std::vector<int>& pi
     ) const;
 
+    double future_partition_cost_fullpas(
+        int partition_idx,
+        const std::vector<int>& pi,
+        bool reverse,
+        SwapCache* swap_cache
+    ) const;
+
     // Immutable data members
     SabreConfig config_;
     int N_; // number of physical qubits
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 5318a3a03..6caa1a6c6 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -368,7 +368,10 @@ NeighborInfo SabreRouter::build_neighbor_info(
         const auto& entry = it->second;
         std::vector<const CanonicalEntry::FutureVariant*> active_variants;
         if (!entry.variants.empty()) {
-            const size_t limit = (config_.future_cost_mode == "topk_min")
+            const size_t limit = (
+                config_.future_cost_mode == "topk_min"
+                || config_.future_cost_mode == "fullpas_min"
+            )
                 ? entry.variants.size()
                 : std::min<size_t>(1, entry.variants.size());
             active_variants.reserve(limit);
@@ -539,7 +542,10 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
         };
 
         if (!entry.variants.empty()) {
-            const size_t limit = (config_.future_cost_mode == "topk_min")
+            const size_t limit = (
+                config_.future_cost_mode == "topk_min"
+                || config_.future_cost_mode == "fullpas_min"
+            )
                 ? entry.variants.size()
                 : std::min<size_t>(1, entry.variants.size());
             for (size_t i = 0; i < limit; i++) {
@@ -1088,14 +1094,20 @@ double SabreRouter::entry_future_cost(
     const std::vector<int>& pi
 ) const {
     if (!entry.variants.empty()) {
-        const size_t limit = (config_.future_cost_mode == "topk_min")
+        const size_t limit = (
+            config_.future_cost_mode == "topk_min"
+            || config_.future_cost_mode == "fullpas_min"
+        )
             ? entry.variants.size()
             : std::min<size_t>(1, entry.variants.size());
         double best = std::numeric_limits<double>::infinity();
         for (size_t i = 0; i < limit; i++) {
             const auto& variant = entry.variants[i];
             double cost = variant_routing_cost(variant, pi);
-            if (config_.future_cost_mode == "topk_min") {
+            if (
+                config_.future_cost_mode == "topk_min"
+                || config_.future_cost_mode == "fullpas_min"
+            ) {
                 cost += config_.future_candidate_weight
                     * config_.local_cost_weight
                     * static_cast<double>(variant.cnot);
@@ -1114,6 +1126,69 @@ double SabreRouter::entry_future_cost(
     return variant_routing_cost(primary, pi);
 }
 
+double SabreRouter::future_partition_cost_fullpas(
+    int partition_idx,
+    const std::vector<int>& pi,
+    bool reverse,
+    SwapCache* swap_cache
+) const {
+    if (partition_idx < 0 || partition_idx >= static_cast<int>(candidate_cache_.size())) {
+        return 0.0;
+    }
+    const auto& candidates_all = candidate_cache_[partition_idx];
+    if (candidates_all.empty()) {
+        return 0.0;
+    }
+
+    const int top_k = std::max(1, config_.future_candidate_top_k);
+    std::vector<const CandidateData*> candidates;
+    candidates.reserve(candidates_all.size());
+    for (const auto& cand : candidates_all) {
+        candidates.push_back(&cand);
+    }
+
+    if (static_cast<int>(candidates.size()) > top_k) {
+        using Pair = std::pair<double, const CandidateData*>;
+        std::vector<Pair> estimated;
+        estimated.reserve(candidates.size());
+        for (const auto* cand : candidates) {
+            double est = estimate_swap_count(*cand, pi, reverse) * config_.swap_cost
+                         + config_.future_candidate_weight
+                           * config_.local_cost_weight
+                           * static_cast<double>(cand->cnot_count);
+            estimated.push_back({est, cand});
+        }
+        std::nth_element(
+            estimated.begin(),
+            estimated.begin() + (top_k - 1),
+            estimated.end(),
+            [](const Pair& a, const Pair& b) {
+                return a.first < b.first;
+            }
+        );
+        candidates.clear();
+        candidates.reserve(top_k);
+        for (int i = 0; i < top_k; i++) {
+            candidates.push_back(estimated[i].second);
+        }
+    }
+
+    double best = std::numeric_limits<double>::infinity();
+    for (const auto* cand : candidates) {
+        auto [swaps, pi_out] = transform_pi(*cand, pi, reverse, swap_cache, nullptr);
+        (void)pi_out;
+        double cost = config_.swap_cost * static_cast<double>(swaps.size())
+                    + config_.future_candidate_weight
+                      * config_.local_cost_weight
+                      * static_cast<double>(cand->cnot_count);
+        if (cost < best) {
+            best = cost;
+        }
+    }
+
+    return std::isfinite(best) ? best : 0.0;
+}
+
 double SabreRouter::compute_routing_cost(
     const std::vector<int>& pi,
     int exclude_partition_idx,
@@ -1202,7 +1277,13 @@ double SabreRouter::score_candidate(
             if (re.partition_idx == cand_idx) continue;
             if (!re.entry) continue;
             n_other++;
-            f_sum += entry_future_cost(*re.entry, output_perm);
+            if (config_.future_cost_mode == "fullpas_min") {
+                f_sum += future_partition_cost_fullpas(
+                    re.partition_idx, output_perm, reverse, swap_cache
+                );
+            } else {
+                f_sum += entry_future_cost(*re.entry, output_perm);
+            }
         }
     } else {
         for (int p_idx : F_snapshot) {
@@ -1210,7 +1291,13 @@ double SabreRouter::score_candidate(
             auto it = canonical_data.find(p_idx);
             if (it == canonical_data.end()) continue;
             n_other++;
-            f_sum += entry_future_cost(it->second, output_perm);
+            if (config_.future_cost_mode == "fullpas_min") {
+                f_sum += future_partition_cost_fullpas(
+                    p_idx, output_perm, reverse, swap_cache
+                );
+            } else {
+                f_sum += entry_future_cost(it->second, output_perm);
+            }
         }
     }
     if (n_other > 0) score += f_sum / static_cast<double>(n_other);
@@ -1222,7 +1309,13 @@ double SabreRouter::score_candidate(
             for (const auto& re : *resolved_E) {
                 if (re.partition_idx == cand_idx) continue;
                 if (!re.entry) continue;
-                e_sum += re.alpha * entry_future_cost(*re.entry, output_perm);
+                if (config_.future_cost_mode == "fullpas_min") {
+                    e_sum += re.alpha * future_partition_cost_fullpas(
+                        re.partition_idx, output_perm, reverse, swap_cache
+                    );
+                } else {
+                    e_sum += re.alpha * entry_future_cost(*re.entry, output_perm);
+                }
             }
         } else {
             for (auto [p_idx, depth] : E) {
@@ -1232,7 +1325,13 @@ double SabreRouter::score_candidate(
                     : std::pow(config_.E_alpha, depth);
                 auto it = canonical_data.find(p_idx);
                 if (it == canonical_data.end()) continue;
-                e_sum += alpha * entry_future_cost(it->second, output_perm);
+                if (config_.future_cost_mode == "fullpas_min") {
+                    e_sum += alpha * future_partition_cost_fullpas(
+                        p_idx, output_perm, reverse, swap_cache
+                    );
+                } else {
+                    e_sum += alpha * entry_future_cost(it->second, output_perm);
+                }
             }
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 1de5ff12e..6ff03d6fc 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1317,7 +1317,7 @@ def _bfs_shortest_path(self, src, dst):
     def _entry_variants(entry, future_cost_mode="canonical"):
         variants = entry.get("variants")
         if variants:
-            if future_cost_mode == "topk_min":
+            if future_cost_mode in ("topk_min", "fullpas_min"):
                 return variants
             return variants[:1]
         return (entry,)
@@ -1349,7 +1349,7 @@ def _entry_future_cost(
             route_cost = qgd_Partition_Aware_Mapping._variant_routing_cost(
                 variant, output_perm_arr, D_arr, swap_cost
             )
-            if future_cost_mode == "topk_min":
+            if future_cost_mode in ("topk_min", "fullpas_min"):
                 variant_cost = route_cost + (
                     future_candidate_weight
                     * local_cost_weight
@@ -1361,6 +1361,65 @@ def _entry_future_cost(
                 best = variant_cost
         return 0.0 if best is None else best
 
+    @staticmethod
+    def _future_partition_cost_fullpas(
+        partition_idx,
+        output_perm_arr,
+        D,
+        swap_cache,
+        adj=None,
+        reverse=False,
+        candidate_cache=None,
+        future_candidate_top_k=4,
+        local_cost_weight=0.1,
+        swap_cost=15.0,
+        future_candidate_weight=1.0,
+    ):
+        if candidate_cache is None:
+            return 0.0
+        candidates = candidate_cache[partition_idx]
+        if not candidates:
+            return 0.0
+
+        candidates = list(candidates)
+        top_k = max(1, int(future_candidate_top_k))
+        if len(candidates) > top_k:
+            estimates = np.array(
+                [
+                    candidate.estimate_swap_count(
+                        output_perm_arr, D, reverse=reverse
+                    )
+                    * swap_cost
+                    + future_candidate_weight
+                    * local_cost_weight
+                    * candidate.cnot_count
+                    for candidate in candidates
+                ],
+                dtype=float,
+            )
+            idx = np.argpartition(estimates, top_k - 1)[:top_k]
+            candidates = [candidates[i] for i in idx]
+
+        best = None
+        for candidate in candidates:
+            swaps, _ = candidate.transform_pi(
+                output_perm_arr,
+                D,
+                swap_cache,
+                reverse=reverse,
+                adj=adj,
+                neighbor_info=None,
+            )
+            cost = (
+                swap_cost * len(swaps)
+                + future_candidate_weight
+                * local_cost_weight
+                * candidate.cnot_count
+            )
+            if best is None or cost < best:
+                best = cost
+        return 0.0 if best is None else float(best)
+
     @staticmethod
     def _best_release_variant(entry, pi_arr, D_arr, future_cost_mode="canonical"):
         best = None
@@ -1575,6 +1634,9 @@ def Heuristic_Search(
         future_candidate_weight = self.config.get(
             "future_candidate_weight", 1.0
         )
+        future_candidate_top_k = self.config.get(
+            "future_candidate_top_k", 4
+        )
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
@@ -1651,6 +1713,8 @@ def Heuristic_Search(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
+                    future_candidate_top_k=future_candidate_top_k,
+                    candidate_cache=candidate_cache,
                 )
                 for partition_candidate in partition_candidates
             ]
@@ -1786,6 +1850,9 @@ def _heuristic_search_layout_only(
         future_candidate_weight = self.config.get(
             "future_candidate_weight", 1.0
         )
+        future_candidate_top_k = self.config.get(
+            "future_candidate_top_k", 4
+        )
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=reverse
@@ -1859,6 +1926,8 @@ def _heuristic_search_layout_only(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
+                    future_candidate_top_k=future_candidate_top_k,
+                    candidate_cache=candidate_cache,
                 )
                 for pc in partition_candidates
             ]
@@ -1975,7 +2044,7 @@ def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
         future_cost_mode = self.config.get("future_cost_mode", "canonical")
         top_k = (
             max(1, int(self.config.get("future_candidate_top_k", 4)))
-            if future_cost_mode == "topk_min"
+            if future_cost_mode in ("topk_min", "fullpas_min")
             else 1
         )
         data = {}
@@ -2038,7 +2107,9 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   local_cost_weight=0.1, swap_cost=15.0,
                                   path_tiebreak_weight=0.2, decay=None,
                                   future_cost_mode="canonical",
-                                  future_candidate_weight=1.0):
+                                  future_candidate_weight=1.0,
+                                  future_candidate_top_k=4,
+                                  candidate_cache=None):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = swap_cost * |swaps|
@@ -2089,15 +2160,30 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             if entry is None:
                 continue
             n_other += 1
-            f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
-                entry,
-                output_perm_arr,
-                D_arr,
-                swap_cost,
-                local_cost_weight,
-                future_cost_mode=future_cost_mode,
-                future_candidate_weight=future_candidate_weight,
-            )
+            if future_cost_mode == "fullpas_min":
+                f_sum += qgd_Partition_Aware_Mapping._future_partition_cost_fullpas(
+                    partition_idx,
+                    output_perm_arr,
+                    D,
+                    swap_cache,
+                    adj=adj,
+                    reverse=reverse,
+                    candidate_cache=candidate_cache,
+                    future_candidate_top_k=future_candidate_top_k,
+                    local_cost_weight=local_cost_weight,
+                    swap_cost=swap_cost,
+                    future_candidate_weight=future_candidate_weight,
+                )
+            else:
+                f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
+                    entry,
+                    output_perm_arr,
+                    D_arr,
+                    swap_cost,
+                    local_cost_weight,
+                    future_cost_mode=future_cost_mode,
+                    future_candidate_weight=future_candidate_weight,
+                )
         if n_other > 0:
             score += f_sum / n_other
 
@@ -2110,15 +2196,30 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 entry = canonical_data.get(partition_idx)
                 if entry is None:
                     continue
-                d_cost = qgd_Partition_Aware_Mapping._entry_future_cost(
-                    entry,
-                    output_perm_arr,
-                    D_arr,
-                    swap_cost,
-                    local_cost_weight,
-                    future_cost_mode=future_cost_mode,
-                    future_candidate_weight=future_candidate_weight,
-                )
+                if future_cost_mode == "fullpas_min":
+                    d_cost = qgd_Partition_Aware_Mapping._future_partition_cost_fullpas(
+                        partition_idx,
+                        output_perm_arr,
+                        D,
+                        swap_cache,
+                        adj=adj,
+                        reverse=reverse,
+                        candidate_cache=candidate_cache,
+                        future_candidate_top_k=future_candidate_top_k,
+                        local_cost_weight=local_cost_weight,
+                        swap_cost=swap_cost,
+                        future_candidate_weight=future_candidate_weight,
+                    )
+                else:
+                    d_cost = qgd_Partition_Aware_Mapping._entry_future_cost(
+                        entry,
+                        output_perm_arr,
+                        D_arr,
+                        swap_cost,
+                        local_cost_weight,
+                        future_cost_mode=future_cost_mode,
+                        future_candidate_weight=future_candidate_weight,
+                    )
                 e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
 

From 29f28007dc0ac6148454ae0505b99bb17d886bfe Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 13:28:51 +0200
Subject: [PATCH 150/232] fix partitioning

---
 .../sabre_router/include/sabre_router.hpp     |  11 +-
 .../src-cpp/sabre_router/sabre_router.cpp     |  91 +++++++++-----
 squander/synthesis/PartAM.py                  | 114 ++++++++++++------
 3 files changed, 149 insertions(+), 67 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index b67a6ed3b..cffe629d9 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -106,7 +106,7 @@ struct SabreConfig {
 
 struct TrialResult {
     std::vector<int> pi;
-    int total_cost;
+    double total_cost;
 };
 
 struct NeighborEdge {
@@ -212,7 +212,7 @@ class SabreRouter {
 
     // Heuristic search (port of _heuristic_search_layout_only)
     // children_graph/parents_graph: swapped for backward passes
-    std::pair<std::vector<int>, int> heuristic_search(
+    std::pair<std::vector<int>, double> heuristic_search(
         const std::vector<int>& F_init,
         std::vector<int> pi,
         bool reverse,
@@ -296,6 +296,13 @@ class SabreRouter {
         const std::vector<double>& decay
     ) const;
 
+    double routing_objective(
+        double route_cost,
+        int cnot_count,
+        double cnot_weight = 1.0,
+        double decay_factor = 1.0
+    ) const;
+
     void apply_decay_for_swaps(
         const std::vector<std::pair<int,int>>& swaps,
         std::vector<double>& decay
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 6caa1a6c6..6b3c04bf6 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -431,6 +431,18 @@ double SabreRouter::decay_factor_for_swaps(
     return factor;
 }
 
+double SabreRouter::routing_objective(
+    double route_cost,
+    int cnot_count,
+    double cnot_weight,
+    double decay_factor
+) const {
+    return decay_factor * (
+        route_cost
+        + cnot_weight * config_.local_cost_weight * static_cast<double>(cnot_count)
+    );
+}
+
 void SabreRouter::apply_decay_for_swaps(
     const std::vector<std::pair<int,int>>& swaps,
     std::vector<double>& decay
@@ -1103,15 +1115,17 @@ double SabreRouter::entry_future_cost(
         double best = std::numeric_limits<double>::infinity();
         for (size_t i = 0; i < limit; i++) {
             const auto& variant = entry.variants[i];
-            double cost = variant_routing_cost(variant, pi);
-            if (
+            const double cnot_weight = (
                 config_.future_cost_mode == "topk_min"
                 || config_.future_cost_mode == "fullpas_min"
-            ) {
-                cost += config_.future_candidate_weight
-                    * config_.local_cost_weight
-                    * static_cast<double>(variant.cnot);
-            }
+            )
+                ? config_.future_candidate_weight
+                : 0.0;
+            const double cost = routing_objective(
+                variant_routing_cost(variant, pi),
+                variant.cnot,
+                cnot_weight
+            );
             if (cost < best) {
                 best = cost;
             }
@@ -1123,7 +1137,7 @@ double SabreRouter::entry_future_cost(
     primary.edges_u = entry.edges_u;
     primary.edges_v = entry.edges_v;
     primary.cnot = entry.cnot;
-    return variant_routing_cost(primary, pi);
+    return routing_objective(variant_routing_cost(primary, pi), primary.cnot, 0.0);
 }
 
 double SabreRouter::future_partition_cost_fullpas(
@@ -1152,10 +1166,11 @@ double SabreRouter::future_partition_cost_fullpas(
         std::vector<Pair> estimated;
         estimated.reserve(candidates.size());
         for (const auto* cand : candidates) {
-            double est = estimate_swap_count(*cand, pi, reverse) * config_.swap_cost
-                         + config_.future_candidate_weight
-                           * config_.local_cost_weight
-                           * static_cast<double>(cand->cnot_count);
+            const double est = routing_objective(
+                estimate_swap_count(*cand, pi, reverse) * config_.swap_cost,
+                cand->cnot_count,
+                config_.future_candidate_weight
+            );
             estimated.push_back({est, cand});
         }
         std::nth_element(
@@ -1177,10 +1192,11 @@ double SabreRouter::future_partition_cost_fullpas(
     for (const auto* cand : candidates) {
         auto [swaps, pi_out] = transform_pi(*cand, pi, reverse, swap_cache, nullptr);
         (void)pi_out;
-        double cost = config_.swap_cost * static_cast<double>(swaps.size())
-                    + config_.future_candidate_weight
-                      * config_.local_cost_weight
-                      * static_cast<double>(cand->cnot_count);
+        const double cost = routing_objective(
+            config_.swap_cost * static_cast<double>(swaps.size()),
+            cand->cnot_count,
+            config_.future_candidate_weight
+        );
         if (cost < best) {
             best = cost;
         }
@@ -1262,11 +1278,16 @@ double SabreRouter::score_candidate(
         neighbor_ptr
     );
 
-    double score = config_.swap_cost * static_cast<double>(swaps.size());
-    score += config_.local_cost_weight * static_cast<double>(cand.cnot_count);
+    double decay_factor = 1.0;
     if (decay != nullptr && !swaps.empty()) {
-        score *= decay_factor_for_swaps(swaps, *decay);
+        decay_factor = decay_factor_for_swaps(swaps, *decay);
     }
+    double score = routing_objective(
+        config_.swap_cost * static_cast<double>(swaps.size()),
+        cand.cnot_count,
+        1.0,
+        decay_factor
+    );
 
     // F cost: average routing cost over F \ {cand}
     const int cand_idx = cand.partition_idx;
@@ -1376,8 +1397,10 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     std::vector<Pair> estimated;
     estimated.reserve(candidates.size());
     for (const auto* cand : candidates) {
-        double est = estimate_swap_count(*cand, pi, reverse) * config_.swap_cost
-                     + config_.local_cost_weight * cand->cnot_count;
+        const double est = routing_objective(
+            estimate_swap_count(*cand, pi, reverse) * config_.swap_cost,
+            cand->cnot_count
+        );
         estimated.push_back({est, cand});
     }
 
@@ -1437,7 +1460,7 @@ const CandidateData& SabreRouter::select_best_candidate(
 // heuristic_search (main loop)
 // ---------------------------------------------------------------------------
 
-std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
+std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
     const std::vector<int>& F_init,
     std::vector<int> pi,
     bool reverse,
@@ -1450,7 +1473,7 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
     std::vector<int> queue;
     std::vector<uint8_t> resolved(num_partitions_, 0);
     std::vector<uint8_t> in_F(num_partitions_, 0);
-    int total_cost = 0;
+    double total_cost = 0.0;
 
     // Split F_init into F (multi-qubit) and queue (single-qubit)
     for (int p : F_init) {
@@ -1506,8 +1529,13 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
                 canonical_data
             );
             if (!valve_swaps.empty()) {
-                total_cost += config_.trial_swap_cnot_cost
-                    * static_cast<int>(valve_swaps.size());
+                total_cost += routing_objective(
+                    config_.swap_cost
+                        * static_cast<double>(valve_swaps.size()),
+                    0,
+                    1.0,
+                    decay_factor_for_swaps(valve_swaps, decay)
+                );
                 apply_decay_for_swaps(valve_swaps, decay);
                 pi = std::move(pi_bridged);
                 swap_burst = 0;
@@ -1592,8 +1620,15 @@ std::pair<std::vector<int>, int> SabreRouter::heuristic_search(
         // because exclude_partition_idx == best.partition_idx in both cases)
         std::vector<std::pair<int,int>> swaps = std::move(cached_swaps[best_ci]);
         std::vector<int> pi_new = std::move(cached_pi[best_ci]);
-        total_cost += config_.trial_swap_cnot_cost * static_cast<int>(swaps.size())
-                      + best.cnot_count;
+        const double decay_factor = swaps.empty()
+            ? 1.0
+            : decay_factor_for_swaps(swaps, decay);
+        total_cost += routing_objective(
+            config_.swap_cost * static_cast<double>(swaps.size()),
+            best.cnot_count,
+            1.0,
+            decay_factor
+        );
         pi = std::move(pi_new);
         apply_decay_for_swaps(swaps, decay);
         if (swaps.empty()) {
@@ -1694,7 +1729,7 @@ TrialResult SabreRouter::run_trial(
 
     // Final evaluation pass (deterministic, no RNG)
     auto eval_result = heuristic_search(F_fwd, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_); // Evaluates cost using a copy under the hood
-    int cost = eval_result.second;
+    double cost = eval_result.second;
 
     return TrialResult{std::move(pi), cost}; // Return the pi from AFTER the backward pass, BEFORE the eval pass
 }
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6ff03d6fc..57376b56b 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -483,13 +483,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
-        # 2-qubit partitions are free (weight 0) since they are trivially
-        # synthesized as themselves; 3+ qubit partitions cost 1.
-        weights = [
-            0 if len({q for gate in part for q in gate_to_qubit[gate]}) == 2 else 1
-            for part in allparts
-        ]
-        L_parts, _ = ilp_global_optimal(allparts, g, weights=weights)
+        # Minimize total partition count so PAM gets the largest blocks possible
+        # under max_partition_size. Larger blocks = more (P_i, P_o) freedom to
+        # absorb routing SWAPs.
+        L_parts, _ = ilp_global_optimal(allparts, g)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----
         selected_surrounded_starts = set()
@@ -1203,7 +1200,11 @@ def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=Fals
         local_cost_weight = self.config.get('local_cost_weight', 0.1)
         swap_cost = self.config.get('swap_cost', 15.0)
         estimates = np.array([
-            pc.estimate_swap_count(pi, D, reverse=reverse) * swap_cost + local_cost_weight * pc.cnot_count
+            self._routing_objective(
+                pc.estimate_swap_count(pi, D, reverse=reverse) * swap_cost,
+                pc.cnot_count,
+                local_cost_weight,
+            )
             for pc in partition_candidates
         ])
         top_k_indices = np.argpartition(estimates, top_k)[:top_k]
@@ -1215,6 +1216,19 @@ def _decay_factor_for_swaps(swaps, decay):
             return 1.0
         return max(max(decay[u], decay[v]) for u, v in swaps)
 
+    @staticmethod
+    def _routing_objective(
+        route_cost,
+        cnot_count,
+        local_cost_weight,
+        cnot_weight=1.0,
+        decay_factor=1.0,
+    ):
+        return decay_factor * (
+            float(route_cost)
+            + cnot_weight * local_cost_weight * float(cnot_count)
+        )
+
     def _apply_decay_for_swaps(self, swaps, decay):
         delta = self.config.get("decay_delta", 0.1)
         if delta <= 0:
@@ -1349,14 +1363,17 @@ def _entry_future_cost(
             route_cost = qgd_Partition_Aware_Mapping._variant_routing_cost(
                 variant, output_perm_arr, D_arr, swap_cost
             )
-            if future_cost_mode in ("topk_min", "fullpas_min"):
-                variant_cost = route_cost + (
-                    future_candidate_weight
-                    * local_cost_weight
-                    * variant["cnot"]
-                )
-            else:
-                variant_cost = route_cost
+            cnot_weight = (
+                future_candidate_weight
+                if future_cost_mode in ("topk_min", "fullpas_min")
+                else 0.0
+            )
+            variant_cost = qgd_Partition_Aware_Mapping._routing_objective(
+                route_cost,
+                variant["cnot"],
+                local_cost_weight,
+                cnot_weight=cnot_weight,
+            )
             if best is None or variant_cost < best:
                 best = variant_cost
         return 0.0 if best is None else best
@@ -1386,13 +1403,15 @@ def _future_partition_cost_fullpas(
         if len(candidates) > top_k:
             estimates = np.array(
                 [
-                    candidate.estimate_swap_count(
-                        output_perm_arr, D, reverse=reverse
+                    qgd_Partition_Aware_Mapping._routing_objective(
+                        candidate.estimate_swap_count(
+                            output_perm_arr, D, reverse=reverse
+                        )
+                        * swap_cost,
+                        candidate.cnot_count,
+                        local_cost_weight,
+                        cnot_weight=future_candidate_weight,
                     )
-                    * swap_cost
-                    + future_candidate_weight
-                    * local_cost_weight
-                    * candidate.cnot_count
                     for candidate in candidates
                 ],
                 dtype=float,
@@ -1410,11 +1429,11 @@ def _future_partition_cost_fullpas(
                 adj=adj,
                 neighbor_info=None,
             )
-            cost = (
-                swap_cost * len(swaps)
-                + future_candidate_weight
-                * local_cost_weight
-                * candidate.cnot_count
+            cost = qgd_Partition_Aware_Mapping._routing_objective(
+                swap_cost * len(swaps),
+                candidate.cnot_count,
+                local_cost_weight,
+                cnot_weight=future_candidate_weight,
             )
             if best is None or cost < best:
                 best = cost
@@ -1814,14 +1833,13 @@ def _heuristic_search_layout_only(
                     updates (used for backward passes in SABRE iterations).
 
         Returns:
-            (pi, total_cost): final layout and estimated routed CNOT cost.
-            The online heuristic still uses ``swap_cost`` for lookahead pressure;
-            this accounting is only used to rank completed layout trials.
+            (pi, total_cost): final layout and heuristic trial score.
+            Trial ranking uses the same immediate routing objective as the
+            online scorer: weighted SWAP pressure plus weighted local CNOT cost.
         """
         F = list(F)
         resolved_partitions = [False] * len(DAG)
-        total_cost = 0
-        swap_cnot_cost = self.config.get("trial_swap_cnot_cost", 3)
+        total_cost = 0.0
 
         queue = deque(
             p for p in F if self._partition_is_single(optimized_partitions[p])
@@ -1846,6 +1864,8 @@ def _heuristic_search_layout_only(
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
         E_alpha = self.config.get("E_alpha", 0.9)
+        local_cost_weight = self.config.get("local_cost_weight", 0.1)
+        swap_cost = self.config.get("swap_cost", 15.0)
         future_cost_mode = self.config.get("future_cost_mode", "canonical")
         future_candidate_weight = self.config.get(
             "future_candidate_weight", 1.0
@@ -1874,7 +1894,14 @@ def _heuristic_search_layout_only(
                     future_cost_mode=future_cost_mode,
                 )
                 if valve_swaps:
-                    total_cost += swap_cnot_cost * len(valve_swaps)
+                    total_cost += self._routing_objective(
+                        swap_cost * len(valve_swaps),
+                        0,
+                        local_cost_weight,
+                        decay_factor=self._decay_factor_for_swaps(
+                            valve_swaps, decay
+                        ),
+                    )
                     self._apply_decay_for_swaps(valve_swaps, decay)
                     swap_burst = 0
                     continue
@@ -1957,7 +1984,15 @@ def _heuristic_search_layout_only(
                 adj=self._adj,
                 neighbor_info=best_neighbor_info,
             )
-            total_cost += swap_cnot_cost * len(swaps) + best.cnot_count
+            decay_factor = 1.0
+            if swaps:
+                decay_factor = self._decay_factor_for_swaps(swaps, decay)
+            total_cost += self._routing_objective(
+                swap_cost * len(swaps),
+                best.cnot_count,
+                local_cost_weight,
+                decay_factor=decay_factor,
+            )
             if swaps:
                 self._apply_decay_for_swaps(swaps, decay)
                 swap_burst += len(swaps)
@@ -2136,12 +2171,17 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             adj=adj,
             neighbor_info=neighbor_info,
         )
-        score = swap_cost * len(swaps)
-        score += local_cost_weight * partition_candidate.cnot_count
+        decay_factor = 1.0
         if decay is not None and swaps:
-            score *= qgd_Partition_Aware_Mapping._decay_factor_for_swaps(
+            decay_factor = qgd_Partition_Aware_Mapping._decay_factor_for_swaps(
                 swaps, decay
             )
+        score = qgd_Partition_Aware_Mapping._routing_objective(
+            swap_cost * len(swaps),
+            partition_candidate.cnot_count,
+            local_cost_weight,
+            decay_factor=decay_factor,
+        )
 
         if canonical_data is None:
             return score

From 5e7c29be87a85b4fd9bfa8655b51ebf454886e6d Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 13:55:48 +0200
Subject: [PATCH 151/232] Fix remove fullpass config

---
 .../sabre_router/include/sabre_router.hpp     |   7 -
 .../src-cpp/sabre_router/sabre_router.cpp     | 101 +-----------
 squander/synthesis/PartAM.py                  | 147 +++---------------
 3 files changed, 26 insertions(+), 229 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index cffe629d9..7a95bbe8b 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -410,13 +410,6 @@ class SabreRouter {
         const std::vector<int>& pi
     ) const;
 
-    double future_partition_cost_fullpas(
-        int partition_idx,
-        const std::vector<int>& pi,
-        bool reverse,
-        SwapCache* swap_cache
-    ) const;
-
     // Immutable data members
     SabreConfig config_;
     int N_; // number of physical qubits
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 6b3c04bf6..dd64c688f 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -370,7 +370,6 @@ NeighborInfo SabreRouter::build_neighbor_info(
         if (!entry.variants.empty()) {
             const size_t limit = (
                 config_.future_cost_mode == "topk_min"
-                || config_.future_cost_mode == "fullpas_min"
             )
                 ? entry.variants.size()
                 : std::min<size_t>(1, entry.variants.size());
@@ -556,7 +555,6 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
         if (!entry.variants.empty()) {
             const size_t limit = (
                 config_.future_cost_mode == "topk_min"
-                || config_.future_cost_mode == "fullpas_min"
             )
                 ? entry.variants.size()
                 : std::min<size_t>(1, entry.variants.size());
@@ -1108,7 +1106,6 @@ double SabreRouter::entry_future_cost(
     if (!entry.variants.empty()) {
         const size_t limit = (
             config_.future_cost_mode == "topk_min"
-            || config_.future_cost_mode == "fullpas_min"
         )
             ? entry.variants.size()
             : std::min<size_t>(1, entry.variants.size());
@@ -1117,7 +1114,6 @@ double SabreRouter::entry_future_cost(
             const auto& variant = entry.variants[i];
             const double cnot_weight = (
                 config_.future_cost_mode == "topk_min"
-                || config_.future_cost_mode == "fullpas_min"
             )
                 ? config_.future_candidate_weight
                 : 0.0;
@@ -1140,71 +1136,6 @@ double SabreRouter::entry_future_cost(
     return routing_objective(variant_routing_cost(primary, pi), primary.cnot, 0.0);
 }
 
-double SabreRouter::future_partition_cost_fullpas(
-    int partition_idx,
-    const std::vector<int>& pi,
-    bool reverse,
-    SwapCache* swap_cache
-) const {
-    if (partition_idx < 0 || partition_idx >= static_cast<int>(candidate_cache_.size())) {
-        return 0.0;
-    }
-    const auto& candidates_all = candidate_cache_[partition_idx];
-    if (candidates_all.empty()) {
-        return 0.0;
-    }
-
-    const int top_k = std::max(1, config_.future_candidate_top_k);
-    std::vector<const CandidateData*> candidates;
-    candidates.reserve(candidates_all.size());
-    for (const auto& cand : candidates_all) {
-        candidates.push_back(&cand);
-    }
-
-    if (static_cast<int>(candidates.size()) > top_k) {
-        using Pair = std::pair<double, const CandidateData*>;
-        std::vector<Pair> estimated;
-        estimated.reserve(candidates.size());
-        for (const auto* cand : candidates) {
-            const double est = routing_objective(
-                estimate_swap_count(*cand, pi, reverse) * config_.swap_cost,
-                cand->cnot_count,
-                config_.future_candidate_weight
-            );
-            estimated.push_back({est, cand});
-        }
-        std::nth_element(
-            estimated.begin(),
-            estimated.begin() + (top_k - 1),
-            estimated.end(),
-            [](const Pair& a, const Pair& b) {
-                return a.first < b.first;
-            }
-        );
-        candidates.clear();
-        candidates.reserve(top_k);
-        for (int i = 0; i < top_k; i++) {
-            candidates.push_back(estimated[i].second);
-        }
-    }
-
-    double best = std::numeric_limits<double>::infinity();
-    for (const auto* cand : candidates) {
-        auto [swaps, pi_out] = transform_pi(*cand, pi, reverse, swap_cache, nullptr);
-        (void)pi_out;
-        const double cost = routing_objective(
-            config_.swap_cost * static_cast<double>(swaps.size()),
-            cand->cnot_count,
-            config_.future_candidate_weight
-        );
-        if (cost < best) {
-            best = cost;
-        }
-    }
-
-    return std::isfinite(best) ? best : 0.0;
-}
-
 double SabreRouter::compute_routing_cost(
     const std::vector<int>& pi,
     int exclude_partition_idx,
@@ -1298,13 +1229,7 @@ double SabreRouter::score_candidate(
             if (re.partition_idx == cand_idx) continue;
             if (!re.entry) continue;
             n_other++;
-            if (config_.future_cost_mode == "fullpas_min") {
-                f_sum += future_partition_cost_fullpas(
-                    re.partition_idx, output_perm, reverse, swap_cache
-                );
-            } else {
-                f_sum += entry_future_cost(*re.entry, output_perm);
-            }
+            f_sum += entry_future_cost(*re.entry, output_perm);
         }
     } else {
         for (int p_idx : F_snapshot) {
@@ -1312,13 +1237,7 @@ double SabreRouter::score_candidate(
             auto it = canonical_data.find(p_idx);
             if (it == canonical_data.end()) continue;
             n_other++;
-            if (config_.future_cost_mode == "fullpas_min") {
-                f_sum += future_partition_cost_fullpas(
-                    p_idx, output_perm, reverse, swap_cache
-                );
-            } else {
-                f_sum += entry_future_cost(it->second, output_perm);
-            }
+            f_sum += entry_future_cost(it->second, output_perm);
         }
     }
     if (n_other > 0) score += f_sum / static_cast<double>(n_other);
@@ -1330,13 +1249,7 @@ double SabreRouter::score_candidate(
             for (const auto& re : *resolved_E) {
                 if (re.partition_idx == cand_idx) continue;
                 if (!re.entry) continue;
-                if (config_.future_cost_mode == "fullpas_min") {
-                    e_sum += re.alpha * future_partition_cost_fullpas(
-                        re.partition_idx, output_perm, reverse, swap_cache
-                    );
-                } else {
-                    e_sum += re.alpha * entry_future_cost(*re.entry, output_perm);
-                }
+                e_sum += re.alpha * entry_future_cost(*re.entry, output_perm);
             }
         } else {
             for (auto [p_idx, depth] : E) {
@@ -1346,13 +1259,7 @@ double SabreRouter::score_candidate(
                     : std::pow(config_.E_alpha, depth);
                 auto it = canonical_data.find(p_idx);
                 if (it == canonical_data.end()) continue;
-                if (config_.future_cost_mode == "fullpas_min") {
-                    e_sum += alpha * future_partition_cost_fullpas(
-                        p_idx, output_perm, reverse, swap_cache
-                    );
-                } else {
-                    e_sum += alpha * entry_future_cost(it->second, output_perm);
-                }
+                e_sum += alpha * entry_future_cost(it->second, output_perm);
             }
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 57376b56b..858d7da3d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1331,7 +1331,7 @@ def _bfs_shortest_path(self, src, dst):
     def _entry_variants(entry, future_cost_mode="canonical"):
         variants = entry.get("variants")
         if variants:
-            if future_cost_mode in ("topk_min", "fullpas_min"):
+            if future_cost_mode == "topk_min":
                 return variants
             return variants[:1]
         return (entry,)
@@ -1365,7 +1365,7 @@ def _entry_future_cost(
             )
             cnot_weight = (
                 future_candidate_weight
-                if future_cost_mode in ("topk_min", "fullpas_min")
+                if future_cost_mode == "topk_min"
                 else 0.0
             )
             variant_cost = qgd_Partition_Aware_Mapping._routing_objective(
@@ -1378,67 +1378,6 @@ def _entry_future_cost(
                 best = variant_cost
         return 0.0 if best is None else best
 
-    @staticmethod
-    def _future_partition_cost_fullpas(
-        partition_idx,
-        output_perm_arr,
-        D,
-        swap_cache,
-        adj=None,
-        reverse=False,
-        candidate_cache=None,
-        future_candidate_top_k=4,
-        local_cost_weight=0.1,
-        swap_cost=15.0,
-        future_candidate_weight=1.0,
-    ):
-        if candidate_cache is None:
-            return 0.0
-        candidates = candidate_cache[partition_idx]
-        if not candidates:
-            return 0.0
-
-        candidates = list(candidates)
-        top_k = max(1, int(future_candidate_top_k))
-        if len(candidates) > top_k:
-            estimates = np.array(
-                [
-                    qgd_Partition_Aware_Mapping._routing_objective(
-                        candidate.estimate_swap_count(
-                            output_perm_arr, D, reverse=reverse
-                        )
-                        * swap_cost,
-                        candidate.cnot_count,
-                        local_cost_weight,
-                        cnot_weight=future_candidate_weight,
-                    )
-                    for candidate in candidates
-                ],
-                dtype=float,
-            )
-            idx = np.argpartition(estimates, top_k - 1)[:top_k]
-            candidates = [candidates[i] for i in idx]
-
-        best = None
-        for candidate in candidates:
-            swaps, _ = candidate.transform_pi(
-                output_perm_arr,
-                D,
-                swap_cache,
-                reverse=reverse,
-                adj=adj,
-                neighbor_info=None,
-            )
-            cost = qgd_Partition_Aware_Mapping._routing_objective(
-                swap_cost * len(swaps),
-                candidate.cnot_count,
-                local_cost_weight,
-                cnot_weight=future_candidate_weight,
-            )
-            if best is None or cost < best:
-                best = cost
-        return 0.0 if best is None else float(best)
-
     @staticmethod
     def _best_release_variant(entry, pi_arr, D_arr, future_cost_mode="canonical"):
         best = None
@@ -1653,9 +1592,6 @@ def Heuristic_Search(
         future_candidate_weight = self.config.get(
             "future_candidate_weight", 1.0
         )
-        future_candidate_top_k = self.config.get(
-            "future_candidate_top_k", 4
-        )
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
@@ -1732,8 +1668,6 @@ def Heuristic_Search(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
-                    future_candidate_top_k=future_candidate_top_k,
-                    candidate_cache=candidate_cache,
                 )
                 for partition_candidate in partition_candidates
             ]
@@ -1870,9 +1804,6 @@ def _heuristic_search_layout_only(
         future_candidate_weight = self.config.get(
             "future_candidate_weight", 1.0
         )
-        future_candidate_top_k = self.config.get(
-            "future_candidate_top_k", 4
-        )
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=reverse
@@ -1953,8 +1884,6 @@ def _heuristic_search_layout_only(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
-                    future_candidate_top_k=future_candidate_top_k,
-                    candidate_cache=candidate_cache,
                 )
                 for pc in partition_candidates
             ]
@@ -2079,7 +2008,7 @@ def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
         future_cost_mode = self.config.get("future_cost_mode", "canonical")
         top_k = (
             max(1, int(self.config.get("future_candidate_top_k", 4)))
-            if future_cost_mode in ("topk_min", "fullpas_min")
+            if future_cost_mode == "topk_min"
             else 1
         )
         data = {}
@@ -2142,9 +2071,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   local_cost_weight=0.1, swap_cost=15.0,
                                   path_tiebreak_weight=0.2, decay=None,
                                   future_cost_mode="canonical",
-                                  future_candidate_weight=1.0,
-                                  future_candidate_top_k=4,
-                                  candidate_cache=None):
+                                  future_candidate_weight=1.0):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = swap_cost * |swaps|
@@ -2200,30 +2127,15 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             if entry is None:
                 continue
             n_other += 1
-            if future_cost_mode == "fullpas_min":
-                f_sum += qgd_Partition_Aware_Mapping._future_partition_cost_fullpas(
-                    partition_idx,
-                    output_perm_arr,
-                    D,
-                    swap_cache,
-                    adj=adj,
-                    reverse=reverse,
-                    candidate_cache=candidate_cache,
-                    future_candidate_top_k=future_candidate_top_k,
-                    local_cost_weight=local_cost_weight,
-                    swap_cost=swap_cost,
-                    future_candidate_weight=future_candidate_weight,
-                )
-            else:
-                f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
-                    entry,
-                    output_perm_arr,
-                    D_arr,
-                    swap_cost,
-                    local_cost_weight,
-                    future_cost_mode=future_cost_mode,
-                    future_candidate_weight=future_candidate_weight,
-                )
+            f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
+                entry,
+                output_perm_arr,
+                D_arr,
+                swap_cost,
+                local_cost_weight,
+                future_cost_mode=future_cost_mode,
+                future_candidate_weight=future_candidate_weight,
+            )
         if n_other > 0:
             score += f_sum / n_other
 
@@ -2236,30 +2148,15 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 entry = canonical_data.get(partition_idx)
                 if entry is None:
                     continue
-                if future_cost_mode == "fullpas_min":
-                    d_cost = qgd_Partition_Aware_Mapping._future_partition_cost_fullpas(
-                        partition_idx,
-                        output_perm_arr,
-                        D,
-                        swap_cache,
-                        adj=adj,
-                        reverse=reverse,
-                        candidate_cache=candidate_cache,
-                        future_candidate_top_k=future_candidate_top_k,
-                        local_cost_weight=local_cost_weight,
-                        swap_cost=swap_cost,
-                        future_candidate_weight=future_candidate_weight,
-                    )
-                else:
-                    d_cost = qgd_Partition_Aware_Mapping._entry_future_cost(
-                        entry,
-                        output_perm_arr,
-                        D_arr,
-                        swap_cost,
-                        local_cost_weight,
-                        future_cost_mode=future_cost_mode,
-                        future_candidate_weight=future_candidate_weight,
-                    )
+                d_cost = qgd_Partition_Aware_Mapping._entry_future_cost(
+                    entry,
+                    output_perm_arr,
+                    D_arr,
+                    swap_cost,
+                    local_cost_weight,
+                    future_cost_mode=future_cost_mode,
+                    future_candidate_weight=future_candidate_weight,
+                )
                 e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
 

From 473a62be58d0b38210a1ced50cf9c12eab44d53d Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 16:47:19 +0200
Subject: [PATCH 152/232] Fix routing bugs

---
 .../qgd_Wide_Circuit_Optimization.py          |   2 +-
 .../sabre_router/include/sabre_router.hpp     |   8 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 137 ++++++++-------
 squander/synthesis/PartAM.py                  | 162 +++++++++++-------
 squander/synthesis/PartAM_utils.py            |  35 +++-
 5 files changed, 213 insertions(+), 131 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 900e7868d..9dcd81773 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -2040,7 +2040,7 @@ def OptimizeWideCircuit(
             if self.config.get("use_osr", False) or self.config.get(
                 "use_graph_search", False
             ):
-                part_size_end = min(4, circ.get_Qbit_Num())
+                part_size_end = min(3, circ.get_Qbit_Num())
             count = CNOTGateCount(circ, 0)
             fingerprint_dict = {}
             for max_part_size in range(part_size_start, part_size_end + 1):
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 7a95bbe8b..5a89eaef2 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -134,9 +134,14 @@ struct SwapCacheKey {
     int64_t pi_snapshot;
     int64_t targets;
     int k;
+    // 0 when the neighbor tiebreak is inactive; otherwise a stable hash of
+    // (edges, initial_pos, weight) from NeighborInfo so that two calls with
+    // the same active future context share cache entries.
+    uint64_t neighbor_hash;
 
     bool operator==(const SwapCacheKey& o) const {
-        return pi_snapshot == o.pi_snapshot && targets == o.targets && k == o.k;
+        return pi_snapshot == o.pi_snapshot && targets == o.targets
+            && k == o.k && neighbor_hash == o.neighbor_hash;
     }
 };
 
@@ -145,6 +150,7 @@ struct SwapCacheKeyHash {
         size_t h = static_cast<size_t>(k.pi_snapshot);
         h ^= static_cast<size_t>(k.targets) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
         h ^= static_cast<size_t>(k.k) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+        h ^= static_cast<size_t>(k.neighbor_hash) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
         return h;
     }
 };
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index dd64c688f..8d4afd450 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -8,6 +8,7 @@ C++ backend for the SABRE-style partition-aware routing engine.
 
 #include <algorithm>
 #include <cmath>
+#include <cstring>
 #include <deque>
 #include <functional>
 #include <initializer_list>
@@ -366,25 +367,20 @@ NeighborInfo SabreRouter::build_neighbor_info(
         auto it = canonical_data.find(partition_idx);
         if (it == canonical_data.end()) return;
         const auto& entry = it->second;
+        if (entry.variants.empty()) {
+            return;
+        }
+        const size_t limit = (
+            config_.future_cost_mode == "topk_min"
+        )
+            ? entry.variants.size()
+            : std::min<size_t>(1, entry.variants.size());
         std::vector<const CanonicalEntry::FutureVariant*> active_variants;
-        if (!entry.variants.empty()) {
-            const size_t limit = (
-                config_.future_cost_mode == "topk_min"
-            )
-                ? entry.variants.size()
-                : std::min<size_t>(1, entry.variants.size());
-            active_variants.reserve(limit);
-            for (size_t i = 0; i < limit; i++) {
-                if (!entry.variants[i].edges_u.empty()) {
-                    active_variants.push_back(&entry.variants[i]);
-                }
+        active_variants.reserve(limit);
+        for (size_t i = 0; i < limit; i++) {
+            if (!entry.variants[i].edges_u.empty()) {
+                active_variants.push_back(&entry.variants[i]);
             }
-        } else if (!entry.edges_u.empty()) {
-            CanonicalEntry::FutureVariant primary;
-            primary.edges_u = entry.edges_u;
-            primary.edges_v = entry.edges_v;
-            primary.cnot = entry.cnot;
-            active_variants.push_back(&primary);
         }
         if (active_variants.empty()) {
             return;
@@ -552,21 +548,16 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
             }
         };
 
-        if (!entry.variants.empty()) {
-            const size_t limit = (
-                config_.future_cost_mode == "topk_min"
-            )
-                ? entry.variants.size()
-                : std::min<size_t>(1, entry.variants.size());
-            for (size_t i = 0; i < limit; i++) {
-                consider_variant(entry.variants[i]);
-            }
-        } else {
-            CanonicalEntry::FutureVariant primary;
-            primary.edges_u = entry.edges_u;
-            primary.edges_v = entry.edges_v;
-            primary.cnot = entry.cnot;
-            consider_variant(primary);
+        if (entry.variants.empty()) {
+            continue;
+        }
+        const size_t limit = (
+            config_.future_cost_mode == "topk_min"
+        )
+            ? entry.variants.size()
+            : std::min<size_t>(1, entry.variants.size());
+        for (size_t i = 0; i < limit; i++) {
+            consider_variant(entry.variants[i]);
         }
 
         if (worst_dist <= 1.0 || worst_u < 0) {
@@ -722,11 +713,38 @@ SabreRouter::find_constrained_swaps(
         return {{}, pi};
     }
 
-    const SwapCacheKey cache_key{initial_packed, target_packed, k};
     const bool use_neighbor =
         neighbor_info != nullptr && neighbor_info->uses_tiebreak();
 
-    if (swap_cache && !use_neighbor) {
+    auto mix64 = [](uint64_t h, uint64_t v) -> uint64_t {
+        h ^= v + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+        return h;
+    };
+
+    uint64_t neighbor_hash = 0;
+    if (use_neighbor) {
+        neighbor_hash = 0xcbf29ce484222325ULL;
+        for (const auto& edge : neighbor_info->edges) {
+            const int lo = std::min(edge.u_idx, edge.v_idx);
+            const int hi = std::max(edge.u_idx, edge.v_idx);
+            uint64_t w_bits;
+            std::memcpy(&w_bits, &edge.weight, sizeof(w_bits));
+            neighbor_hash = mix64(neighbor_hash, static_cast<uint64_t>(lo));
+            neighbor_hash = mix64(neighbor_hash, static_cast<uint64_t>(hi));
+            neighbor_hash = mix64(neighbor_hash, w_bits);
+        }
+        for (int p : neighbor_info->initial_pos) {
+            neighbor_hash = mix64(neighbor_hash, static_cast<uint64_t>(p));
+        }
+        uint64_t weight_bits;
+        const double weight_val = neighbor_info->weight;
+        std::memcpy(&weight_bits, &weight_val, sizeof(weight_bits));
+        neighbor_hash = mix64(neighbor_hash, weight_bits);
+    }
+
+    const SwapCacheKey cache_key{initial_packed, target_packed, k, neighbor_hash};
+
+    if (swap_cache) {
         auto it = swap_cache->find(cache_key);
         if (it != swap_cache->end()) {
             auto result_pi = apply_swaps_to_pi(pi, it->second);
@@ -858,7 +876,7 @@ SabreRouter::find_constrained_swaps(
             std::reverse(path.begin(), path.end());
 
             auto result_pi = apply_swaps_to_pi(pi, path);
-            if (swap_cache && !use_neighbor) {
+            if (swap_cache) {
                 (*swap_cache)[cache_key] = path;
             }
             return {path, result_pi};
@@ -1103,37 +1121,32 @@ double SabreRouter::entry_future_cost(
     const CanonicalEntry& entry,
     const std::vector<int>& pi
 ) const {
-    if (!entry.variants.empty()) {
-        const size_t limit = (
+    if (entry.variants.empty()) {
+        return 0.0;
+    }
+    const size_t limit = (
+        config_.future_cost_mode == "topk_min"
+    )
+        ? entry.variants.size()
+        : std::min<size_t>(1, entry.variants.size());
+    double best = std::numeric_limits<double>::infinity();
+    for (size_t i = 0; i < limit; i++) {
+        const auto& variant = entry.variants[i];
+        const double cnot_weight = (
             config_.future_cost_mode == "topk_min"
         )
-            ? entry.variants.size()
-            : std::min<size_t>(1, entry.variants.size());
-        double best = std::numeric_limits<double>::infinity();
-        for (size_t i = 0; i < limit; i++) {
-            const auto& variant = entry.variants[i];
-            const double cnot_weight = (
-                config_.future_cost_mode == "topk_min"
-            )
-                ? config_.future_candidate_weight
-                : 0.0;
-            const double cost = routing_objective(
-                variant_routing_cost(variant, pi),
-                variant.cnot,
-                cnot_weight
-            );
-            if (cost < best) {
-                best = cost;
-            }
+            ? config_.future_candidate_weight
+            : 0.0;
+        const double cost = routing_objective(
+            variant_routing_cost(variant, pi),
+            variant.cnot,
+            cnot_weight
+        );
+        if (cost < best) {
+            best = cost;
         }
-        return std::isfinite(best) ? best : 0.0;
     }
-
-    CanonicalEntry::FutureVariant primary;
-    primary.edges_u = entry.edges_u;
-    primary.edges_v = entry.edges_v;
-    primary.cnot = entry.cnot;
-    return routing_objective(variant_routing_cost(primary, pi), primary.cnot, 0.0);
+    return std::isfinite(best) ? best : 0.0;
 }
 
 double SabreRouter::compute_routing_cost(
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 858d7da3d..056c2558c 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -143,7 +143,8 @@ def __init__(self, config):
         self.config.setdefault('bh_interval', 50)
         self.config.setdefault('bh_target_accept_rate', 0.5)
         self.config.setdefault('bh_stepwise_factor', 0.9)
-        self.config.setdefault('use_osr', 0)
+        self.config.setdefault('use_osr', 1)
+        self.config.setdefault("use_graph_search", 1)
         self.config.setdefault('n_layout_trials', 1)
         self.config.setdefault('score_tolerance', 0.05)
         self.config.setdefault('trial_swap_cnot_cost', 3)
@@ -156,6 +157,16 @@ def __init__(self, config):
         self.config.setdefault('release_valve_enabled', True)
         self.config.setdefault('release_valve_threshold', 20)
         self.config.setdefault('path_tiebreak_weight', 0.2)
+        # The neighbor heuristic is normalized to [0, 1] and added to A*'s f-value.
+        # g-deltas are integer and h-deltas are half-integer, so preserving
+        # swap-count optimality requires weight < 0.5.
+        if self.config['path_tiebreak_weight'] >= 0.5:
+            logging.warning(
+                "path_tiebreak_weight=%.3f ≥ 0.5 may override SWAP-count "
+                "optimality; clamping to 0.49.",
+                self.config['path_tiebreak_weight'],
+            )
+            self.config['path_tiebreak_weight'] = 0.49
         self.config.setdefault('future_cost_mode', 'canonical')
         self.config.setdefault('future_candidate_top_k', 4)
         self.config.setdefault('future_candidate_weight', 1.0)
@@ -1647,9 +1658,33 @@ def Heuristic_Search(
                 max_lookahead=max_lookahead,
             )
 
-            scores = [
-                self.score_partition_candidate(
-                    partition_candidate,
+            # Group candidates by partition_idx to reuse _build_neighbor_info
+            candidate_order = sorted(
+                range(len(partition_candidates)),
+                key=lambda i: partition_candidates[i].partition_idx
+            )
+            scores = [0.0] * len(partition_candidates)
+            cached_swaps = [None] * len(partition_candidates)
+            cached_pi = [None] * len(partition_candidates)
+            prev_partition_idx = None
+            cached_neighbor_info = None
+            for ci in candidate_order:
+                cand = partition_candidates[ci]
+                if cand.partition_idx != prev_partition_idx:
+                    cached_neighbor_info = self._build_neighbor_info(
+                        cand.partition_idx,
+                        F_snapshot,
+                        E,
+                        pi,
+                        canonical_data,
+                        weight=self.config.get("path_tiebreak_weight", 0.2),
+                        W=E_W,
+                        alpha=E_alpha,
+                        future_cost_mode=future_cost_mode,
+                    )
+                    prev_partition_idx = cand.partition_idx
+                score, swaps, output_perm = self.score_partition_candidate(
+                    cand,
                     F_snapshot,
                     pi,
                     scoring_partitions,
@@ -1668,36 +1703,24 @@ def Heuristic_Search(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
+                    cached_neighbor_info=cached_neighbor_info,
+                    return_transforms=True,
                 )
-                for partition_candidate in partition_candidates
-            ]
+                scores[ci] = score
+                cached_swaps[ci] = swaps
+                cached_pi[ci] = output_perm
+
             min_partition_candidate = self._select_best_candidate(
                 partition_candidates, scores
             )
+            best_idx = partition_candidates.index(min_partition_candidate)
 
             F.remove(min_partition_candidate.partition_idx)
             resolved_partitions[min_partition_candidate.partition_idx] = True
             resolved_count += 1
             pbar.update(1)
 
-            best_neighbor_info = self._build_neighbor_info(
-                min_partition_candidate.partition_idx,
-                F_snapshot,
-                E,
-                pi,
-                canonical_data,
-                weight=self.config.get("path_tiebreak_weight", 0.2),
-                W=E_W,
-                alpha=E_alpha,
-                future_cost_mode=future_cost_mode,
-            )
-            swap_order, pi = min_partition_candidate.transform_pi(
-                pi,
-                D,
-                self._swap_cache,
-                adj=self._adj,
-                neighbor_info=best_neighbor_info,
-            )
+            swap_order, pi = cached_swaps[best_idx], cached_pi[best_idx]
             if swap_order:
                 partition_order.append(construct_swap_circuit(swap_order, len(pi)))
                 self._apply_decay_for_swaps(swap_order, decay)
@@ -1862,9 +1885,33 @@ def _heuristic_search_layout_only(
                 max_lookahead=max_lookahead,
             )
 
-            scores = [
-                self.score_partition_candidate(
-                    pc,
+            # Group candidates by partition_idx to reuse _build_neighbor_info
+            candidate_order = sorted(
+                range(len(partition_candidates)),
+                key=lambda i: partition_candidates[i].partition_idx
+            )
+            scores = [0.0] * len(partition_candidates)
+            cached_swaps = [None] * len(partition_candidates)
+            cached_pi = [None] * len(partition_candidates)
+            prev_partition_idx = None
+            cached_neighbor_info = None
+            for ci in candidate_order:
+                cand = partition_candidates[ci]
+                if cand.partition_idx != prev_partition_idx:
+                    cached_neighbor_info = self._build_neighbor_info(
+                        cand.partition_idx,
+                        F_snapshot,
+                        E,
+                        pi,
+                        canonical_data,
+                        weight=self.config.get("path_tiebreak_weight", 0.2),
+                        W=E_W,
+                        alpha=E_alpha,
+                        future_cost_mode=future_cost_mode,
+                    )
+                    prev_partition_idx = cand.partition_idx
+                score, swaps, output_perm = self.score_partition_candidate(
+                    cand,
                     F_snapshot,
                     pi,
                     scoring_partitions,
@@ -1884,35 +1931,21 @@ def _heuristic_search_layout_only(
                     decay=decay,
                     future_cost_mode=future_cost_mode,
                     future_candidate_weight=future_candidate_weight,
+                    cached_neighbor_info=cached_neighbor_info,
+                    return_transforms=True,
                 )
-                for pc in partition_candidates
-            ]
+                scores[ci] = score
+                cached_swaps[ci] = swaps
+                cached_pi[ci] = output_perm
 
             best = self._select_best_candidate(
                 partition_candidates, scores, rng=rng
             )
+            best_idx = partition_candidates.index(best)
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
-            best_neighbor_info = self._build_neighbor_info(
-                best.partition_idx,
-                F_snapshot,
-                E,
-                pi,
-                canonical_data,
-                weight=self.config.get("path_tiebreak_weight", 0.2),
-                W=E_W,
-                alpha=E_alpha,
-                future_cost_mode=future_cost_mode,
-            )
-            swaps, pi = best.transform_pi(
-                pi,
-                D,
-                self._swap_cache,
-                reverse=reverse,
-                adj=self._adj,
-                neighbor_info=best_neighbor_info,
-            )
+            swaps, pi = cached_swaps[best_idx], cached_pi[best_idx]
             decay_factor = 1.0
             if swaps:
                 decay_factor = self._decay_factor_for_swaps(swaps, decay)
@@ -2071,7 +2104,9 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   local_cost_weight=0.1, swap_cost=15.0,
                                   path_tiebreak_weight=0.2, decay=None,
                                   future_cost_mode="canonical",
-                                  future_candidate_weight=1.0):
+                                  future_candidate_weight=1.0,
+                                  cached_neighbor_info=None,
+                                  return_transforms=False):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = swap_cost * |swaps|
@@ -2079,17 +2114,20 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
           + (1/|F'|) * average routing cost over F \\ {cand}
           + (W/|E|)  * alpha^d-decayed routing cost over E
         """
-        neighbor_info = qgd_Partition_Aware_Mapping._build_neighbor_info(
-            partition_candidate.partition_idx,
-            F,
-            E,
-            pi,
-            canonical_data,
-            weight=path_tiebreak_weight,
-            W=W,
-            alpha=alpha,
-            future_cost_mode=future_cost_mode,
-        )
+        if cached_neighbor_info is not None:
+            neighbor_info = cached_neighbor_info
+        else:
+            neighbor_info = qgd_Partition_Aware_Mapping._build_neighbor_info(
+                partition_candidate.partition_idx,
+                F,
+                E,
+                pi,
+                canonical_data,
+                weight=path_tiebreak_weight,
+                W=W,
+                alpha=alpha,
+                future_cost_mode=future_cost_mode,
+            )
         swaps, output_perm = partition_candidate.transform_pi(
             pi,
             D,
@@ -2111,6 +2149,8 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
         )
 
         if canonical_data is None:
+            if return_transforms:
+                return score, swaps, output_perm
             return score
 
         output_perm_arr = np.asarray(output_perm, dtype=np.intp)
@@ -2160,6 +2200,8 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
 
+        if return_transforms:
+            return score, swaps, output_perm
         return score
 
     # ------------------------------------------------------------------------
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index dc856851b..4ab63d713 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -13,6 +13,28 @@
 # ============================================================================
 # SWAP Routing Algorithms
 # ============================================================================
+def _neighbor_signature(neighbor_info):
+    """Stable hash-friendly signature of an active neighbor_info.
+
+    Returns None when the neighbor heuristic is inactive (no info, zero
+    weight, or empty edge list) — callers treat all such calls as cache-
+    compatible.  Otherwise returns a tuple of (sorted edges as
+    (min(u,v), max(u,v), weight), initial_pos tuple, rounded weight).
+    """
+    if neighbor_info is None:
+        return None
+    weight = neighbor_info.get('weight', 0.0)
+    edges = neighbor_info.get('edges') or ()
+    if weight == 0.0 or not edges:
+        return None
+    canonical_edges = tuple(sorted(
+        (min(int(u), int(v)), max(int(u), int(v)), float(w))
+        for u, v, w in edges
+    ))
+    initial_pos = tuple(int(p) for p in neighbor_info.get('initial_pos', ()))
+    return (canonical_edges, initial_pos, round(float(weight), 6))
+
+
 def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix, adj=None, neighbor_info=None):
     """
     Route partition qubits to their target physical positions using A* over
@@ -529,15 +551,14 @@ def transform_pi(self, pi, D, swap_cache=None, reverse=False, adj=None, neighbor
         pi_list = [int(x) for x in pi]
         n = len(pi_list)
 
-        # Check cache if provided (Bug A: skip cache when neighbor heuristic is active,
-        # since cached paths were computed with different future context)
-        use_cache = (neighbor_info is None or
-                      neighbor_info.get('weight', 0) == 0 or
-                      not neighbor_info.get('edges', []))
-        if swap_cache is not None and use_cache:
+        # Cache is keyed on (pi, qbit_map, neighbor_signature). The signature
+        # captures the neighbor-heuristic context so hits across calls with
+        # the same active neighbor_info are safe.
+        if swap_cache is not None:
             pi_tuple = tuple(pi_list)
             qbit_map_frozen = frozenset(qbit_map_input.items())
-            cache_key = (pi_tuple, qbit_map_frozen)
+            neighbor_sig = _neighbor_signature(neighbor_info)
+            cache_key = (pi_tuple, qbit_map_frozen, neighbor_sig)
             if cache_key in swap_cache:
                 swaps, pi_init = swap_cache[cache_key]
             else:

From 75699136c7df326b75e1d9f40fcfc906f49493e2 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 19:40:13 +0200
Subject: [PATCH 153/232] Fix

---
 squander/src-cpp/sabre_router/sabre_router.cpp |  8 ++++++++
 squander/synthesis/PartAM.py                   | 10 ++++++++--
 squander/synthesis/PartAM_utils.py             |  2 +-
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 8d4afd450..5de3d0c50 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -865,6 +865,14 @@ SabreRouter::find_constrained_swaps(
         const int g = g_e;
         const int64_t packed = arena[idx].packed;
 
+        // A state can be reinserted with a lower g-cost after this queue entry
+        // was pushed. The hash table always points at the current best arena
+        // node for a packed state, so discard stale superseded nodes before
+        // accepting a target or expanding neighbors.
+        if (table[table_slot(packed)] != idx) {
+            continue;
+        }
+
         if (packed == target_packed) {
             // Reconstruct path
             std::vector<std::pair<int,int>> path;
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 056c2558c..57b36b773 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -329,6 +329,10 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
                 tuple(tuple(struct) for struct in partition.circuit_structures[tdx])
                 for tdx in range(len(partition.mini_topologies))
             )
+            cnot_counts = tuple(
+                tuple(int(cnot) for cnot in partition.cnot_counts[tdx])
+                for tdx in range(len(partition.mini_topologies))
+            )
 
             scoring_partitions.append(
                 PartitionScoreData(
@@ -336,6 +340,7 @@ def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[Parti
                     topology_candidates=tuple(topology_candidates),
                     permutations_pairs=permutations_pairs,
                     circuit_structures=circuit_structures,
+                    cnot_counts=cnot_counts,
                     qubit_map=dict(partition.qubit_map),
                     involved_qbits=tuple(partition.involved_qbits),
                 )
@@ -387,6 +392,7 @@ def _build_partition_candidate_cache(self, scoring_partitions):
                 topology_candidates = partition.topology_candidates[tdx]
                 permutation_pairs = partition.permutations_pairs[tdx]
                 circuit_structures = partition.circuit_structures[tdx]
+                cnot_counts = partition.cnot_counts[tdx]
 
                 for topology_candidate in topology_candidates:
                     for pdx, permutation_pair in enumerate(permutation_pairs):
@@ -403,7 +409,7 @@ def _build_partition_candidate_cache(self, scoring_partitions):
                                 mini_topology,
                                 partition.qubit_map,
                                 partition.involved_qbits,
-                                cnot_count=len(circuit_structure),
+                                cnot_count=cnot_counts[pdx],
                             )
                         )
 
@@ -2052,7 +2058,7 @@ def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
             variant_map = {}
             for tdx, mini_topology in enumerate(partition.mini_topologies):
                 for pdx, (P_i, P_o) in enumerate(partition.permutations_pairs[tdx]):
-                    cnot = len(partition.circuit_structures[tdx][pdx])
+                    cnot = partition.cnot_counts[tdx][pdx]
                     P_route = P_o if reverse else P_i
                     if mini_topology:
                         edge_key = tuple(
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 4ab63d713..90658a152 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -610,6 +610,7 @@ class PartitionScoreData:
         Tuple[Tuple[Tuple[int, ...], Tuple[int, ...]], ...], ...
     ]
     circuit_structures: Tuple[Tuple[Tuple[int, ...], ...], ...]
+    cnot_counts: Tuple[Tuple[int, ...], ...]
     qubit_map: Dict[int, int]
     involved_qbits: Tuple[int, ...]
 
@@ -650,4 +651,3 @@ def construct_swap_circuit(swap_order, N):
         swap_circ.add_CNOT(swap[1],swap[0])
         swap_circ.add_CNOT(swap[0],swap[1])
     return swap_circ
-

From e3feb8cb91058c67348f2778b38823edb6a191bf Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 21:46:54 +0200
Subject: [PATCH 154/232] Rework

---
 .../sabre_router/include/sabre_router.hpp     |  52 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 251 +++----
 squander/synthesis/PartAM.py                  | 626 +++++++++---------
 squander/synthesis/bindings.cpp               |  69 +-
 4 files changed, 460 insertions(+), 538 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 5a89eaef2..eb4e577ad 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -37,6 +37,7 @@ struct CandidateData {
     int partition_idx;
     int topology_idx;
     int permutation_idx;
+    int candidate_idx = -1;
     int cnot_count;
 
     // Permutations within the reduced (q*) space
@@ -67,13 +68,7 @@ struct CandidateData {
 struct CanonicalEntry {
     std::vector<int> edges_u; // virtual qubit indices
     std::vector<int> edges_v;
-    int cnot;
-    struct FutureVariant {
-        std::vector<int> edges_u;
-        std::vector<int> edges_v;
-        int cnot = 0;
-    };
-    std::vector<FutureVariant> variants;
+    int cnot = 0;
 };
 
 struct LayoutPartInfo {
@@ -86,22 +81,29 @@ struct SabreConfig {
     int max_E_size = 20;
     int max_lookahead = 4;
     double E_weight = 0.5;
-    double E_alpha = 0.9;
-    double local_cost_weight = 0.1;
-    double swap_cost = 15.0;
-    double score_tolerance = 0.05;
-    int trial_swap_cnot_cost = 3;
+    double E_alpha = 1.0; // LightSABRE uses no per-depth decay; set <1 for SQUANDER-style decay
+    double cnot_cost = 0.1 / 15.0; // weight on candidate.cnot_count; swap cost is fixed at 1.0
     int sabre_iterations = 1;
     int n_layout_trials = 1;
     int random_seed = 42;
-    double decay_delta = 0.1;
-    int decay_reset_interval = 5;
-    bool release_valve_enabled = true;
-    int release_valve_threshold = 20;
+    double decay_delta = 0.001; // Qiskit LightSABRE DECAY_RATE
+    int swap_burst_budget = 5; // Qiskit LightSABRE DECAY_RESET_INTERVAL
     double path_tiebreak_weight = 0.2;
-    std::string future_cost_mode = "canonical";
-    int future_candidate_top_k = 4;
-    double future_candidate_weight = 1.0;
+};
+
+struct RouteStep {
+    int type = 0; // 0=swap, 1=partition, 2=single
+    int partition_idx = -1;
+    int candidate_idx = -1;
+    int physical_qubit = -1;
+    std::vector<std::pair<int,int>> swaps;
+};
+
+struct ForwardRouteResult {
+    std::vector<int> pi_initial;
+    std::vector<int> pi;
+    int cnot_count = 0;
+    std::vector<RouteStep> steps;
 };
 
 struct TrialResult {
@@ -203,6 +205,10 @@ class SabreRouter {
     );
 
     // Thread-safe: all mutable state is stack-local
+    ForwardRouteResult route_forward(
+        const std::vector<int>& pi
+    ) const;
+
     TrialResult run_trial(
         int trial_idx,
         const std::vector<int>& seeded_pi,
@@ -225,7 +231,8 @@ class SabreRouter {
         std::mt19937* rng,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
         const std::vector<std::vector<int>>& children_graph,
-        const std::vector<std::vector<int>>& parents_graph
+        const std::vector<std::vector<int>>& parents_graph,
+        ForwardRouteResult* route_trace = nullptr
     ) const;
 
     // A* constrained swap search (port of find_constrained_swaps_partial)
@@ -406,11 +413,6 @@ class SabreRouter {
         const std::unordered_map<int, CanonicalEntry>& canonical_data
     ) const;
 
-    double variant_routing_cost(
-        const CanonicalEntry::FutureVariant& variant,
-        const std::vector<int>& pi
-    ) const;
-
     double entry_future_cost(
         const CanonicalEntry& entry,
         const std::vector<int>& pi
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 5de3d0c50..25bee094d 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -367,29 +367,9 @@ NeighborInfo SabreRouter::build_neighbor_info(
         auto it = canonical_data.find(partition_idx);
         if (it == canonical_data.end()) return;
         const auto& entry = it->second;
-        if (entry.variants.empty()) {
-            return;
-        }
-        const size_t limit = (
-            config_.future_cost_mode == "topk_min"
-        )
-            ? entry.variants.size()
-            : std::min<size_t>(1, entry.variants.size());
-        std::vector<const CanonicalEntry::FutureVariant*> active_variants;
-        active_variants.reserve(limit);
-        for (size_t i = 0; i < limit; i++) {
-            if (!entry.variants[i].edges_u.empty()) {
-                active_variants.push_back(&entry.variants[i]);
-            }
-        }
-        if (active_variants.empty()) {
-            return;
-        }
-        const double variant_weight = weight / static_cast<double>(active_variants.size());
-        for (const auto* variant : active_variants) {
-            for (size_t i = 0; i < variant->edges_u.size(); i++) {
-                add_edge(variant->edges_u[i], variant->edges_v[i], variant_weight);
-            }
+        if (entry.edges_u.empty()) return;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            add_edge(entry.edges_u[i], entry.edges_v[i], weight);
         }
     };
 
@@ -434,7 +414,7 @@ double SabreRouter::routing_objective(
 ) const {
     return decay_factor * (
         route_cost
-        + cnot_weight * config_.local_cost_weight * static_cast<double>(cnot_count)
+        + cnot_weight * config_.cnot_cost * static_cast<double>(cnot_count)
     );
 }
 
@@ -505,64 +485,25 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
 
     for (int partition_idx : F) {
         auto it = canonical_data.find(partition_idx);
-        if (it == canonical_data.end()) {
-            continue;
-        }
+        if (it == canonical_data.end()) continue;
         const auto& entry = it->second;
-        double chosen_route_cost = std::numeric_limits<double>::infinity();
+        if (entry.edges_u.empty()) continue;
+
         double worst_dist = 0.0;
         int worst_u = -1;
         int worst_v = -1;
-
-        auto consider_variant = [&](const CanonicalEntry::FutureVariant& variant) {
-            if (variant.edges_u.empty()) {
-                return;
-            }
-            double route_cost = 0.0;
-            double variant_worst_dist = 0.0;
-            int variant_worst_u = -1;
-            int variant_worst_v = -1;
-            for (size_t i = 0; i < variant.edges_u.size(); i++) {
-                const int u = variant.edges_u[i];
-                const int v = variant.edges_v[i];
-                const double d = dist(pi[u], pi[v]);
-                const double cost = d - 1.0;
-                if (cost > 0.0) {
-                    route_cost += config_.swap_cost * cost;
-                }
-                if (d > variant_worst_dist) {
-                    variant_worst_dist = d;
-                    variant_worst_u = u;
-                    variant_worst_v = v;
-                }
-            }
-            if (
-                route_cost < chosen_route_cost
-                || (route_cost == chosen_route_cost
-                    && variant_worst_dist < worst_dist)
-            ) {
-                chosen_route_cost = route_cost;
-                worst_dist = variant_worst_dist;
-                worst_u = variant_worst_u;
-                worst_v = variant_worst_v;
+        for (size_t i = 0; i < entry.edges_u.size(); i++) {
+            const int u = entry.edges_u[i];
+            const int v = entry.edges_v[i];
+            const double d = dist(pi[u], pi[v]);
+            if (d > worst_dist) {
+                worst_dist = d;
+                worst_u = u;
+                worst_v = v;
             }
-        };
-
-        if (entry.variants.empty()) {
-            continue;
-        }
-        const size_t limit = (
-            config_.future_cost_mode == "topk_min"
-        )
-            ? entry.variants.size()
-            : std::min<size_t>(1, entry.variants.size());
-        for (size_t i = 0; i < limit; i++) {
-            consider_variant(entry.variants[i]);
         }
 
-        if (worst_dist <= 1.0 || worst_u < 0) {
-            continue;
-        }
+        if (worst_dist <= 1.0 || worst_u < 0) continue;
 
         if (
             worst_dist > best_worst_dist
@@ -1110,51 +1051,16 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
 // Routing cost helpers
 // ---------------------------------------------------------------------------
 
-double SabreRouter::variant_routing_cost(
-    const CanonicalEntry::FutureVariant& variant,
-    const std::vector<int>& pi
-) const {
-    double total = 0.0;
-    for (size_t i = 0; i < variant.edges_u.size(); i++) {
-        const double d = dist(pi[variant.edges_u[i]], pi[variant.edges_v[i]]);
-        const double cost = d - 1.0;
-        if (cost > 0.0) {
-            total += config_.swap_cost * cost;
-        }
-    }
-    return total;
-}
-
 double SabreRouter::entry_future_cost(
     const CanonicalEntry& entry,
     const std::vector<int>& pi
 ) const {
-    if (entry.variants.empty()) {
-        return 0.0;
-    }
-    const size_t limit = (
-        config_.future_cost_mode == "topk_min"
-    )
-        ? entry.variants.size()
-        : std::min<size_t>(1, entry.variants.size());
-    double best = std::numeric_limits<double>::infinity();
-    for (size_t i = 0; i < limit; i++) {
-        const auto& variant = entry.variants[i];
-        const double cnot_weight = (
-            config_.future_cost_mode == "topk_min"
-        )
-            ? config_.future_candidate_weight
-            : 0.0;
-        const double cost = routing_objective(
-            variant_routing_cost(variant, pi),
-            variant.cnot,
-            cnot_weight
-        );
-        if (cost < best) {
-            best = cost;
-        }
+    double total = 0.0;
+    for (size_t i = 0; i < entry.edges_u.size(); i++) {
+        const double d = dist(pi[entry.edges_u[i]], pi[entry.edges_v[i]]);
+        if (d > 1.0) total += d - 1.0;
     }
-    return std::isfinite(best) ? best : 0.0;
+    return total;
 }
 
 double SabreRouter::compute_routing_cost(
@@ -1235,7 +1141,7 @@ double SabreRouter::score_candidate(
         decay_factor = decay_factor_for_swaps(swaps, *decay);
     }
     double score = routing_objective(
-        config_.swap_cost * static_cast<double>(swaps.size()),
+        static_cast<double>(swaps.size()),
         cand.cnot_count,
         1.0,
         decay_factor
@@ -1326,7 +1232,7 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     estimated.reserve(candidates.size());
     for (const auto* cand : candidates) {
         const double est = routing_objective(
-            estimate_swap_count(*cand, pi, reverse) * config_.swap_cost,
+            static_cast<double>(estimate_swap_count(*cand, pi, reverse)),
             cand->cnot_count
         );
         estimated.push_back({est, cand});
@@ -1358,6 +1264,8 @@ const CandidateData& SabreRouter::select_best_candidate(
     const std::vector<double>& scores,
     std::mt19937* rng
 ) const {
+    (void)rng;
+
     // Find minimum score
     double min_score = scores[0];
     size_t min_idx = 0;
@@ -1368,18 +1276,6 @@ const CandidateData& SabreRouter::select_best_candidate(
         }
     }
 
-    if (rng && min_score > 0.0) {
-        std::vector<size_t> near_best;
-        double threshold = min_score * (1.0 + config_.score_tolerance);
-        for (size_t i = 0; i < scores.size(); i++) {
-            if (scores[i] <= threshold) near_best.push_back(i);
-        }
-        if (near_best.size() > 1) {
-            std::uniform_int_distribution<size_t> dist(0, near_best.size() - 1);
-            return *candidates[near_best[dist(*rng)]];
-        }
-    }
-
     return *candidates[min_idx];
 }
 
@@ -1395,7 +1291,8 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
     std::mt19937* rng,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
     const std::vector<std::vector<int>>& cg,
-    const std::vector<std::vector<int>>& pg
+    const std::vector<std::vector<int>>& pg,
+    ForwardRouteResult* route_trace
 ) const {
     std::vector<int> F;
     std::vector<int> queue;
@@ -1420,6 +1317,15 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
 
         if (resolved[p]) continue;
         resolved[p] = 1;
+        if (route_trace) {
+            RouteStep step;
+            step.type = 2;
+            step.partition_idx = p;
+            if (!layout_partitions_[p].involved_qbits.empty()) {
+                step.physical_qubit = pi[layout_partitions_[p].involved_qbits[0]];
+            }
+            route_trace->steps.push_back(std::move(step));
+        }
 
         for (int child : cg[p]) {
             if (!resolved[child] && !in_F[child]) {
@@ -1442,14 +1348,13 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
     // Swap cache for this search call (thread-local, on stack)
     SwapCache swap_cache;
     std::vector<double> decay(N_, 1.0);
-    int swap_burst = 0;
     int swap_heavy_partitions = 0;
 
     // Main search loop
     while (!F.empty()) {
         if (
-            config_.release_valve_enabled
-            && swap_burst > config_.release_valve_threshold
+            config_.swap_burst_budget > 0
+            && swap_heavy_partitions >= config_.swap_burst_budget
         ) {
             auto [valve_swaps, pi_bridged] = release_valve(
                 F,
@@ -1458,18 +1363,25 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             );
             if (!valve_swaps.empty()) {
                 total_cost += routing_objective(
-                    config_.swap_cost
-                        * static_cast<double>(valve_swaps.size()),
+                    static_cast<double>(valve_swaps.size()),
                     0,
                     1.0,
                     decay_factor_for_swaps(valve_swaps, decay)
                 );
+                if (route_trace) {
+                    RouteStep step;
+                    step.type = 0;
+                    step.swaps = valve_swaps;
+                    route_trace->cnot_count += static_cast<int>(valve_swaps.size()) * 3;
+                    route_trace->steps.push_back(std::move(step));
+                }
                 apply_decay_for_swaps(valve_swaps, decay);
                 pi = std::move(pi_bridged);
-                swap_burst = 0;
+                swap_heavy_partitions = 0;
                 continue;
             }
-            swap_burst = 0;
+            reset_decay(decay);
+            swap_heavy_partitions = 0;
         }
 
         auto all_candidates = obtain_partition_candidates(F);
@@ -1552,27 +1464,33 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             ? 1.0
             : decay_factor_for_swaps(swaps, decay);
         total_cost += routing_objective(
-            config_.swap_cost * static_cast<double>(swaps.size()),
+            static_cast<double>(swaps.size()),
             best.cnot_count,
             1.0,
             decay_factor
         );
+        if (route_trace) {
+            if (!swaps.empty()) {
+                RouteStep swap_step;
+                swap_step.type = 0;
+                swap_step.swaps = swaps;
+                route_trace->cnot_count += static_cast<int>(swaps.size()) * 3;
+                route_trace->steps.push_back(std::move(swap_step));
+            }
+            RouteStep part_step;
+            part_step.type = 1;
+            part_step.partition_idx = best.partition_idx;
+            part_step.candidate_idx = best.candidate_idx;
+            route_trace->cnot_count += best.cnot_count;
+            route_trace->steps.push_back(std::move(part_step));
+        }
         pi = std::move(pi_new);
         apply_decay_for_swaps(swaps, decay);
         if (swaps.empty()) {
-            swap_burst = 0;
             swap_heavy_partitions = 0;
             reset_decay(decay);
         } else {
-            swap_burst += static_cast<int>(swaps.size());
             swap_heavy_partitions++;
-            if (
-                config_.decay_reset_interval > 0
-                && swap_heavy_partitions >= config_.decay_reset_interval
-            ) {
-                reset_decay(decay);
-                swap_heavy_partitions = 0;
-            }
         }
 
         // Update F with newly eligible children
@@ -1586,6 +1504,15 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
                 if (parents_ok) {
                     if (layout_partitions_[child].is_single) {
                         resolved[child] = 1;
+                        if (route_trace) {
+                            RouteStep step;
+                            step.type = 2;
+                            step.partition_idx = child;
+                            if (!layout_partitions_[child].involved_qbits.empty()) {
+                                step.physical_qubit = pi[layout_partitions_[child].involved_qbits[0]];
+                            }
+                            route_trace->steps.push_back(std::move(step));
+                        }
                         std::vector<int> stack;
                         for (int gc : cg[child]) stack.push_back(gc);
                         
@@ -1601,6 +1528,15 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
                                 if (gc_parents_ok) {
                                     if (layout_partitions_[gc].is_single) {
                                         resolved[gc] = 1;
+                                        if (route_trace) {
+                                            RouteStep step;
+                                            step.type = 2;
+                                            step.partition_idx = gc;
+                                            if (!layout_partitions_[gc].involved_qbits.empty()) {
+                                                step.physical_qubit = pi[layout_partitions_[gc].involved_qbits[0]];
+                                            }
+                                            route_trace->steps.push_back(std::move(step));
+                                        }
                                         for (int ggc : cg[gc]) stack.push_back(ggc);
                                     } else {
                                         F.push_back(gc);
@@ -1621,6 +1557,27 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
     return {pi, total_cost};
 }
 
+ForwardRouteResult SabreRouter::route_forward(
+    const std::vector<int>& pi
+) const {
+    ForwardRouteResult result;
+    result.pi_initial = pi;
+    auto F_fwd = get_initial_layer();
+    auto routed = heuristic_search(
+        F_fwd,
+        pi,
+        false,
+        nullptr,
+        canonical_data_fwd_,
+        DAG_,
+        IDAG_,
+        &result
+    );
+    result.pi = std::move(routed.first);
+    return result;
+}
+
+
 // ---------------------------------------------------------------------------
 // run_trial (full implementation)
 // ---------------------------------------------------------------------------
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 57b36b773..31fc35902 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -136,26 +136,22 @@ def __init__(self, config):
         self.config.setdefault('topology', None)
         self.config.setdefault('routed', False)
         self.config.setdefault('partition_strategy','ilp')
-        self.config.setdefault('optimizer', 'BFGS2')
+        self.config.setdefault('optimizer', 'BFGS')
         self.config.setdefault('use_basin_hopping', 1)
         self.config.setdefault('bh_T', 1.0)
         self.config.setdefault('bh_stepsize', 0.5)
         self.config.setdefault('bh_interval', 50)
         self.config.setdefault('bh_target_accept_rate', 0.5)
         self.config.setdefault('bh_stepwise_factor', 0.9)
-        self.config.setdefault('use_osr', 1)
-        self.config.setdefault("use_graph_search", 1)
+        self.config.setdefault('use_osr', 0)
+        self.config.setdefault("use_graph_search", 0)
         self.config.setdefault('n_layout_trials', 1)
-        self.config.setdefault('score_tolerance', 0.05)
-        self.config.setdefault('trial_swap_cnot_cost', 3)
         self.config.setdefault('random_seed', 42)
         self.config.setdefault('cleanup', True)
         self.config.setdefault('prefilter_top_k', 50)
         self.config.setdefault('cleanup_top_k', 3)
-        self.config.setdefault('decay_delta', 0.1)
-        self.config.setdefault('decay_reset_interval', 5)
-        self.config.setdefault('release_valve_enabled', True)
-        self.config.setdefault('release_valve_threshold', 20)
+        self.config.setdefault('decay_delta', 0.001)  # Qiskit LightSABRE DECAY_RATE
+        self.config.setdefault('swap_burst_budget', 5)  # Qiskit LightSABRE DECAY_RESET_INTERVAL
         self.config.setdefault('path_tiebreak_weight', 0.2)
         # The neighbor heuristic is normalized to [0, 1] and added to A*'s f-value.
         # g-deltas are integer and h-deltas are half-integer, so preserving
@@ -167,9 +163,7 @@ def __init__(self, config):
                 self.config['path_tiebreak_weight'],
             )
             self.config['path_tiebreak_weight'] = 0.49
-        self.config.setdefault('future_cost_mode', 'canonical')
-        self.config.setdefault('future_candidate_top_k', 4)
-        self.config.setdefault('future_candidate_weight', 1.0)
+        self.config.setdefault('cnot_cost', 0.1 / 15.0)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -926,34 +920,16 @@ def _run_layout_trials_cpp(
         cfg.max_E_size = self.config.get('max_E_size', 20)
         cfg.max_lookahead = self.config.get('max_lookahead', 4)
         cfg.E_weight = self.config.get('E_weight', 0.5)
-        cfg.E_alpha = self.config.get('E_alpha', 0.9)
-        cfg.local_cost_weight = self.config.get('local_cost_weight', 0.1)
-        cfg.swap_cost = self.config.get('swap_cost', 15.0)
-        cfg.score_tolerance = self.config.get('score_tolerance', 0.05)
-        cfg.trial_swap_cnot_cost = self.config.get('trial_swap_cnot_cost', 3)
+        cfg.E_alpha = self.config.get('E_alpha', 1.0)
+        cfg.cnot_cost = self.config.get('cnot_cost', 0.1 / 15.0)
         cfg.sabre_iterations = n_iterations
         cfg.n_layout_trials = max(1, n_trials)
         cfg.random_seed = random_seed
-        cfg.decay_delta = self.config.get('decay_delta', 0.1)
-        cfg.decay_reset_interval = self.config.get('decay_reset_interval', 5)
-        cfg.release_valve_enabled = self.config.get(
-            'release_valve_enabled', True
-        )
-        cfg.release_valve_threshold = self.config.get(
-            'release_valve_threshold', 20
-        )
+        cfg.decay_delta = self.config.get('decay_delta', 0.001)
+        cfg.swap_burst_budget = self.config.get('swap_burst_budget', 5)
         cfg.path_tiebreak_weight = self.config.get(
             'path_tiebreak_weight', 0.2
         )
-        cfg.future_cost_mode = self.config.get(
-            'future_cost_mode', 'canonical'
-        )
-        cfg.future_candidate_top_k = self.config.get(
-            'future_candidate_top_k', 4
-        )
-        cfg.future_candidate_weight = self.config.get(
-            'future_candidate_weight', 1.0
-        )
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -986,23 +962,154 @@ def _run_layout_trials_cpp(
         )
 
         if not use_parallel:
-            return [
+            trial_results = [
                 router.run_trial(idx, seeded_pi_list, n_iterations, n_trials_actual)
                 for idx in trial_indices
             ]
+        else:
+            from concurrent.futures import ThreadPoolExecutor
+            workers = self.config.get("layout_trial_workers", 0)
+            if workers <= 0:
+                workers = min(n_trials_actual, _available_cpus())
+
+            with ThreadPoolExecutor(max_workers=workers) as pool:
+                futures = [
+                    pool.submit(router.run_trial, idx, seeded_pi_list, n_iterations, n_trials_actual)
+                    for idx in trial_indices
+                ]
+                trial_results = [f.result() for f in futures]
+
+        heuristic_ranked = sorted(trial_results, key=lambda x: x[0])
+        actual_rank_default = min(
+            max(1, self.config.get("cleanup_top_k", 3) * 2),
+            n_trials_actual,
+        )
+        actual_rank_top_k = self.config.get(
+            "actual_routing_rank_top_k", actual_rank_default
+        )
+        if actual_rank_top_k is None or actual_rank_top_k <= 0:
+            actual_rank_top_k = len(heuristic_ranked)
+        actual_rank_top_k = min(int(actual_rank_top_k), len(heuristic_ranked))
+
+        ranked = []
+        for heuristic_cost, trial_pi in heuristic_ranked[:actual_rank_top_k]:
+            actual_cnot, pi_out, pi_init, steps = router.route_forward(
+                [int(x) for x in trial_pi]
+            )
+            ranked.append((actual_cnot, pi_out, heuristic_cost, pi_init, steps))
+        ranked.sort(key=lambda x: (x[0], x[2]))
+        ranked.extend(
+            (float("inf"), pi, cost, None, None)
+            for cost, pi in heuristic_ranked[actual_rank_top_k:]
+        )
+        return ranked
+        
+    @staticmethod
+    def _snapshot_single_qubit_circuits(optimized_partitions):
+        return {
+            i: p.circuit.copy()
+            for i, p in enumerate(optimized_partitions)
+            if isinstance(p, SingleQubitPartitionResult)
+        }
 
-        from concurrent.futures import ThreadPoolExecutor
-        workers = self.config.get("layout_trial_workers", 0)
-        if workers <= 0:
-            workers = min(n_trials_actual, _available_cpus())
+    @staticmethod
+    def _restore_single_qubit_circuits(optimized_partitions, saved_circuits):
+        for idx, orig in saved_circuits.items():
+            optimized_partitions[idx].circuit = orig.copy()
+
+    def _partition_order_from_cpp_steps(
+        self, steps, optimized_partitions, candidate_cache, N
+    ):
+        partition_order = []
+        for step in steps:
+            kind = step[0]
+            if kind == "swap":
+                swaps = [(int(u), int(v)) for u, v in step[1]]
+                if swaps:
+                    partition_order.append(construct_swap_circuit(swaps, N))
+            elif kind == "partition":
+                partition_idx = int(step[1])
+                candidate_idx = int(step[2])
+                partition_order.append(
+                    candidate_cache[partition_idx][candidate_idx]
+                )
+            elif kind == "single":
+                partition_idx = int(step[1])
+                physical_qubit = int(step[2])
+                part = optimized_partitions[partition_idx]
+                circuit_qubit = int(part.circuit.get_Qbits()[0])
+                part.circuit = part.circuit.Remap_Qbits(
+                    {circuit_qubit: physical_qubit}, N
+                )
+                partition_order.append(part)
+        return partition_order
+
+
+    def _rank_layout_trials_by_actual_routing(
+        self,
+        trial_results,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        rank_top_k=None,
+    ):
+        """Reroute a bounded candidate set and rank it by actual CNOT count."""
+        if trial_results and len(trial_results[0]) >= 5:
+            return sorted(trial_results, key=lambda x: (x[0], x[2]))
+        heuristic_ranked = sorted(trial_results, key=lambda x: x[0])
+        if rank_top_k is None or rank_top_k <= 0:
+            rank_top_k = len(heuristic_ranked)
+        rank_top_k = min(int(rank_top_k), len(heuristic_ranked))
+        actual_candidates = heuristic_ranked[:rank_top_k]
+        heuristic_tail = heuristic_ranked[rank_top_k:]
+
+        saved_sq_circuits = self._snapshot_single_qubit_circuits(
+            optimized_partitions
+        )
+        ranked_results = []
+        old_progressbar = self.config.get("progressbar", 0)
+        self.config["progressbar"] = False
+        try:
+            for heuristic_cost, trial_pi in actual_candidates:
+                self._restore_single_qubit_circuits(
+                    optimized_partitions, saved_sq_circuits
+                )
+                F_trial = self.get_initial_layer(
+                    IDAG, len(trial_pi), optimized_partitions
+                )
+                partition_order, _, _ = self.Heuristic_Search(
+                    F_trial,
+                    np.asarray(trial_pi, dtype=np.int64).copy(),
+                    DAG,
+                    IDAG,
+                    optimized_partitions,
+                    scoring_partitions,
+                    D,
+                    candidate_cache=candidate_cache,
+                )
+                trial_circuit, _ = self.Construct_circuit_from_HS(
+                    partition_order, optimized_partitions, len(trial_pi)
+                )
+                actual_cnot = trial_circuit.get_Gate_Nums().get("CNOT", 0)
+                ranked_results.append((actual_cnot, trial_pi, heuristic_cost, None, None))
+        finally:
+            if old_progressbar is None:
+                self.config.pop("progressbar", None)
+            else:
+                self.config["progressbar"] = old_progressbar
+            self._restore_single_qubit_circuits(
+                optimized_partitions, saved_sq_circuits
+            )
+
+        ranked_results.sort(key=lambda x: (x[0], x[2]))
+        ranked_results.extend(
+            (float("inf"), pi, cost, None, None) for cost, pi in heuristic_tail
+        )
+        return ranked_results
 
-        with ThreadPoolExecutor(max_workers=workers) as pool:
-            futures = [
-                pool.submit(router.run_trial, idx, seeded_pi_list, n_iterations, n_trials_actual)
-                for idx in trial_indices
-            ]
-            return [f.result() for f in futures]
-        
     def Partition_Aware_Mapping(
         self, circ: Circuit, orig_parameters: np.ndarray
     ):
@@ -1065,7 +1172,23 @@ def Partition_Aware_Mapping(
                 n_trials=max(1, n_trials),
                 random_seed=random_seed,
             )
-            trial_results.sort(key=lambda x: x[0])
+            actual_rank_default = min(
+                max(1, self.config.get("cleanup_top_k", 3) * 2),
+                max(1, n_trials),
+            )
+            actual_rank_top_k = self.config.get(
+                "actual_routing_rank_top_k", actual_rank_default
+            )
+            trial_results = self._rank_layout_trials_by_actual_routing(
+                trial_results,
+                DAG,
+                IDAG,
+                optimized_partitions,
+                scoring_partitions,
+                D,
+                candidate_cache,
+                rank_top_k=actual_rank_top_k,
+            )
 
             if do_cleanup:
                 from squander.decomposition.qgd_Wide_Circuit_Optimization import (
@@ -1080,11 +1203,9 @@ def Partition_Aware_Mapping(
                 cleanup_config['global_min'] = True
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
-                saved_sq_circuits = {
-                    i: p.circuit.copy()
-                    for i, p in enumerate(optimized_partitions)
-                    if isinstance(p, SingleQubitPartitionResult)
-                }
+                saved_sq_circuits = self._snapshot_single_qubit_circuits(
+                    optimized_partitions
+                )
 
                 cleanup_top_k = self.config.get('cleanup_top_k', 3)
                 top_layouts = trial_results[:cleanup_top_k]
@@ -1097,23 +1218,30 @@ def Partition_Aware_Mapping(
                 best_pre_cleanup = None
                 cleanup_total = 0.0
 
-                for _, trial_pi in top_layouts:
-                    for idx, orig in saved_sq_circuits.items():
-                        optimized_partitions[idx].circuit = orig.copy()
-
-                    F_trial = self.get_initial_layer(
-                        IDAG, N, optimized_partitions
-                    )
-                    partition_order, pi_out, pi_init = self.Heuristic_Search(
-                        F_trial,
-                        trial_pi.copy(),
-                        DAG,
-                        IDAG,
-                        optimized_partitions,
-                        scoring_partitions,
-                        D,
-                        candidate_cache=candidate_cache,
+                for _, trial_pi, _, trace_pi_init, route_steps in top_layouts:
+                    self._restore_single_qubit_circuits(
+                        optimized_partitions, saved_sq_circuits
                     )
+                    if route_steps is not None:
+                        partition_order = self._partition_order_from_cpp_steps(
+                            route_steps, optimized_partitions, candidate_cache, N
+                        )
+                        pi_out = np.asarray(trial_pi, dtype=np.int64)
+                        pi_init = np.asarray(trace_pi_init, dtype=np.int64)
+                    else:
+                        F_trial = self.get_initial_layer(
+                            IDAG, N, optimized_partitions
+                        )
+                        partition_order, pi_out, pi_init = self.Heuristic_Search(
+                            F_trial,
+                            trial_pi.copy(),
+                            DAG,
+                            IDAG,
+                            optimized_partitions,
+                            scoring_partitions,
+                            D,
+                            candidate_cache=candidate_cache,
+                        )
 
                     trial_circuit, trial_params = self.Construct_circuit_from_HS(
                         partition_order, optimized_partitions, N
@@ -1146,19 +1274,32 @@ def Partition_Aware_Mapping(
                 pi = best_pi
 
             else:
-                _, best_pi = trial_results[0]
+                _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
 
-                F = self.get_initial_layer(IDAG, N, optimized_partitions)
-                partition_order, pi, pi_initial = self.Heuristic_Search(
-                    F,
-                    best_pi.copy(),
-                    DAG,
-                    IDAG,
-                    optimized_partitions,
-                    scoring_partitions,
-                    D,
-                    candidate_cache=candidate_cache,
-                )
+                if route_steps is not None:
+                    saved_sq_circuits = self._snapshot_single_qubit_circuits(
+                        optimized_partitions
+                    )
+                    self._restore_single_qubit_circuits(
+                        optimized_partitions, saved_sq_circuits
+                    )
+                    partition_order = self._partition_order_from_cpp_steps(
+                        route_steps, optimized_partitions, candidate_cache, N
+                    )
+                    pi = np.asarray(best_pi, dtype=np.int64)
+                    pi_initial = np.asarray(trace_pi_init, dtype=np.int64)
+                else:
+                    F = self.get_initial_layer(IDAG, N, optimized_partitions)
+                    partition_order, pi, pi_initial = self.Heuristic_Search(
+                        F,
+                        best_pi.copy(),
+                        DAG,
+                        IDAG,
+                        optimized_partitions,
+                        scoring_partitions,
+                        D,
+                        candidate_cache=candidate_cache,
+                    )
                 final_circuit, final_parameters = self.Construct_circuit_from_HS(
                     partition_order, optimized_partitions, N
                 )
@@ -1196,31 +1337,21 @@ def Partition_Aware_Mapping(
     # ------------------------------------------------------------------------
 
     def _select_best_candidate(self, partition_candidates, scores, rng=None):
-        """Select best candidate, with optional stochastic tie-breaking."""
+        """Select the lowest-scoring candidate deterministically."""
+        del rng
         scores_array = np.array(scores)
-        min_score = np.min(scores_array)
-        tolerance = self.config.get('score_tolerance', 0.05)
-
-        if rng is not None and min_score > 0:
-            threshold = min_score * (1 + tolerance)
-            close_indices = np.where(scores_array <= threshold)[0]
-            if len(close_indices) > 1:
-                return partition_candidates[rng.choice(close_indices)]
-            return partition_candidates[close_indices[0]]
-        else:
-            return partition_candidates[np.argmin(scores_array)]
+        return partition_candidates[np.argmin(scores_array)]
 
     def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=False):
         """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
         if len(partition_candidates) <= top_k:
             return partition_candidates
-        local_cost_weight = self.config.get('local_cost_weight', 0.1)
-        swap_cost = self.config.get('swap_cost', 15.0)
+        cnot_cost = self.config.get('cnot_cost', 0.1 / 15.0)
         estimates = np.array([
             self._routing_objective(
-                pc.estimate_swap_count(pi, D, reverse=reverse) * swap_cost,
+                pc.estimate_swap_count(pi, D, reverse=reverse),
                 pc.cnot_count,
-                local_cost_weight,
+                cnot_cost,
             )
             for pc in partition_candidates
         ])
@@ -1237,17 +1368,17 @@ def _decay_factor_for_swaps(swaps, decay):
     def _routing_objective(
         route_cost,
         cnot_count,
-        local_cost_weight,
+        cnot_cost,
         cnot_weight=1.0,
         decay_factor=1.0,
     ):
         return decay_factor * (
             float(route_cost)
-            + cnot_weight * local_cost_weight * float(cnot_count)
+            + cnot_weight * cnot_cost * float(cnot_count)
         )
 
     def _apply_decay_for_swaps(self, swaps, decay):
-        delta = self.config.get("decay_delta", 0.1)
+        delta = self.config.get("decay_delta", 0.001)
         if delta <= 0:
             return
         for u, v in swaps:
@@ -1345,111 +1476,39 @@ def _bfs_shortest_path(self, src, dst):
         return []
 
     @staticmethod
-    def _entry_variants(entry, future_cost_mode="canonical"):
-        variants = entry.get("variants")
-        if variants:
-            if future_cost_mode == "topk_min":
-                return variants
-            return variants[:1]
-        return (entry,)
-
-    @staticmethod
-    def _variant_routing_cost(variant, output_perm_arr, D_arr, swap_cost):
-        eu = variant["edges_u"]
+    def _entry_future_cost(entry, output_perm_arr, D_arr):
+        eu = entry.get("edges_u")
         if eu is None:
             return 0.0
         phys_u = output_perm_arr[eu]
-        phys_v = output_perm_arr[variant["edges_v"]]
-        return float(swap_cost * np.maximum(0, D_arr[phys_u, phys_v] - 1).sum())
+        phys_v = output_perm_arr[entry["edges_v"]]
+        return float(np.maximum(0, D_arr[phys_u, phys_v] - 1).sum())
 
-    @staticmethod
-    def _entry_future_cost(
-        entry,
-        output_perm_arr,
-        D_arr,
-        swap_cost,
-        local_cost_weight,
-        future_cost_mode="canonical",
-        future_candidate_weight=1.0,
-    ):
-        variants = qgd_Partition_Aware_Mapping._entry_variants(
-            entry, future_cost_mode=future_cost_mode
-        )
-        best = None
-        for variant in variants:
-            route_cost = qgd_Partition_Aware_Mapping._variant_routing_cost(
-                variant, output_perm_arr, D_arr, swap_cost
-            )
-            cnot_weight = (
-                future_candidate_weight
-                if future_cost_mode == "topk_min"
-                else 0.0
-            )
-            variant_cost = qgd_Partition_Aware_Mapping._routing_objective(
-                route_cost,
-                variant["cnot"],
-                local_cost_weight,
-                cnot_weight=cnot_weight,
-            )
-            if best is None or variant_cost < best:
-                best = variant_cost
-        return 0.0 if best is None else best
-
-    @staticmethod
-    def _best_release_variant(entry, pi_arr, D_arr, future_cost_mode="canonical"):
+    def _release_valve(self, F, pi, D, canonical_data):
+        pi_arr = np.asarray(pi, dtype=np.intp)
+        D_arr = np.asarray(D)
         best = None
-        for variant in qgd_Partition_Aware_Mapping._entry_variants(
-            entry, future_cost_mode=future_cost_mode
-        ):
-            eu = variant["edges_u"]
+        for p_idx in F:
+            entry = canonical_data.get(p_idx)
+            if entry is None:
+                continue
+            eu = entry.get("edges_u")
             if eu is None:
                 continue
-            ev = variant["edges_v"]
+            ev = entry["edges_v"]
             phys_u = pi_arr[eu]
             phys_v = pi_arr[ev]
             dists = D_arr[phys_u, phys_v]
             if dists.size == 0:
                 continue
-            route_sum = float(np.maximum(0, dists - 1).sum())
             worst_idx = int(np.argmax(dists))
             worst_d = float(dists[worst_idx])
-            worst_pair = (int(eu[worst_idx]), int(ev[worst_idx]))
-            if (
-                best is None
-                or route_sum < best[0]
-                or (route_sum == best[0] and worst_d < best[1])
-                or (
-                    route_sum == best[0]
-                    and worst_d == best[1]
-                    and worst_pair < best[2]
-                )
-            ):
-                best = (route_sum, worst_d, worst_pair)
-        return best
-
-    def _release_valve(self, F, pi, D, canonical_data, future_cost_mode="canonical"):
-        pi_arr = np.asarray(pi, dtype=np.intp)
-        D_arr = np.asarray(D)
-        best = None
-        for p_idx in F:
-            entry = canonical_data.get(p_idx)
-            if entry is None:
-                continue
-            best_variant = self._best_release_variant(
-                entry,
-                pi_arr,
-                D_arr,
-                future_cost_mode=future_cost_mode,
-            )
-            if best_variant is None:
-                continue
-            _, worst_d, worst_pair = best_variant
-            if worst_d <= 1 or worst_pair is None:
+            if worst_d <= 1:
                 continue
             if best is None or worst_d > best[0] or (
                 worst_d == best[0] and p_idx < best[1]
             ):
-                best = (worst_d, p_idx, worst_pair[0], worst_pair[1])
+                best = (worst_d, p_idx, int(eu[worst_idx]), int(ev[worst_idx]))
 
         if best is None:
             return [], list(pi)
@@ -1479,7 +1538,6 @@ def _build_neighbor_info(
         weight=0.2,
         W=0.5,
         alpha=0.9,
-        future_cost_mode="canonical",
     ):
         if canonical_data is None or weight <= 0:
             return None
@@ -1493,26 +1551,18 @@ def add_edges(target_idx, edge_weight):
             entry = canonical_data.get(target_idx)
             if entry is None:
                 return
-            variants = [
-                variant
-                for variant in qgd_Partition_Aware_Mapping._entry_variants(
-                    entry, future_cost_mode=future_cost_mode
-                )
-                if variant["edges_u"] is not None
-            ]
-            if not variants:
+            eu = entry.get("edges_u")
+            if eu is None:
                 return
-            variant_weight = edge_weight / len(variants)
-            for variant in variants:
-                for u, v in zip(variant["edges_u"], variant["edges_v"]):
-                    u = int(u)
-                    v = int(v)
-                    qubits.add(u)
-                    qubits.add(v)
-                    key = (u, v) if u <= v else (v, u)
-                    edge_weights[key] = (
-                        edge_weights.get(key, 0.0) + variant_weight
-                    )
+            for u, v in zip(eu, entry["edges_v"]):
+                u = int(u)
+                v = int(v)
+                qubits.add(u)
+                qubits.add(v)
+                key = (u, v) if u <= v else (v, u)
+                edge_weights[key] = (
+                    edge_weights.get(key, 0.0) + edge_weight
+                )
 
         for future_idx in F:
             add_edges(future_idx, 1.0)
@@ -1604,30 +1654,22 @@ def Heuristic_Search(
         max_E_size = self.config.get("max_E_size", 20)
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
-        E_alpha = self.config.get("E_alpha", 0.9)
-        future_cost_mode = self.config.get("future_cost_mode", "canonical")
-        future_candidate_weight = self.config.get(
-            "future_candidate_weight", 1.0
-        )
+        E_alpha = self.config.get("E_alpha", 1.0)
+        swap_burst_budget = self.config.get("swap_burst_budget", 5)
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
         decay = [1.0] * len(pi)
-        swap_burst = 0
         swap_heavy_partitions = 0
 
         while F:
             if (
-                self.config.get("release_valve_enabled", True)
-                and swap_burst > self.config.get("release_valve_threshold", 20)
+                swap_burst_budget > 0
+                and swap_heavy_partitions >= swap_burst_budget
             ):
                 valve_swaps, pi_bridged = self._release_valve(
-                    F,
-                    pi,
-                    D,
-                    canonical_data,
-                    future_cost_mode=future_cost_mode,
+                    F, pi, D, canonical_data
                 )
                 if valve_swaps:
                     partition_order.append(
@@ -1635,9 +1677,10 @@ def Heuristic_Search(
                     )
                     self._apply_decay_for_swaps(valve_swaps, decay)
                     pi = np.asarray(pi_bridged)
-                    swap_burst = 0
+                    swap_heavy_partitions = 0
                     continue
-                swap_burst = 0
+                self._reset_decay(decay)
+                swap_heavy_partitions = 0
 
             partition_candidates = self.obtain_partition_candidates(
             F,
@@ -1686,7 +1729,6 @@ def Heuristic_Search(
                         weight=self.config.get("path_tiebreak_weight", 0.2),
                         W=E_W,
                         alpha=E_alpha,
-                        future_cost_mode=future_cost_mode,
                     )
                     prev_partition_idx = cand.partition_idx
                 score, swaps, output_perm = self.score_partition_candidate(
@@ -1701,14 +1743,11 @@ def Heuristic_Search(
                     alpha=E_alpha,
                     canonical_data=canonical_data,
                     adj=self._adj,
-                    local_cost_weight=self.config.get("local_cost_weight", 0.1),
-                    swap_cost=self.config.get("swap_cost", 15.0),
+                    cnot_cost=self.config.get("cnot_cost", 0.1 / 15.0),
                     path_tiebreak_weight=self.config.get(
                         "path_tiebreak_weight", 0.2
                     ),
                     decay=decay,
-                    future_cost_mode=future_cost_mode,
-                    future_candidate_weight=future_candidate_weight,
                     cached_neighbor_info=cached_neighbor_info,
                     return_transforms=True,
                 )
@@ -1730,17 +1769,8 @@ def Heuristic_Search(
             if swap_order:
                 partition_order.append(construct_swap_circuit(swap_order, len(pi)))
                 self._apply_decay_for_swaps(swap_order, decay)
-                swap_burst += len(swap_order)
                 swap_heavy_partitions += 1
-                if (
-                    self.config.get("decay_reset_interval", 5) > 0
-                    and swap_heavy_partitions
-                    >= self.config.get("decay_reset_interval", 5)
-                ):
-                    self._reset_decay(decay)
-                    swap_heavy_partitions = 0
             else:
-                swap_burst = 0
                 swap_heavy_partitions = 0
                 self._reset_decay(decay)
 
@@ -1796,9 +1826,9 @@ def _heuristic_search_layout_only(
                     updates (used for backward passes in SABRE iterations).
 
         Returns:
-            (pi, total_cost): final layout and heuristic trial score.
-            Trial ranking uses the same immediate routing objective as the
-            online scorer: weighted SWAP pressure plus weighted local CNOT cost.
+            (pi, total_cost): final layout and layout-only heuristic score.
+            Trial ranking reroutes returned layouts and sorts by actual
+            constructed-circuit CNOT count; this score is only a tie-breaker.
         """
         F = list(F)
         resolved_partitions = [False] * len(DAG)
@@ -1826,46 +1856,36 @@ def _heuristic_search_layout_only(
         max_E_size = self.config.get("max_E_size", 20)
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
-        E_alpha = self.config.get("E_alpha", 0.9)
-        local_cost_weight = self.config.get("local_cost_weight", 0.1)
-        swap_cost = self.config.get("swap_cost", 15.0)
-        future_cost_mode = self.config.get("future_cost_mode", "canonical")
-        future_candidate_weight = self.config.get(
-            "future_candidate_weight", 1.0
-        )
+        E_alpha = self.config.get("E_alpha", 1.0)
+        cnot_cost = self.config.get("cnot_cost", 0.1 / 15.0)
+        swap_burst_budget = self.config.get("swap_burst_budget", 5)
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=reverse
         )
         decay = [1.0] * len(pi)
-        swap_burst = 0
         swap_heavy_partitions = 0
 
         while F:
             if (
-                self.config.get("release_valve_enabled", True)
-                and swap_burst > self.config.get("release_valve_threshold", 20)
+                swap_burst_budget > 0
+                and swap_heavy_partitions >= swap_burst_budget
             ):
-                valve_swaps, pi = self._release_valve(
-                    F,
-                    pi,
-                    D,
-                    canonical_data,
-                    future_cost_mode=future_cost_mode,
-                )
+                valve_swaps, pi = self._release_valve(F, pi, D, canonical_data)
                 if valve_swaps:
                     total_cost += self._routing_objective(
-                        swap_cost * len(valve_swaps),
+                        len(valve_swaps),
                         0,
-                        local_cost_weight,
+                        cnot_cost,
                         decay_factor=self._decay_factor_for_swaps(
                             valve_swaps, decay
                         ),
                     )
                     self._apply_decay_for_swaps(valve_swaps, decay)
-                    swap_burst = 0
+                    swap_heavy_partitions = 0
                     continue
-                swap_burst = 0
+                self._reset_decay(decay)
+                swap_heavy_partitions = 0
 
             partition_candidates = self.obtain_partition_candidates(
                 F,
@@ -1913,7 +1933,6 @@ def _heuristic_search_layout_only(
                         weight=self.config.get("path_tiebreak_weight", 0.2),
                         W=E_W,
                         alpha=E_alpha,
-                        future_cost_mode=future_cost_mode,
                     )
                     prev_partition_idx = cand.partition_idx
                 score, swaps, output_perm = self.score_partition_candidate(
@@ -1929,14 +1948,11 @@ def _heuristic_search_layout_only(
                     reverse=reverse,
                     canonical_data=canonical_data,
                     adj=self._adj,
-                    local_cost_weight=self.config.get("local_cost_weight", 0.1),
-                    swap_cost=self.config.get("swap_cost", 15.0),
+                    cnot_cost=cnot_cost,
                     path_tiebreak_weight=self.config.get(
                         "path_tiebreak_weight", 0.2
                     ),
                     decay=decay,
-                    future_cost_mode=future_cost_mode,
-                    future_candidate_weight=future_candidate_weight,
                     cached_neighbor_info=cached_neighbor_info,
                     return_transforms=True,
                 )
@@ -1956,24 +1972,15 @@ def _heuristic_search_layout_only(
             if swaps:
                 decay_factor = self._decay_factor_for_swaps(swaps, decay)
             total_cost += self._routing_objective(
-                swap_cost * len(swaps),
+                len(swaps),
                 best.cnot_count,
-                local_cost_weight,
+                cnot_cost,
                 decay_factor=decay_factor,
             )
             if swaps:
                 self._apply_decay_for_swaps(swaps, decay)
-                swap_burst += len(swaps)
                 swap_heavy_partitions += 1
-                if (
-                    self.config.get("decay_reset_interval", 5) > 0
-                    and swap_heavy_partitions
-                    >= self.config.get("decay_reset_interval", 5)
-                ):
-                    self._reset_decay(decay)
-                    swap_heavy_partitions = 0
             else:
-                swap_burst = 0
                 swap_heavy_partitions = 0
                 self._reset_decay(decay)
 
@@ -2038,18 +2045,12 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
     # ------------------------------------------------------------------------
 
     def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
-        """Build compact future-routing surrogates per partition.
+        """Build a compact future-routing surrogate per partition.
 
-        The first stored variant is the old "canonical" lowest-CNOT surrogate.
-        Additional variants keep distinct future edge patterns alive so the
-        router can score a future partition by its best still-available option.
+        For each partition, pick the edge pattern with the lowest CNOT count;
+        the router uses this as a canonical "best still-available option" when
+        scoring future partitions.
         """
-        future_cost_mode = self.config.get("future_cost_mode", "canonical")
-        top_k = (
-            max(1, int(self.config.get("future_candidate_top_k", 4)))
-            if future_cost_mode == "topk_min"
-            else 1
-        )
         data = {}
         for idx, partition in enumerate(scoring_partitions):
             if partition is None:
@@ -2081,42 +2082,30 @@ def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
                         variant_map[edge_key] = cnot
             if not variant_map:
                 continue
-            variants = []
-            for edge_key, cnot in sorted(
+            edge_key, cnot = min(
                 variant_map.items(),
                 key=lambda item: (item[1], len(item[0]), item[0]),
-            )[:top_k]:
-                if edge_key:
-                    eu = np.array([e[0] for e in edge_key], dtype=np.intp)
-                    ev = np.array([e[1] for e in edge_key], dtype=np.intp)
-                else:
-                    eu = ev = None
-                variants.append(
-                    {"edges_u": eu, "edges_v": ev, "cnot": cnot}
-                )
-            primary = variants[0]
-            data[idx] = {
-                "edges_u": primary["edges_u"],
-                "edges_v": primary["edges_v"],
-                "cnot": primary["cnot"],
-                "variants": tuple(variants),
-            }
+            )
+            if edge_key:
+                eu = np.array([e[0] for e in edge_key], dtype=np.intp)
+                ev = np.array([e[1] for e in edge_key], dtype=np.intp)
+            else:
+                eu = ev = None
+            data[idx] = {"edges_u": eu, "edges_v": ev, "cnot": cnot}
         return data
 
     @staticmethod
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
                                   canonical_data=None, adj=None,
-                                  local_cost_weight=0.1, swap_cost=15.0,
+                                  cnot_cost=0.1 / 15.0,
                                   path_tiebreak_weight=0.2, decay=None,
-                                  future_cost_mode="canonical",
-                                  future_candidate_weight=1.0,
                                   cached_neighbor_info=None,
                                   return_transforms=False):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
-        H = swap_cost * |swaps|
-          + local_cost_weight * cand.cnot_count
+        H = |swaps|
+          + cnot_cost * cand.cnot_count
           + (1/|F'|) * average routing cost over F \\ {cand}
           + (W/|E|)  * alpha^d-decayed routing cost over E
         """
@@ -2132,7 +2121,6 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 weight=path_tiebreak_weight,
                 W=W,
                 alpha=alpha,
-                future_cost_mode=future_cost_mode,
             )
         swaps, output_perm = partition_candidate.transform_pi(
             pi,
@@ -2148,9 +2136,9 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 swaps, decay
             )
         score = qgd_Partition_Aware_Mapping._routing_objective(
-            swap_cost * len(swaps),
+            len(swaps),
             partition_candidate.cnot_count,
-            local_cost_weight,
+            cnot_cost,
             decay_factor=decay_factor,
         )
 
@@ -2174,13 +2162,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 continue
             n_other += 1
             f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
-                entry,
-                output_perm_arr,
-                D_arr,
-                swap_cost,
-                local_cost_weight,
-                future_cost_mode=future_cost_mode,
-                future_candidate_weight=future_candidate_weight,
+                entry, output_perm_arr, D_arr
             )
         if n_other > 0:
             score += f_sum / n_other
@@ -2195,13 +2177,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 if entry is None:
                     continue
                 d_cost = qgd_Partition_Aware_Mapping._entry_future_cost(
-                    entry,
-                    output_perm_arr,
-                    D_arr,
-                    swap_cost,
-                    local_cost_weight,
-                    future_cost_mode=future_cost_mode,
-                    future_candidate_weight=future_candidate_weight,
+                    entry, output_perm_arr, D_arr
                 )
                 e_sum += (alpha ** depth) * d_cost
             score += W * e_sum / len(E)
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 63e785002..070a3dc5a 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -75,24 +75,11 @@ static std::vector<int> extract_int_array(py::handle obj) {
     return result;
 }
 
-static CanonicalEntry::FutureVariant extract_future_variant(py::dict d) {
-    CanonicalEntry::FutureVariant variant;
-    if (d.contains("edges_u") && !d["edges_u"].is_none()) {
-        variant.edges_u = extract_int_array(d["edges_u"]);
-    }
-    if (d.contains("edges_v") && !d["edges_v"].is_none()) {
-        variant.edges_v = extract_int_array(d["edges_v"]);
-    }
-    variant.cnot = d["cnot"].cast<int>();
-    return variant;
-}
-
 static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict cd) {
     std::unordered_map<int, CanonicalEntry> result;
     for (auto [key, val] : cd) {
         int pidx = key.cast<int>();
         CanonicalEntry entry;
-        // val is a dict with 'edges_u', 'edges_v', 'cnot'
         py::dict d = py::reinterpret_borrow<py::dict>(val);
         if (d.contains("edges_u") && !d["edges_u"].is_none()) {
             entry.edges_u = extract_int_array(d["edges_u"]);
@@ -101,22 +88,6 @@ static std::unordered_map<int, CanonicalEntry> extract_canonical_data(py::dict c
             entry.edges_v = extract_int_array(d["edges_v"]);
         }
         entry.cnot = d["cnot"].cast<int>();
-
-        if (d.contains("variants") && !d["variants"].is_none()) {
-            py::iterable variants = py::reinterpret_borrow<py::iterable>(d["variants"]);
-            for (auto item : variants) {
-                entry.variants.push_back(
-                    extract_future_variant(py::reinterpret_borrow<py::dict>(item))
-                );
-            }
-        }
-        if (entry.variants.empty()) {
-            CanonicalEntry::FutureVariant primary;
-            primary.edges_u = entry.edges_u;
-            primary.edges_v = entry.edges_v;
-            primary.cnot = entry.cnot;
-            entry.variants.push_back(std::move(primary));
-        }
         result[pidx] = std::move(entry);
     }
     return result;
@@ -154,21 +125,13 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("max_lookahead", &SabreConfig::max_lookahead)
         .def_readwrite("E_weight", &SabreConfig::E_weight)
         .def_readwrite("E_alpha", &SabreConfig::E_alpha)
-        .def_readwrite("local_cost_weight", &SabreConfig::local_cost_weight)
-        .def_readwrite("swap_cost", &SabreConfig::swap_cost)
-        .def_readwrite("score_tolerance", &SabreConfig::score_tolerance)
-        .def_readwrite("trial_swap_cnot_cost", &SabreConfig::trial_swap_cnot_cost)
+        .def_readwrite("cnot_cost", &SabreConfig::cnot_cost)
         .def_readwrite("sabre_iterations", &SabreConfig::sabre_iterations)
         .def_readwrite("n_layout_trials", &SabreConfig::n_layout_trials)
         .def_readwrite("random_seed", &SabreConfig::random_seed)
         .def_readwrite("decay_delta", &SabreConfig::decay_delta)
-        .def_readwrite("decay_reset_interval", &SabreConfig::decay_reset_interval)
-        .def_readwrite("release_valve_enabled", &SabreConfig::release_valve_enabled)
-        .def_readwrite("release_valve_threshold", &SabreConfig::release_valve_threshold)
-        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
-        .def_readwrite("future_cost_mode", &SabreConfig::future_cost_mode)
-        .def_readwrite("future_candidate_top_k", &SabreConfig::future_candidate_top_k)
-        .def_readwrite("future_candidate_weight", &SabreConfig::future_candidate_weight);
+        .def_readwrite("swap_burst_budget", &SabreConfig::swap_burst_budget)
+        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")
@@ -201,7 +164,9 @@ PYBIND11_MODULE(_sabre_router, m) {
                     py::list cl = py::reinterpret_borrow<py::list>(part_cands);
                     cands.reserve(py::len(cl));
                     for (auto c : cl) {
-                        cands.push_back(extract_candidate(c));
+                        auto cd = extract_candidate(c);
+                        cd.candidate_idx = static_cast<int>(cands.size());
+                        cands.push_back(std::move(cd));
                     }
                     cc.push_back(std::move(cands));
                 }
@@ -225,6 +190,28 @@ PYBIND11_MODULE(_sabre_router, m) {
             py::arg("canonical_data_fwd"),
             py::arg("canonical_data_rev")
         )
+        .def("route_forward",
+            [](const SabreRouter& self,
+               const std::vector<int>& pi
+            ) -> py::tuple {
+                py::gil_scoped_release release;
+                auto result = self.route_forward(pi);
+                py::gil_scoped_acquire acquire;
+                py::list steps;
+                for (const auto& step : result.steps) {
+                    if (step.type == 0) {
+                        steps.append(py::make_tuple("swap", step.swaps));
+                    } else if (step.type == 1) {
+                        steps.append(py::make_tuple("partition", step.partition_idx, step.candidate_idx));
+                    } else {
+                        steps.append(py::make_tuple("single", step.partition_idx, step.physical_qubit));
+                    }
+                }
+                return py::make_tuple(result.cnot_count, result.pi, result.pi_initial, steps);
+            },
+            py::arg("pi"),
+            "Run actual forward routing and return CNOT count, final pi, initial pi, and route steps"
+        )
         .def("run_trial",
             [](const SabreRouter& self,
                int trial_idx,

From 2a7f8f4c8167e1d98ffa9da2b43969f622c71076 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 24 Apr 2026 21:55:24 +0200
Subject: [PATCH 155/232] Fix

---
 .../sabre_router/include/sabre_router.hpp      |  2 +-
 squander/synthesis/PartAM.py                   | 18 ++++++------------
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index eb4e577ad..a4d4ba216 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -82,7 +82,7 @@ struct SabreConfig {
     int max_lookahead = 4;
     double E_weight = 0.5;
     double E_alpha = 1.0; // LightSABRE uses no per-depth decay; set <1 for SQUANDER-style decay
-    double cnot_cost = 0.1 / 15.0; // weight on candidate.cnot_count; swap cost is fixed at 1.0
+    double cnot_cost = 1.0 / 3.0; // weight on candidate.cnot_count; swap cost is fixed at 1.0 (1 SWAP = 3 CNOTs)
     int sabre_iterations = 1;
     int n_layout_trials = 1;
     int random_seed = 42;
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 31fc35902..a58242e72 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -137,12 +137,6 @@ def __init__(self, config):
         self.config.setdefault('routed', False)
         self.config.setdefault('partition_strategy','ilp')
         self.config.setdefault('optimizer', 'BFGS')
-        self.config.setdefault('use_basin_hopping', 1)
-        self.config.setdefault('bh_T', 1.0)
-        self.config.setdefault('bh_stepsize', 0.5)
-        self.config.setdefault('bh_interval', 50)
-        self.config.setdefault('bh_target_accept_rate', 0.5)
-        self.config.setdefault('bh_stepwise_factor', 0.9)
         self.config.setdefault('use_osr', 0)
         self.config.setdefault("use_graph_search", 0)
         self.config.setdefault('n_layout_trials', 1)
@@ -163,7 +157,7 @@ def __init__(self, config):
                 self.config['path_tiebreak_weight'],
             )
             self.config['path_tiebreak_weight'] = 0.49
-        self.config.setdefault('cnot_cost', 0.1 / 15.0)
+        self.config.setdefault('cnot_cost', 1.0 / 3.0)  # 1 SWAP = 3 CNOTs
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -921,7 +915,7 @@ def _run_layout_trials_cpp(
         cfg.max_lookahead = self.config.get('max_lookahead', 4)
         cfg.E_weight = self.config.get('E_weight', 0.5)
         cfg.E_alpha = self.config.get('E_alpha', 1.0)
-        cfg.cnot_cost = self.config.get('cnot_cost', 0.1 / 15.0)
+        cfg.cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
         cfg.sabre_iterations = n_iterations
         cfg.n_layout_trials = max(1, n_trials)
         cfg.random_seed = random_seed
@@ -1346,7 +1340,7 @@ def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=Fals
         """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
         if len(partition_candidates) <= top_k:
             return partition_candidates
-        cnot_cost = self.config.get('cnot_cost', 0.1 / 15.0)
+        cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
         estimates = np.array([
             self._routing_objective(
                 pc.estimate_swap_count(pi, D, reverse=reverse),
@@ -1743,7 +1737,7 @@ def Heuristic_Search(
                     alpha=E_alpha,
                     canonical_data=canonical_data,
                     adj=self._adj,
-                    cnot_cost=self.config.get("cnot_cost", 0.1 / 15.0),
+                    cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0),
                     path_tiebreak_weight=self.config.get(
                         "path_tiebreak_weight", 0.2
                     ),
@@ -1857,7 +1851,7 @@ def _heuristic_search_layout_only(
         max_lookahead = self.config.get("max_lookahead", 4)
         E_W = self.config.get("E_weight", 0.5)
         E_alpha = self.config.get("E_alpha", 1.0)
-        cnot_cost = self.config.get("cnot_cost", 0.1 / 15.0)
+        cnot_cost = self.config.get("cnot_cost", 1.0 / 3.0)
         swap_burst_budget = self.config.get("swap_burst_budget", 5)
 
         canonical_data = self._build_canonical_neighbor_data(
@@ -2098,7 +2092,7 @@ def _build_canonical_neighbor_data(self, scoring_partitions, reverse=False):
     def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                                   E=None, W=0.5, alpha=0.9, reverse=False,
                                   canonical_data=None, adj=None,
-                                  cnot_cost=0.1 / 15.0,
+                                  cnot_cost=1.0 / 3.0,
                                   path_tiebreak_weight=0.2, decay=None,
                                   cached_neighbor_info=None,
                                   return_transforms=False):

From 21b161760ad87fbbc0bd8228ff53907fcc68a1ad Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 25 Apr 2026 09:30:50 +0200
Subject: [PATCH 156/232] improve partition scoring

---
 .../sabre_router/include/sabre_router.hpp     |  27 ++
 .../src-cpp/sabre_router/sabre_router.cpp     | 197 ++++++++---
 squander/synthesis/PartAM.py                  | 321 +++++++++++++-----
 3 files changed, 411 insertions(+), 134 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index a4d4ba216..a91778482 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -348,6 +348,8 @@ class SabreRouter {
         const std::vector<const CandidateData*>& candidates,
         const std::vector<int>& pi,
         int top_k,
+        const std::vector<int>& F_snapshot,
+        const std::vector<std::pair<int,int>>& E,
         bool reverse
     ) const;
 
@@ -418,6 +420,31 @@ class SabreRouter {
         const std::vector<int>& pi
     ) const;
 
+    double partition_compactness_cost(
+        int partition_idx,
+        const std::vector<int>& pi
+    ) const;
+
+    double partition_future_lower_bound(
+        int partition_idx,
+        const std::vector<int>& pi,
+        bool reverse
+    ) const;
+
+    double future_context_cost(
+        int exclude_partition_idx,
+        const std::vector<int>& pi,
+        const std::vector<int>& F_snapshot,
+        const std::vector<std::pair<int,int>>& E,
+        bool reverse
+    ) const;
+
+    std::vector<int> estimate_candidate_output_layout(
+        const CandidateData& cand,
+        const std::vector<int>& pi,
+        bool reverse
+    ) const;
+
     // Immutable data members
     SabreConfig config_;
     int N_; // number of physical qubits
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 25bee094d..a9fe1b788 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -312,6 +312,7 @@ NeighborInfo SabreRouter::build_neighbor_info(
     const std::vector<int>& pi,
     const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
+    (void)canonical_data;
     NeighborInfo info;
     info.weight = config_.path_tiebreak_weight;
     if (info.weight <= 0.0) {
@@ -364,12 +365,16 @@ NeighborInfo SabreRouter::build_neighbor_info(
 
     auto add_partition_edges = [&](int partition_idx, double weight) {
         if (partition_idx == exclude_partition_idx || weight <= 0.0) return;
-        auto it = canonical_data.find(partition_idx);
-        if (it == canonical_data.end()) return;
-        const auto& entry = it->second;
-        if (entry.edges_u.empty()) return;
-        for (size_t i = 0; i < entry.edges_u.size(); i++) {
-            add_edge(entry.edges_u[i], entry.edges_v[i], weight);
+        if (
+            partition_idx < 0
+            || partition_idx >= static_cast<int>(layout_partitions_.size())
+        ) return;
+        const auto& involved = layout_partitions_[partition_idx].involved_qbits;
+        if (involved.size() < 2) return;
+        for (size_t i = 0; i < involved.size(); i++) {
+            for (size_t j = i + 1; j < involved.size(); j++) {
+                add_edge(involved[i], involved[j], weight);
+            }
         }
     };
 
@@ -1063,6 +1068,119 @@ double SabreRouter::entry_future_cost(
     return total;
 }
 
+double SabreRouter::partition_compactness_cost(
+    int partition_idx,
+    const std::vector<int>& pi
+) const {
+    if (
+        partition_idx < 0
+        || partition_idx >= static_cast<int>(layout_partitions_.size())
+    ) {
+        return 0.0;
+    }
+
+    const auto& involved = layout_partitions_[partition_idx].involved_qbits;
+    if (involved.size() < 2) {
+        return 0.0;
+    }
+
+    double best = std::numeric_limits<double>::infinity();
+    for (int q : involved) {
+        double term = 0.0;
+        for (int p : involved) {
+            if (p == q) continue;
+            term += dist(pi[q], pi[p]);
+        }
+        best = std::min(best, term);
+    }
+    return std::isfinite(best) ? best : 0.0;
+}
+
+double SabreRouter::partition_future_lower_bound(
+    int partition_idx,
+    const std::vector<int>& pi,
+    bool reverse
+) const {
+    if (
+        partition_idx < 0
+        || partition_idx >= static_cast<int>(candidate_cache_.size())
+    ) {
+        return 0.0;
+    }
+
+    const auto& candidates = candidate_cache_[partition_idx];
+    if (candidates.empty()) {
+        return partition_compactness_cost(partition_idx, pi);
+    }
+
+    double best = std::numeric_limits<double>::infinity();
+    for (const auto& cand : candidates) {
+        const double cost = routing_objective(
+            static_cast<double>(estimate_swap_count(cand, pi, reverse)),
+            cand.cnot_count
+        );
+        best = std::min(best, cost);
+    }
+    return std::isfinite(best)
+        ? best
+        : partition_compactness_cost(partition_idx, pi);
+}
+
+double SabreRouter::future_context_cost(
+    int exclude_partition_idx,
+    const std::vector<int>& pi,
+    const std::vector<int>& F_snapshot,
+    const std::vector<std::pair<int,int>>& E,
+    bool reverse
+) const {
+    double f_sum = 0.0;
+    int n_other = 0;
+    for (int p_idx : F_snapshot) {
+        if (p_idx == exclude_partition_idx) continue;
+        f_sum += partition_future_lower_bound(p_idx, pi, reverse);
+        n_other++;
+    }
+
+    double score = n_other > 0
+        ? f_sum / static_cast<double>(n_other)
+        : 0.0;
+
+    if (!E.empty()) {
+        double e_sum = 0.0;
+        for (auto [p_idx, depth] : E) {
+            if (p_idx == exclude_partition_idx) continue;
+            const double alpha =
+                (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
+                    ? alpha_weights_[depth]
+                    : std::pow(config_.E_alpha, depth);
+            e_sum += alpha * partition_future_lower_bound(
+                p_idx, pi, reverse);
+        }
+        score += config_.E_weight * e_sum / static_cast<double>(E.size());
+    }
+
+    return score;
+}
+
+std::vector<int> SabreRouter::estimate_candidate_output_layout(
+    const CandidateData& cand,
+    const std::vector<int>& pi,
+    bool reverse
+) const {
+    const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
+    std::vector<int> pi_output = pi;
+
+    for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
+        if (q_star < cand.qstar_to_q.size()) {
+            int k = cand.qstar_to_q[q_star];
+            if (k < 0) continue;
+            pi_output[k] = cand.node_mapping_flat[P_exit[q_star]];
+        }
+    }
+
+    return pi_output;
+}
+
 double SabreRouter::compute_routing_cost(
     const std::vector<int>& pi,
     int exclude_partition_idx,
@@ -1147,50 +1265,16 @@ double SabreRouter::score_candidate(
         decay_factor
     );
 
-    // F cost: average routing cost over F \ {cand}
     const int cand_idx = cand.partition_idx;
-    int n_other = 0;
-    double f_sum = 0.0;
-    if (resolved_F) {
-        for (const auto& re : *resolved_F) {
-            if (re.partition_idx == cand_idx) continue;
-            if (!re.entry) continue;
-            n_other++;
-            f_sum += entry_future_cost(*re.entry, output_perm);
-        }
-    } else {
-        for (int p_idx : F_snapshot) {
-            if (p_idx == cand_idx) continue;
-            auto it = canonical_data.find(p_idx);
-            if (it == canonical_data.end()) continue;
-            n_other++;
-            f_sum += entry_future_cost(it->second, output_perm);
-        }
-    }
-    if (n_other > 0) score += f_sum / static_cast<double>(n_other);
-
-    // E cost: alpha^depth-decayed lookahead
-    if (!E.empty()) {
-        double e_sum = 0.0;
-        if (resolved_E) {
-            for (const auto& re : *resolved_E) {
-                if (re.partition_idx == cand_idx) continue;
-                if (!re.entry) continue;
-                e_sum += re.alpha * entry_future_cost(*re.entry, output_perm);
-            }
-        } else {
-            for (auto [p_idx, depth] : E) {
-                if (p_idx == cand_idx) continue;
-                const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
-                    ? alpha_weights_[depth]
-                    : std::pow(config_.E_alpha, depth);
-                auto it = canonical_data.find(p_idx);
-                if (it == canonical_data.end()) continue;
-                e_sum += alpha * entry_future_cost(it->second, output_perm);
-            }
-        }
-        score += config_.E_weight * e_sum / static_cast<double>(E.size());
-    }
+    score += future_context_cost(
+        cand_idx,
+        output_perm,
+        F_snapshot,
+        E,
+        reverse
+    );
+    (void)resolved_F;
+    (void)resolved_E;
 
     if (out_swaps) *out_swaps = std::move(swaps);
     if (out_pi_new) *out_pi_new = std::move(output_perm);
@@ -1222,6 +1306,8 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     const std::vector<const CandidateData*>& candidates,
     const std::vector<int>& pi,
     int top_k,
+    const std::vector<int>& F_snapshot,
+    const std::vector<std::pair<int,int>>& E,
     bool reverse
 ) const {
     if (static_cast<int>(candidates.size()) <= top_k) return candidates;
@@ -1231,10 +1317,13 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     std::vector<Pair> estimated;
     estimated.reserve(candidates.size());
     for (const auto* cand : candidates) {
+        const auto approx_output = estimate_candidate_output_layout(
+            *cand, pi, reverse);
         const double est = routing_objective(
             static_cast<double>(estimate_swap_count(*cand, pi, reverse)),
             cand->cnot_count
-        );
+        ) + future_context_cost(
+            cand->partition_idx, approx_output, F_snapshot, E, reverse);
         estimated.push_back({est, cand});
     }
 
@@ -1387,13 +1476,13 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
         auto all_candidates = obtain_partition_candidates(F);
         if (all_candidates.empty()) break;
 
-        // Prefilter
-        auto candidates = prefilter_candidates(
-            all_candidates, pi, config_.prefilter_top_k, reverse);
-
         // Generate extended set
         auto E = generate_extended_set(F, resolved, cg, pg);
 
+        // Prefilter with a cheap estimate of the candidate's future context.
+        auto candidates = prefilter_candidates(
+            all_candidates, pi, config_.prefilter_top_k, F, E, reverse);
+
         // Pre-resolve canonical entries for F and E once per F-step
         std::vector<ResolvedEntry> resolved_F;
         resolved_F.reserve(F.size());
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index a58242e72..3b700575d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1336,16 +1336,46 @@ def _select_best_candidate(self, partition_candidates, scores, rng=None):
         scores_array = np.array(scores)
         return partition_candidates[np.argmin(scores_array)]
 
-    def _prefilter_candidates(self, partition_candidates, pi, D, top_k, reverse=False):
+    def _prefilter_candidates(
+        self,
+        partition_candidates,
+        pi,
+        D,
+        top_k,
+        F=None,
+        E=None,
+        candidate_cache=None,
+        layout_partitions=None,
+        reverse=False,
+        W=0.5,
+        alpha=1.0,
+    ):
         """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
         if len(partition_candidates) <= top_k:
             return partition_candidates
         cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
         estimates = np.array([
-            self._routing_objective(
-                pc.estimate_swap_count(pi, D, reverse=reverse),
-                pc.cnot_count,
-                cnot_cost,
+            (
+                self._routing_objective(
+                    pc.estimate_swap_count(pi, D, reverse=reverse),
+                    pc.cnot_count,
+                    cnot_cost,
+                )
+                + self._future_context_cost(
+                    pc.partition_idx,
+                    self._estimate_candidate_output_layout(
+                        pc, pi, reverse=reverse
+                    ),
+                    F or (),
+                    E or (),
+                    D,
+                    candidate_cache,
+                    reverse=reverse,
+                    cnot_cost=cnot_cost,
+                    W=W,
+                    alpha=alpha,
+                    layout_partitions=layout_partitions,
+                )
             )
             for pc in partition_candidates
         ])
@@ -1478,6 +1508,128 @@ def _entry_future_cost(entry, output_perm_arr, D_arr):
         phys_v = output_perm_arr[entry["edges_v"]]
         return float(np.maximum(0, D_arr[phys_u, phys_v] - 1).sum())
 
+    @staticmethod
+    def _partition_compactness_cost(partition_idx, pi, layout_partitions, D):
+        if (
+            layout_partitions is None
+            or partition_idx < 0
+            or partition_idx >= len(layout_partitions)
+        ):
+            return 0.0
+        involved = qgd_Partition_Aware_Mapping._partition_involved_qbits(
+            layout_partitions[partition_idx]
+        )
+        if len(involved) < 2:
+            return 0.0
+
+        pi_arr = np.asarray(pi, dtype=np.intp)
+        D_arr = np.asarray(D)
+        best = float("inf")
+        for q in involved:
+            term = 0.0
+            for p in involved:
+                if p != q:
+                    term += float(D_arr[pi_arr[q], pi_arr[p]])
+            best = min(best, term)
+        return 0.0 if not np.isfinite(best) else best
+
+    @staticmethod
+    def _partition_future_lower_bound(
+        partition_idx,
+        pi,
+        D,
+        candidate_cache,
+        reverse=False,
+        cnot_cost=1.0 / 3.0,
+        layout_partitions=None,
+    ):
+        if (
+            candidate_cache is None
+            or partition_idx < 0
+            or partition_idx >= len(candidate_cache)
+            or not candidate_cache[partition_idx]
+        ):
+            return qgd_Partition_Aware_Mapping._partition_compactness_cost(
+                partition_idx, pi, layout_partitions, D
+            )
+
+        best = float("inf")
+        for cand in candidate_cache[partition_idx]:
+            cost = qgd_Partition_Aware_Mapping._routing_objective(
+                cand.estimate_swap_count(pi, D, reverse=reverse),
+                cand.cnot_count,
+                cnot_cost,
+            )
+            best = min(best, cost)
+        if np.isfinite(best):
+            return best
+        return qgd_Partition_Aware_Mapping._partition_compactness_cost(
+            partition_idx, pi, layout_partitions, D
+        )
+
+    @staticmethod
+    def _estimate_candidate_output_layout(partition_candidate, pi, reverse=False):
+        P_exit = partition_candidate.P_i if reverse else partition_candidate.P_o
+        pi_output = [int(x) for x in pi]
+        qbit_map_inverse = {
+            v: k for k, v in partition_candidate.qbit_map.items()
+        }
+        for q_star in range(len(P_exit)):
+            if q_star in qbit_map_inverse:
+                k = qbit_map_inverse[q_star]
+                pi_output[k] = partition_candidate.node_mapping[P_exit[q_star]]
+        return pi_output
+
+    @staticmethod
+    def _future_context_cost(
+        exclude_partition_idx,
+        pi,
+        F,
+        E,
+        D,
+        candidate_cache,
+        reverse=False,
+        cnot_cost=1.0 / 3.0,
+        W=0.5,
+        alpha=1.0,
+        layout_partitions=None,
+    ):
+        f_sum = 0.0
+        n_other = 0
+        for p_idx in F:
+            if p_idx == exclude_partition_idx:
+                continue
+            f_sum += qgd_Partition_Aware_Mapping._partition_future_lower_bound(
+                p_idx,
+                pi,
+                D,
+                candidate_cache,
+                reverse=reverse,
+                cnot_cost=cnot_cost,
+                layout_partitions=layout_partitions,
+            )
+            n_other += 1
+        score = f_sum / n_other if n_other > 0 else 0.0
+
+        if E:
+            e_sum = 0.0
+            for p_idx, depth in E:
+                if p_idx == exclude_partition_idx:
+                    continue
+                e_sum += (
+                    alpha ** depth
+                ) * qgd_Partition_Aware_Mapping._partition_future_lower_bound(
+                    p_idx,
+                    pi,
+                    D,
+                    candidate_cache,
+                    reverse=reverse,
+                    cnot_cost=cnot_cost,
+                    layout_partitions=layout_partitions,
+                )
+            score += W * e_sum / len(E)
+        return score
+
     def _release_valve(self, F, pi, D, canonical_data):
         pi_arr = np.asarray(pi, dtype=np.intp)
         D_arr = np.asarray(D)
@@ -1532,8 +1684,10 @@ def _build_neighbor_info(
         weight=0.2,
         W=0.5,
         alpha=0.9,
+        layout_partitions=None,
     ):
-        if canonical_data is None or weight <= 0:
+        del canonical_data
+        if weight <= 0 or layout_partitions is None:
             return None
 
         edge_weights = {}
@@ -1542,21 +1696,21 @@ def _build_neighbor_info(
         def add_edges(target_idx, edge_weight):
             if target_idx == partition_idx or edge_weight <= 0:
                 return
-            entry = canonical_data.get(target_idx)
-            if entry is None:
+            if target_idx >= len(layout_partitions):
                 return
-            eu = entry.get("edges_u")
-            if eu is None:
-                return
-            for u, v in zip(eu, entry["edges_v"]):
-                u = int(u)
-                v = int(v)
-                qubits.add(u)
-                qubits.add(v)
-                key = (u, v) if u <= v else (v, u)
-                edge_weights[key] = (
-                    edge_weights.get(key, 0.0) + edge_weight
-                )
+            involved = qgd_Partition_Aware_Mapping._partition_involved_qbits(
+                layout_partitions[target_idx]
+            )
+            for i, u in enumerate(involved):
+                for v in involved[i + 1:]:
+                    u = int(u)
+                    v = int(v)
+                    qubits.add(u)
+                    qubits.add(v)
+                    key = (u, v) if u <= v else (v, u)
+                    edge_weights[key] = (
+                        edge_weights.get(key, 0.0) + edge_weight
+                    )
 
         for future_idx in F:
             add_edges(future_idx, 1.0)
@@ -1676,20 +1830,6 @@ def Heuristic_Search(
                 self._reset_decay(decay)
                 swap_heavy_partitions = 0
 
-            partition_candidates = self.obtain_partition_candidates(
-            F,
-            optimized_partitions,
-            candidate_cache=candidate_cache,
-            )
-
-            if not partition_candidates:
-                break
-
-            top_k = self.config.get("prefilter_top_k", 50)
-            partition_candidates = self._prefilter_candidates(
-                partition_candidates, pi, D, top_k
-            )
-
             F_snapshot = tuple(F)
             E = self.generate_extended_set(
                 F,
@@ -1701,6 +1841,28 @@ def Heuristic_Search(
                 max_lookahead=max_lookahead,
             )
 
+            partition_candidates = self.obtain_partition_candidates(
+                F,
+                optimized_partitions,
+                candidate_cache=candidate_cache,
+            )
+            if not partition_candidates:
+                break
+
+            top_k = self.config.get("prefilter_top_k", 50)
+            partition_candidates = self._prefilter_candidates(
+                partition_candidates,
+                pi,
+                D,
+                top_k,
+                F=F_snapshot,
+                E=E,
+                candidate_cache=candidate_cache,
+                layout_partitions=optimized_partitions,
+                W=E_W,
+                alpha=E_alpha,
+            )
+
             # Group candidates by partition_idx to reuse _build_neighbor_info
             candidate_order = sorted(
                 range(len(partition_candidates)),
@@ -1723,6 +1885,7 @@ def Heuristic_Search(
                         weight=self.config.get("path_tiebreak_weight", 0.2),
                         W=E_W,
                         alpha=E_alpha,
+                        layout_partitions=optimized_partitions,
                     )
                     prev_partition_idx = cand.partition_idx
                 score, swaps, output_perm = self.score_partition_candidate(
@@ -1743,6 +1906,8 @@ def Heuristic_Search(
                     ),
                     decay=decay,
                     cached_neighbor_info=cached_neighbor_info,
+                    candidate_cache=candidate_cache,
+                    layout_partitions=optimized_partitions,
                     return_transforms=True,
                 )
                 scores[ci] = score
@@ -1881,6 +2046,17 @@ def _heuristic_search_layout_only(
                 self._reset_decay(decay)
                 swap_heavy_partitions = 0
 
+            F_snapshot = tuple(F)
+            E = self.generate_extended_set(
+                F,
+                DAG,
+                IDAG,
+                resolved_partitions,
+                optimized_partitions,
+                max_E_size=max_E_size,
+                max_lookahead=max_lookahead,
+            )
+
             partition_candidates = self.obtain_partition_candidates(
                 F,
                 optimized_partitions,
@@ -1891,18 +2067,17 @@ def _heuristic_search_layout_only(
 
             top_k = self.config.get("prefilter_top_k", 50)
             partition_candidates = self._prefilter_candidates(
-                partition_candidates, pi, D, top_k, reverse=reverse
-            )
-
-            F_snapshot = tuple(F)
-            E = self.generate_extended_set(
-                F,
-                DAG,
-                IDAG,
-                resolved_partitions,
-                optimized_partitions,
-                max_E_size=max_E_size,
-                max_lookahead=max_lookahead,
+                partition_candidates,
+                pi,
+                D,
+                top_k,
+                F=F_snapshot,
+                E=E,
+                candidate_cache=candidate_cache,
+                layout_partitions=optimized_partitions,
+                reverse=reverse,
+                W=E_W,
+                alpha=E_alpha,
             )
 
             # Group candidates by partition_idx to reuse _build_neighbor_info
@@ -1927,6 +2102,7 @@ def _heuristic_search_layout_only(
                         weight=self.config.get("path_tiebreak_weight", 0.2),
                         W=E_W,
                         alpha=E_alpha,
+                        layout_partitions=optimized_partitions,
                     )
                     prev_partition_idx = cand.partition_idx
                 score, swaps, output_perm = self.score_partition_candidate(
@@ -1948,6 +2124,8 @@ def _heuristic_search_layout_only(
                     ),
                     decay=decay,
                     cached_neighbor_info=cached_neighbor_info,
+                    candidate_cache=candidate_cache,
+                    layout_partitions=optimized_partitions,
                     return_transforms=True,
                 )
                 scores[ci] = score
@@ -2095,6 +2273,8 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   cnot_cost=1.0 / 3.0,
                                   path_tiebreak_weight=0.2, decay=None,
                                   cached_neighbor_info=None,
+                                  candidate_cache=None,
+                                  layout_partitions=None,
                                   return_transforms=False):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
@@ -2115,6 +2295,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                 weight=path_tiebreak_weight,
                 W=W,
                 alpha=alpha,
+                layout_partitions=layout_partitions,
             )
         swaps, output_perm = partition_candidate.transform_pi(
             pi,
@@ -2136,45 +2317,25 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             decay_factor=decay_factor,
         )
 
-        if canonical_data is None:
+        if candidate_cache is None:
             if return_transforms:
                 return score, swaps, output_perm
             return score
 
-        output_perm_arr = np.asarray(output_perm, dtype=np.intp)
-        D_arr = np.asarray(D)
         cand_idx = partition_candidate.partition_idx
-
-        # Basic component: average dist over F \ {cand}
-        f_sum = 0.0
-        n_other = 0
-        for partition_idx in F:
-            if partition_idx == cand_idx:
-                continue
-            entry = canonical_data.get(partition_idx)
-            if entry is None:
-                continue
-            n_other += 1
-            f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
-                entry, output_perm_arr, D_arr
-            )
-        if n_other > 0:
-            score += f_sum / n_other
-
-        # Lookahead component: alpha^depth-decayed average over E
-        if E:
-            e_sum = 0.0
-            for partition_idx, depth in E:
-                if partition_idx == cand_idx:
-                    continue
-                entry = canonical_data.get(partition_idx)
-                if entry is None:
-                    continue
-                d_cost = qgd_Partition_Aware_Mapping._entry_future_cost(
-                    entry, output_perm_arr, D_arr
-                )
-                e_sum += (alpha ** depth) * d_cost
-            score += W * e_sum / len(E)
+        score += qgd_Partition_Aware_Mapping._future_context_cost(
+            cand_idx,
+            output_perm,
+            F,
+            E,
+            D,
+            candidate_cache,
+            reverse=reverse,
+            cnot_cost=cnot_cost,
+            W=W,
+            alpha=alpha,
+            layout_partitions=layout_partitions,
+        )
 
         if return_transforms:
             return score, swaps, output_perm

From 1572c44f3e5f960007a144af87a627cfb0466571 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 25 Apr 2026 10:40:58 +0200
Subject: [PATCH 157/232] better routing and cost function

---
 squander/partitioning/ilp.py                  | 42 +++++++++++++++++++
 .../sabre_router/include/sabre_router.hpp     |  6 ++-
 .../src-cpp/sabre_router/sabre_router.cpp     | 22 ++++++----
 squander/synthesis/PartAM.py                  | 39 +++++++++++------
 4 files changed, 88 insertions(+), 21 deletions(-)

diff --git a/squander/partitioning/ilp.py b/squander/partitioning/ilp.py
index e3ad3e3c1..9731247e6 100644
--- a/squander/partitioning/ilp.py
+++ b/squander/partitioning/ilp.py
@@ -539,6 +539,48 @@ def sol_to_badsccs(g, allparts, L):
     _, scc = scc_tarjan_iterative(G_part)
     return {frozenset(v) for v in scc if len(v) > 1}
 
+def parts_to_overlap_scores(allparts, g, gate_to_qubit):
+    """
+    Per-part tie-breaker weights from logical-qubit overlap with DAG-downstream
+    candidate parts.
+
+    For each part i, score s[i] is the mean over candidate parts j reachable
+    from i in the gate DAG of |support(i) ∩ support(j)|. Returned weights are
+    `(s_max - s[i]) * eps` (lower is better — ILP minimizes), with eps small
+    enough that count-minimization in `ilp_global_optimal` is strictly
+    preserved when these weights are passed via `weights=`.
+
+    Args:
+        allparts (list[frozenset[int]]): Candidate parts (gate sets).
+        g (dict[int, set[int]]): Contracted gate DAG (u -> successors v) as
+            returned by `get_all_partitions`.
+        gate_to_qubit (dict[int, set[int]]): Gate -> qubits acted on.
+
+    Returns:
+        list[float]: weights[i] indexed like allparts, all in
+            [0, 1 / (len(allparts) * len(g))).
+    """
+    N = len(allparts)
+    if N == 0: return []
+    _, reach = nuutila_reach_scc(g)
+    gate_to_parts = {gate: [] for gate in g}
+    for i, part in enumerate(allparts):
+        for gate in part: gate_to_parts[gate].append(i)
+    supports = [set.union(*(gate_to_qubit[v] for v in part)) for part in allparts]
+    scores = [0.0] * N
+    for i, part in enumerate(allparts):
+        dgates = set().union(*(reach[u] for u in part)) - part
+        if not dgates: continue
+        succ_idxs = set().union(*(gate_to_parts[v] for v in dgates))
+        succ_idxs.discard(i)
+        if not succ_idxs: continue
+        sup_i = supports[i]
+        scores[i] = sum(len(sup_i & supports[j]) for j in succ_idxs) / len(succ_idxs)
+    s_max = max(scores)
+    if s_max == 0.0: return [0.0] * N
+    eps = 0.9 / (N * max(len(g), 1) * (s_max + 1.0))
+    return [(s_max - s) * eps for s in scores]
+
 def ilp_global_optimal(allparts, g, weighted_info=None, gurobi_direct=False, use_order=False, weights=None):
     """
     Select an optimal set of non-overlapping parts via ILP/MIP with cycle cuts.
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index a91778482..fb003ae36 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -350,7 +350,8 @@ class SabreRouter {
         int top_k,
         const std::vector<int>& F_snapshot,
         const std::vector<std::pair<int,int>>& E,
-        bool reverse
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
     ) const;
 
     // Select best candidate with optional stochastic tie-breaking
@@ -436,7 +437,8 @@ class SabreRouter {
         const std::vector<int>& pi,
         const std::vector<int>& F_snapshot,
         const std::vector<std::pair<int,int>>& E,
-        bool reverse
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
     ) const;
 
     std::vector<int> estimate_candidate_output_layout(
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index a9fe1b788..fc4f513e9 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1131,7 +1131,8 @@ double SabreRouter::future_context_cost(
     const std::vector<int>& pi,
     const std::vector<int>& F_snapshot,
     const std::vector<std::pair<int,int>>& E,
-    bool reverse
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
     double f_sum = 0.0;
     int n_other = 0;
@@ -1145,16 +1146,19 @@ double SabreRouter::future_context_cost(
         ? f_sum / static_cast<double>(n_other)
         : 0.0;
 
+    // Extended set: BQSKit-style — just sum gate-edge distances on the
+    // partition's logical qubits, ignoring candidate permutations entirely.
     if (!E.empty()) {
         double e_sum = 0.0;
         for (auto [p_idx, depth] : E) {
             if (p_idx == exclude_partition_idx) continue;
+            auto it = canonical_data.find(p_idx);
+            if (it == canonical_data.end()) continue;
             const double alpha =
                 (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
                     ? alpha_weights_[depth]
                     : std::pow(config_.E_alpha, depth);
-            e_sum += alpha * partition_future_lower_bound(
-                p_idx, pi, reverse);
+            e_sum += alpha * entry_future_cost(it->second, pi);
         }
         score += config_.E_weight * e_sum / static_cast<double>(E.size());
     }
@@ -1271,7 +1275,8 @@ double SabreRouter::score_candidate(
         output_perm,
         F_snapshot,
         E,
-        reverse
+        reverse,
+        canonical_data
     );
     (void)resolved_F;
     (void)resolved_E;
@@ -1308,7 +1313,8 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     int top_k,
     const std::vector<int>& F_snapshot,
     const std::vector<std::pair<int,int>>& E,
-    bool reverse
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
     if (static_cast<int>(candidates.size()) <= top_k) return candidates;
     if (top_k <= 0) return {};
@@ -1323,7 +1329,8 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
             static_cast<double>(estimate_swap_count(*cand, pi, reverse)),
             cand->cnot_count
         ) + future_context_cost(
-            cand->partition_idx, approx_output, F_snapshot, E, reverse);
+            cand->partition_idx, approx_output, F_snapshot, E, reverse,
+            canonical_data);
         estimated.push_back({est, cand});
     }
 
@@ -1481,7 +1488,8 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
 
         // Prefilter with a cheap estimate of the candidate's future context.
         auto candidates = prefilter_candidates(
-            all_candidates, pi, config_.prefilter_top_k, F, E, reverse);
+            all_candidates, pi, config_.prefilter_top_k, F, E, reverse,
+            canonical_data);
 
         // Pre-resolve canonical entries for F and E once per F-step
         std::vector<ResolvedEntry> resolved_F;
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3b700575d..f1552c4e3 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -24,6 +24,7 @@
     _get_topo_order,
     topo_sort_partitions,
     ilp_global_optimal,
+    parts_to_overlap_scores,
 )
 # Module-level globals for pool workers (set via Pool initializer)
 _worker_config = None
@@ -158,6 +159,7 @@ def __init__(self, config):
             )
             self.config['path_tiebreak_weight'] = 0.49
         self.config.setdefault('cnot_cost', 1.0 / 3.0)  # 1 SWAP = 3 CNOTs
+        self.config.setdefault('overlap_tiebreak', True)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -490,8 +492,14 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         # ---- Phase 2: ILP partition selection ----
         # Minimize total partition count so PAM gets the largest blocks possible
         # under max_partition_size. Larger blocks = more (P_i, P_o) freedom to
-        # absorb routing SWAPs.
-        L_parts, _ = ilp_global_optimal(allparts, g)
+        # absorb routing SWAPs. Overlap-based tie-breaker (when enabled)
+        # picks deterministically among min-count covers, preferring covers
+        # whose parts share more logical qubits with their DAG successors.
+        if self.config['overlap_tiebreak']:
+            tb_weights = parts_to_overlap_scores(allparts, g, gate_to_qubit)
+            L_parts, _ = ilp_global_optimal(allparts, g, weights=tb_weights)
+        else:
+            L_parts, _ = ilp_global_optimal(allparts, g)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----
         selected_surrounded_starts = set()
@@ -1349,6 +1357,7 @@ def _prefilter_candidates(
         reverse=False,
         W=0.5,
         alpha=1.0,
+        canonical_data=None,
     ):
         """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
         if len(partition_candidates) <= top_k:
@@ -1375,6 +1384,7 @@ def _prefilter_candidates(
                     W=W,
                     alpha=alpha,
                     layout_partitions=layout_partitions,
+                    canonical_data=canonical_data,
                 )
             )
             for pc in partition_candidates
@@ -1593,6 +1603,7 @@ def _future_context_cost(
         W=0.5,
         alpha=1.0,
         layout_partitions=None,
+        canonical_data=None,
     ):
         f_sum = 0.0
         n_other = 0
@@ -1611,21 +1622,22 @@ def _future_context_cost(
             n_other += 1
         score = f_sum / n_other if n_other > 0 else 0.0
 
+        # Extended set: BQSKit-style — sum gate-edge distances on the partition's
+        # logical qubits, ignoring candidate permutations entirely.
         if E:
             e_sum = 0.0
+            pi_arr = np.asarray(pi, dtype=np.intp)
+            D_arr = np.asarray(D)
             for p_idx, depth in E:
                 if p_idx == exclude_partition_idx:
                     continue
-                e_sum += (
-                    alpha ** depth
-                ) * qgd_Partition_Aware_Mapping._partition_future_lower_bound(
-                    p_idx,
-                    pi,
-                    D,
-                    candidate_cache,
-                    reverse=reverse,
-                    cnot_cost=cnot_cost,
-                    layout_partitions=layout_partitions,
+                if canonical_data is None:
+                    continue
+                entry = canonical_data.get(p_idx)
+                if entry is None:
+                    continue
+                e_sum += (alpha ** depth) * qgd_Partition_Aware_Mapping._entry_future_cost(
+                    entry, pi_arr, D_arr
                 )
             score += W * e_sum / len(E)
         return score
@@ -1861,6 +1873,7 @@ def Heuristic_Search(
                 layout_partitions=optimized_partitions,
                 W=E_W,
                 alpha=E_alpha,
+                canonical_data=canonical_data,
             )
 
             # Group candidates by partition_idx to reuse _build_neighbor_info
@@ -2078,6 +2091,7 @@ def _heuristic_search_layout_only(
                 reverse=reverse,
                 W=E_W,
                 alpha=E_alpha,
+                canonical_data=canonical_data,
             )
 
             # Group candidates by partition_idx to reuse _build_neighbor_info
@@ -2335,6 +2349,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             W=W,
             alpha=alpha,
             layout_partitions=layout_partitions,
+            canonical_data=canonical_data,
         )
 
         if return_transforms:

From 318c987b68dfb705c545a03361db85bb6df54cab Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 25 Apr 2026 13:50:27 +0200
Subject: [PATCH 158/232] Use BQSKit-style canonical-edge cost for routing
 future context

---
 .../sabre_router/include/sabre_router.hpp     | 11 ---
 .../src-cpp/sabre_router/sabre_router.cpp     | 70 ++-------------
 squander/synthesis/PartAM.py                  | 86 ++++---------------
 3 files changed, 24 insertions(+), 143 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index fb003ae36..6fd6edfc4 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -421,17 +421,6 @@ class SabreRouter {
         const std::vector<int>& pi
     ) const;
 
-    double partition_compactness_cost(
-        int partition_idx,
-        const std::vector<int>& pi
-    ) const;
-
-    double partition_future_lower_bound(
-        int partition_idx,
-        const std::vector<int>& pi,
-        bool reverse
-    ) const;
-
     double future_context_cost(
         int exclude_partition_idx,
         const std::vector<int>& pi,
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index fc4f513e9..de40ae976 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1068,64 +1068,6 @@ double SabreRouter::entry_future_cost(
     return total;
 }
 
-double SabreRouter::partition_compactness_cost(
-    int partition_idx,
-    const std::vector<int>& pi
-) const {
-    if (
-        partition_idx < 0
-        || partition_idx >= static_cast<int>(layout_partitions_.size())
-    ) {
-        return 0.0;
-    }
-
-    const auto& involved = layout_partitions_[partition_idx].involved_qbits;
-    if (involved.size() < 2) {
-        return 0.0;
-    }
-
-    double best = std::numeric_limits<double>::infinity();
-    for (int q : involved) {
-        double term = 0.0;
-        for (int p : involved) {
-            if (p == q) continue;
-            term += dist(pi[q], pi[p]);
-        }
-        best = std::min(best, term);
-    }
-    return std::isfinite(best) ? best : 0.0;
-}
-
-double SabreRouter::partition_future_lower_bound(
-    int partition_idx,
-    const std::vector<int>& pi,
-    bool reverse
-) const {
-    if (
-        partition_idx < 0
-        || partition_idx >= static_cast<int>(candidate_cache_.size())
-    ) {
-        return 0.0;
-    }
-
-    const auto& candidates = candidate_cache_[partition_idx];
-    if (candidates.empty()) {
-        return partition_compactness_cost(partition_idx, pi);
-    }
-
-    double best = std::numeric_limits<double>::infinity();
-    for (const auto& cand : candidates) {
-        const double cost = routing_objective(
-            static_cast<double>(estimate_swap_count(cand, pi, reverse)),
-            cand.cnot_count
-        );
-        best = std::min(best, cost);
-    }
-    return std::isfinite(best)
-        ? best
-        : partition_compactness_cost(partition_idx, pi);
-}
-
 double SabreRouter::future_context_cost(
     int exclude_partition_idx,
     const std::vector<int>& pi,
@@ -1134,11 +1076,19 @@ double SabreRouter::future_context_cost(
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
+    (void)reverse;
+
+    // BQSKit-style cost: sum max(0, dist - 1) over each canonical gate edge,
+    // with no candidate-permutation enumeration. Same shape for F and E so
+    // the future signal is monotone in distance instead of flickering with
+    // whichever candidate happens to win the lower bound.
     double f_sum = 0.0;
     int n_other = 0;
     for (int p_idx : F_snapshot) {
         if (p_idx == exclude_partition_idx) continue;
-        f_sum += partition_future_lower_bound(p_idx, pi, reverse);
+        auto it = canonical_data.find(p_idx);
+        if (it == canonical_data.end()) continue;
+        f_sum += entry_future_cost(it->second, pi);
         n_other++;
     }
 
@@ -1146,8 +1096,6 @@ double SabreRouter::future_context_cost(
         ? f_sum / static_cast<double>(n_other)
         : 0.0;
 
-    // Extended set: BQSKit-style — just sum gate-edge distances on the
-    // partition's logical qubits, ignoring candidate permutations entirely.
     if (!E.empty()) {
         double e_sum = 0.0;
         for (auto [p_idx, depth] : E) {
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index f1552c4e3..efef16bcb 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1518,65 +1518,6 @@ def _entry_future_cost(entry, output_perm_arr, D_arr):
         phys_v = output_perm_arr[entry["edges_v"]]
         return float(np.maximum(0, D_arr[phys_u, phys_v] - 1).sum())
 
-    @staticmethod
-    def _partition_compactness_cost(partition_idx, pi, layout_partitions, D):
-        if (
-            layout_partitions is None
-            or partition_idx < 0
-            or partition_idx >= len(layout_partitions)
-        ):
-            return 0.0
-        involved = qgd_Partition_Aware_Mapping._partition_involved_qbits(
-            layout_partitions[partition_idx]
-        )
-        if len(involved) < 2:
-            return 0.0
-
-        pi_arr = np.asarray(pi, dtype=np.intp)
-        D_arr = np.asarray(D)
-        best = float("inf")
-        for q in involved:
-            term = 0.0
-            for p in involved:
-                if p != q:
-                    term += float(D_arr[pi_arr[q], pi_arr[p]])
-            best = min(best, term)
-        return 0.0 if not np.isfinite(best) else best
-
-    @staticmethod
-    def _partition_future_lower_bound(
-        partition_idx,
-        pi,
-        D,
-        candidate_cache,
-        reverse=False,
-        cnot_cost=1.0 / 3.0,
-        layout_partitions=None,
-    ):
-        if (
-            candidate_cache is None
-            or partition_idx < 0
-            or partition_idx >= len(candidate_cache)
-            or not candidate_cache[partition_idx]
-        ):
-            return qgd_Partition_Aware_Mapping._partition_compactness_cost(
-                partition_idx, pi, layout_partitions, D
-            )
-
-        best = float("inf")
-        for cand in candidate_cache[partition_idx]:
-            cost = qgd_Partition_Aware_Mapping._routing_objective(
-                cand.estimate_swap_count(pi, D, reverse=reverse),
-                cand.cnot_count,
-                cnot_cost,
-            )
-            best = min(best, cost)
-        if np.isfinite(best):
-            return best
-        return qgd_Partition_Aware_Mapping._partition_compactness_cost(
-            partition_idx, pi, layout_partitions, D
-        )
-
     @staticmethod
     def _estimate_candidate_output_layout(partition_candidate, pi, reverse=False):
         P_exit = partition_candidate.P_i if reverse else partition_candidate.P_o
@@ -1605,29 +1546,32 @@ def _future_context_cost(
         layout_partitions=None,
         canonical_data=None,
     ):
+        del candidate_cache, reverse, cnot_cost, layout_partitions
+
+        # BQSKit-style cost: sum max(0, dist - 1) over each canonical gate edge,
+        # for both F and E. No candidate-permutation enumeration so the future
+        # signal stays monotone in distance.
+        pi_arr = np.asarray(pi, dtype=np.intp)
+        D_arr = np.asarray(D)
+
         f_sum = 0.0
         n_other = 0
         for p_idx in F:
             if p_idx == exclude_partition_idx:
                 continue
-            f_sum += qgd_Partition_Aware_Mapping._partition_future_lower_bound(
-                p_idx,
-                pi,
-                D,
-                candidate_cache,
-                reverse=reverse,
-                cnot_cost=cnot_cost,
-                layout_partitions=layout_partitions,
+            if canonical_data is None:
+                continue
+            entry = canonical_data.get(p_idx)
+            if entry is None:
+                continue
+            f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
+                entry, pi_arr, D_arr
             )
             n_other += 1
         score = f_sum / n_other if n_other > 0 else 0.0
 
-        # Extended set: BQSKit-style — sum gate-edge distances on the partition's
-        # logical qubits, ignoring candidate permutations entirely.
         if E:
             e_sum = 0.0
-            pi_arr = np.asarray(pi, dtype=np.intp)
-            D_arr = np.asarray(D)
             for p_idx, depth in E:
                 if p_idx == exclude_partition_idx:
                     continue

From 10cabbaca47364e5d4d55c90e0b9751bc7a5bbf1 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 25 Apr 2026 18:05:16 +0200
Subject: [PATCH 159/232] Remove pre filtering

---
 .../sabre_router/include/sabre_router.hpp     |  36 --
 .../src-cpp/sabre_router/sabre_router.cpp     | 365 +++++++-----------
 squander/synthesis/PartAM.py                  |  89 ++---
 squander/synthesis/PartAM_utils.py            |  32 +-
 4 files changed, 192 insertions(+), 330 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 6fd6edfc4..4ba58aa1d 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -262,13 +262,6 @@ class SabreRouter {
         const std::vector<std::vector<int>>& parents_graph
     ) const;
 
-    // Pre-resolved canonical entries for an F-step (avoids hash lookups per candidate)
-    struct ResolvedEntry {
-        int partition_idx;
-        const CanonicalEntry* entry; // may be null
-        double alpha; // 1.0 for F, alpha^depth for E
-    };
-
     // LightSABRE scoring (port of score_partition_candidate)
     double score_candidate(
         const CandidateData& cand,
@@ -281,8 +274,6 @@ class SabreRouter {
         const std::vector<double>* decay = nullptr,
         std::vector<std::pair<int,int>>* out_swaps = nullptr,
         std::vector<int>* out_pi_new = nullptr,
-        const std::vector<ResolvedEntry>* resolved_F = nullptr,
-        const std::vector<ResolvedEntry>* resolved_E = nullptr,
         const NeighborInfo* cached_neighbor_info = nullptr
     ) const;
 
@@ -389,33 +380,6 @@ class SabreRouter {
         std::mt19937& rng
     ) const;
 
-    // Build P_route_inv: the inverse permutation used for routing
-    std::vector<int> build_route_inv(const std::vector<int>& P, bool reverse) const;
-
-    // Build target dict for A*: {qbit_map_key -> node_mapping[P_route_inv[qbit_map_val]]}
-    void build_target_positions(
-        const CandidateData& cand,
-        bool reverse,
-        std::vector<int>& out_keys,
-        std::vector<int>& out_targets
-    ) const;
-
-    // Compute routing cost for canonical edges under a given pi
-    double compute_routing_cost(
-        const std::vector<int>& pi,
-        int exclude_partition_idx,
-        const std::vector<int>& partition_indices,
-        const std::unordered_map<int, CanonicalEntry>& canonical_data
-    ) const;
-
-    // Compute lookahead cost with alpha^depth decay
-    double compute_lookahead_cost(
-        const std::vector<int>& pi,
-        int exclude_partition_idx,
-        const std::vector<std::pair<int,int>>& E,
-        const std::unordered_map<int, CanonicalEntry>& canonical_data
-    ) const;
-
     double entry_future_cost(
         const CanonicalEntry& entry,
         const std::vector<int>& pi
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index de40ae976..779dfab72 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -247,40 +247,6 @@ std::vector<int> SabreRouter::sample_initial_layout(
     return random_permutation(N_, rng);
 }
 
-// ---------------------------------------------------------------------------
-// Helper: build P_route_inv
-// ---------------------------------------------------------------------------
-
-std::vector<int> SabreRouter::build_route_inv(const std::vector<int>& P, bool /*reverse*/) const {
-    // P_route_inv[i] = index of i in P (inverse permutation)
-    int k = static_cast<int>(P.size());
-    std::vector<int> inv(k);
-    for (int i = 0; i < k; i++) {
-        inv[P[i]] = i;
-    }
-    return inv;
-}
-
-// ---------------------------------------------------------------------------
-// Helper: build target positions for A*
-// ---------------------------------------------------------------------------
-
-void SabreRouter::build_target_positions(
-    const CandidateData& cand,
-    bool reverse,
-    std::vector<int>& out_keys,
-    std::vector<int>& out_targets
-) const {
-    const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
-
-    out_keys = cand.qbit_map_keys;
-    out_targets.resize(cand.qbit_map_keys.size());
-    for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) {
-        int v = cand.qbit_map_vals[i];
-        out_targets[i] = cand.node_mapping_flat[P_route_inv[v]];
-    }
-}
-
 // ---------------------------------------------------------------------------
 // apply_swaps_to_pi
 // ---------------------------------------------------------------------------
@@ -312,7 +278,6 @@ NeighborInfo SabreRouter::build_neighbor_info(
     const std::vector<int>& pi,
     const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
-    (void)canonical_data;
     NeighborInfo info;
     info.weight = config_.path_tiebreak_weight;
     if (info.weight <= 0.0) {
@@ -369,6 +334,17 @@ NeighborInfo SabreRouter::build_neighbor_info(
             partition_idx < 0
             || partition_idx >= static_cast<int>(layout_partitions_.size())
         ) return;
+        auto canonical_it = canonical_data.find(partition_idx);
+        if (canonical_it != canonical_data.end()
+            && !canonical_it->second.edges_u.empty()
+        ) {
+            const auto& entry = canonical_it->second;
+            for (size_t i = 0; i < entry.edges_u.size(); i++) {
+                add_edge(entry.edges_u[i], entry.edges_v[i], weight);
+            }
+            return;
+        }
+
         const auto& involved = layout_partitions_[partition_idx].involved_qbits;
         if (involved.size() < 2) return;
         for (size_t i = 0; i < involved.size(); i++) {
@@ -551,36 +527,16 @@ std::pair<std::vector<std::pair<int,int>>, std::vector<int>> SabreRouter::releas
 
 std::vector<int> SabreRouter::get_initial_layer() const {
     std::vector<int> layer;
-    std::vector<uint8_t> covered(N_, 0);
-    int uncovered = N_;
-    for (int p = 0; p < num_partitions_ && uncovered > 0; p++) {
-        if (IDAG_[p].empty()) {
-            layer.push_back(p);
-            for (int q : layout_partitions_[p].involved_qbits) {
-                if (q < N_ && !covered[q]) {
-                    covered[q] = 1;
-                    uncovered--;
-                }
-            }
-        }
+    for (int p = 0; p < num_partitions_; p++) {
+        if (IDAG_[p].empty()) layer.push_back(p);
     }
     return layer;
 }
 
 std::vector<int> SabreRouter::get_final_layer() const {
     std::vector<int> layer;
-    std::vector<uint8_t> covered(N_, 0);
-    int uncovered = N_;
-    for (int p = num_partitions_ - 1; p >= 0 && uncovered > 0; p--) {
-        if (DAG_[p].empty()) {
-            layer.push_back(p);
-            for (int q : layout_partitions_[p].involved_qbits) {
-                if (q < N_ && !covered[q]) {
-                    covered[q] = 1;
-                    uncovered--;
-                }
-            }
-        }
+    for (int p = num_partitions_ - 1; p >= 0; p--) {
+        if (DAG_[p].empty()) layer.push_back(p);
     }
     return layer;
 }
@@ -724,7 +680,7 @@ SabreRouter::find_constrained_swaps(
         initial_nb_total = compute_nb_total(neighbor_info->initial_pos);
     }
 
-    // ---- Arena + open-addressed hash table (replaces visited+parent maps) ----
+    // ---- Arena + best-state table (replaces visited+parent maps) ----
     struct Node {
         int64_t packed;
         int parent_idx;
@@ -732,52 +688,77 @@ SabreRouter::find_constrained_swaps(
         int sw_lo, sw_hi;
         double h_sum;       // sum(dist(pos[i], target[i])) — twice the admissible h
         double nb_total;    // sum(edge.weight * dist(...)) — pre-scale
-        int nb_arena_idx;   // -1 if !use_neighbor; else index into nb_arena
+        int nb_arena_idx;   // -1 if !use_neighbor; else slot in nb_pos_flat
+    };
+    struct StateKey {
+        int64_t packed;
+        std::vector<int> nb_pos;
+
+        bool operator==(const StateKey& other) const {
+            return packed == other.packed && nb_pos == other.nb_pos;
+        }
+    };
+    struct StateKeyHash {
+        size_t operator()(const StateKey& key) const {
+            uint64_t h = static_cast<uint64_t>(key.packed);
+            h ^= h >> 33; h *= 0xff51afd7ed558ccdULL;
+            h ^= h >> 33; h *= 0xc4ceb9fe1a85ec53ULL;
+            h ^= h >> 33;
+            for (int v : key.nb_pos) {
+                h ^= static_cast<uint64_t>(v) + 0x9e3779b97f4a7c15ULL
+                   + (h << 6) + (h >> 2);
+            }
+            return static_cast<size_t>(h);
+        }
     };
     thread_local std::vector<Node> arena;
-    thread_local std::vector<int32_t> table;
-    thread_local std::vector<std::vector<int>> nb_arena;
+    // Flat storage for neighbor positions: slot s lives at
+    // [s * nb_stride, (s+1) * nb_stride). Slots are shared across nodes whose
+    // swap doesn't touch any neighbor virtual qubit.
+    thread_local std::vector<int> nb_pos_flat;
+    thread_local std::vector<std::vector<int>> vq_edges;
+    thread_local std::vector<int> nb_scratch;
     arena.clear();
-    nb_arena.clear();
+    nb_pos_flat.clear();
     arena.reserve(1024);
+    std::unordered_map<StateKey, int32_t, StateKeyHash> best_node;
+    best_node.reserve(2048);
 
-    // table size: power of 2, ~2x expected entries
-    size_t cap = 1024;
-    table.assign(cap, -1);
-
-    auto hash_packed = [](int64_t v) -> uint64_t {
-        uint64_t x = static_cast<uint64_t>(v);
-        x ^= x >> 33; x *= 0xff51afd7ed558ccdULL;
-        x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL;
-        x ^= x >> 33;
-        return x;
-    };
-
-    auto table_grow = [&]() {
-        std::vector<int32_t> new_table(table.size() * 2, -1);
-        const size_t mask = new_table.size() - 1;
-        for (int32_t idx : table) {
-            if (idx < 0) continue;
-            size_t i = hash_packed(arena[idx].packed) & mask;
-            while (new_table[i] >= 0) i = (i + 1) & mask;
-            new_table[i] = idx;
+    const int nb_stride = use_neighbor
+        ? static_cast<int>(neighbor_info->neighbor_vqs.size())
+        : 0;
+    if (use_neighbor) {
+        // Per-vq edge index list: which edges touch each virtual qubit.
+        vq_edges.assign(nb_stride, {});
+        for (int e = 0; e < static_cast<int>(neighbor_info->edges.size()); e++) {
+            const auto& edge = neighbor_info->edges[e];
+            vq_edges[edge.u_idx].push_back(e);
+            if (edge.v_idx != edge.u_idx) {
+                vq_edges[edge.v_idx].push_back(e);
+            }
         }
-        table = std::move(new_table);
-    };
+        nb_pos_flat.reserve(static_cast<size_t>(nb_stride) * 1024);
+        nb_pos_flat.insert(nb_pos_flat.end(),
+                           neighbor_info->initial_pos.begin(),
+                           neighbor_info->initial_pos.end());
+        nb_scratch.resize(nb_stride);
+    }
 
-    // Returns slot index in `table`. *slot is -1 if empty.
-    auto table_slot = [&](int64_t packed) -> size_t {
-        const size_t mask = table.size() - 1;
-        size_t i = hash_packed(packed) & mask;
-        while (true) {
-            int32_t idx = table[i];
-            if (idx < 0) return i;
-            if (arena[idx].packed == packed) return i;
-            i = (i + 1) & mask;
+    auto make_state_key = [&](int64_t packed, int nb_arena_idx) {
+        StateKey key;
+        key.packed = packed;
+        if (use_neighbor) {
+            const size_t base = static_cast<size_t>(nb_arena_idx) * nb_stride;
+            key.nb_pos.assign(
+                nb_pos_flat.begin() + static_cast<std::ptrdiff_t>(base),
+                nb_pos_flat.begin() + static_cast<std::ptrdiff_t>(base + nb_stride)
+            );
         }
+        return key;
     };
 
     // ---- Push initial node ----
+    // Slot 0 of nb_pos_flat already holds neighbor_info->initial_pos.
     {
         Node n;
         n.packed = initial_packed;
@@ -786,13 +767,9 @@ SabreRouter::find_constrained_swaps(
         n.sw_lo = -1; n.sw_hi = -1;
         n.h_sum = h0_sum;
         n.nb_total = initial_nb_total;
-        n.nb_arena_idx = -1;
-        if (use_neighbor) {
-            n.nb_arena_idx = static_cast<int>(nb_arena.size());
-            nb_arena.push_back(neighbor_info->initial_pos);
-        }
+        n.nb_arena_idx = use_neighbor ? 0 : -1;
         arena.push_back(n);
-        table[table_slot(initial_packed)] = 0;
+        best_node.emplace(make_state_key(initial_packed, n.nb_arena_idx), 0);
     }
 
     // PQ entry: (f, g, counter, arena_idx)
@@ -812,10 +789,12 @@ SabreRouter::find_constrained_swaps(
         const int64_t packed = arena[idx].packed;
 
         // A state can be reinserted with a lower g-cost after this queue entry
-        // was pushed. The hash table always points at the current best arena
-        // node for a packed state, so discard stale superseded nodes before
-        // accepting a target or expanding neighbors.
-        if (table[table_slot(packed)] != idx) {
+        // was pushed. When the neighbor tie-breaker is active, future-qubit
+        // positions are part of the state so equal-length paths with different
+        // bystander layouts are not collapsed.
+        StateKey cur_key = make_state_key(packed, arena[idx].nb_arena_idx);
+        auto cur_best = best_node.find(cur_key);
+        if (cur_best == best_node.end() || cur_best->second != idx) {
             continue;
         }
 
@@ -880,35 +859,61 @@ SabreRouter::find_constrained_swaps(
                 }
 
                 const int new_g = g + 1;
-                const size_t slot = table_slot(new_packed);
-                const int32_t existing = table[slot];
-                if (existing >= 0 && arena[existing].g <= new_g) {
-                    continue;
-                }
 
-                // Neighbor heuristic: simple recompute (cheaper than incremental for small edge counts)
+                // Neighbor heuristic: incremental delta. Only edges incident
+                // to the affected virtual qubits change; everything else
+                // contributes the same dist as in the parent state.
                 double new_nb_total = cur_nb_total;
                 int new_nb_arena_idx = -1;
                 if (use_neighbor) {
-                    std::vector<int> new_pos_nb = nb_arena[cur_nb_arena_idx];
-                    int idx_nb = -1, idx_p = -1;
-                    for (size_t z = 0; z < new_pos_nb.size(); z++) {
-                        const int phys = new_pos_nb[z];
-                        if (phys == nb) idx_nb = static_cast<int>(z);
-                        else if (phys == p) idx_p = static_cast<int>(z);
-                        if (idx_nb >= 0 && idx_p >= 0) break;
+                    const size_t parent_base =
+                        static_cast<size_t>(cur_nb_arena_idx) * nb_stride;
+                    for (int z = 0; z < nb_stride; z++) {
+                        nb_scratch[z] = nb_pos_flat[parent_base + z];
                     }
-                    if (idx_nb >= 0 || idx_p >= 0) {
-                        if (idx_nb >= 0) new_pos_nb[idx_nb] = p;
-                        if (idx_p >= 0)  new_pos_nb[idx_p]  = nb;
-                        new_nb_total = compute_nb_total(new_pos_nb);
-                        new_nb_arena_idx = static_cast<int>(nb_arena.size());
-                        nb_arena.push_back(std::move(new_pos_nb));
+                    int idx_nb_vq = -1, idx_p_vq = -1;
+                    for (int z = 0; z < nb_stride; z++) {
+                        const int phys = nb_scratch[z];
+                        if (phys == nb) idx_nb_vq = z;
+                        else if (phys == p) idx_p_vq = z;
+                        if (idx_nb_vq >= 0 && idx_p_vq >= 0) break;
+                    }
+                    if (idx_nb_vq >= 0 || idx_p_vq >= 0) {
+                        double delta = 0.0;
+                        auto accum = [&](int vq_idx, double sign) {
+                            if (vq_idx < 0) return;
+                            for (int e : vq_edges[vq_idx]) {
+                                const auto& edge = neighbor_info->edges[e];
+                                delta += sign * edge.weight * dist(
+                                    nb_scratch[edge.u_idx],
+                                    nb_scratch[edge.v_idx]);
+                            }
+                        };
+                        accum(idx_nb_vq, -1.0);
+                        accum(idx_p_vq, -1.0);
+                        if (idx_nb_vq >= 0) nb_scratch[idx_nb_vq] = p;
+                        if (idx_p_vq >= 0)  nb_scratch[idx_p_vq]  = nb;
+                        accum(idx_nb_vq, +1.0);
+                        accum(idx_p_vq, +1.0);
+                        new_nb_total = cur_nb_total + delta;
+                        new_nb_arena_idx = static_cast<int>(
+                            nb_pos_flat.size() / nb_stride);
+                        nb_pos_flat.insert(nb_pos_flat.end(),
+                                           nb_scratch.begin(),
+                                           nb_scratch.end());
                     } else {
                         new_nb_arena_idx = cur_nb_arena_idx;
                     }
                 }
 
+                StateKey new_key = make_state_key(new_packed, new_nb_arena_idx);
+                auto existing = best_node.find(new_key);
+                if (existing != best_node.end()
+                    && arena[existing->second].g <= new_g
+                ) {
+                    continue;
+                }
+
                 // Insert/update node
                 Node n;
                 n.packed = new_packed;
@@ -923,15 +928,7 @@ SabreRouter::find_constrained_swaps(
 
                 int32_t new_idx = static_cast<int32_t>(arena.size());
                 arena.push_back(n);
-
-                // Re-find slot if arena grew (table didn't, but slot is still valid
-                // since we didn't grow `table`); just write
-                table[slot] = new_idx;
-
-                // Grow table if load factor too high (> 0.5)
-                if (arena.size() * 2 > table.size()) {
-                    table_grow();
-                }
+                best_node[std::move(new_key)] = new_idx;
 
                 const double f_new = static_cast<double>(new_g)
                                    + 0.5 * new_h_sum
@@ -1133,43 +1130,6 @@ std::vector<int> SabreRouter::estimate_candidate_output_layout(
     return pi_output;
 }
 
-double SabreRouter::compute_routing_cost(
-    const std::vector<int>& pi,
-    int exclude_partition_idx,
-    const std::vector<int>& partition_indices,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data
-) const {
-    double total = 0.0;
-    for (int p_idx : partition_indices) {
-        if (p_idx == exclude_partition_idx) continue;
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        total += entry_future_cost(it->second, pi);
-    }
-    return total;
-}
-
-double SabreRouter::compute_lookahead_cost(
-    const std::vector<int>& pi,
-    int exclude_partition_idx,
-    const std::vector<std::pair<int,int>>& E,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data
-) const {
-    if (E.empty()) return 0.0;
-    double total = 0.0;
-    for (auto [p_idx, depth] : E) {
-        if (p_idx == exclude_partition_idx) continue;
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        const double d_cost = entry_future_cost(it->second, pi);
-        const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
-            ? alpha_weights_[depth]
-            : std::pow(config_.E_alpha, depth);
-        total += alpha * d_cost;
-    }
-    return config_.E_weight * total / static_cast<double>(E.size());
-}
-
 // ---------------------------------------------------------------------------
 // score_candidate (LightSABRE scoring)
 // ---------------------------------------------------------------------------
@@ -1185,8 +1145,6 @@ double SabreRouter::score_candidate(
     const std::vector<double>* decay,
     std::vector<std::pair<int,int>>* out_swaps,
     std::vector<int>* out_pi_new,
-    const std::vector<ResolvedEntry>* resolved_F,
-    const std::vector<ResolvedEntry>* resolved_E,
     const NeighborInfo* cached_neighbor_info
 ) const {
     NeighborInfo local_neighbor_info;
@@ -1226,8 +1184,6 @@ double SabreRouter::score_candidate(
         reverse,
         canonical_data
     );
-    (void)resolved_F;
-    (void)resolved_E;
 
     if (out_swaps) *out_swaps = std::move(swaps);
     if (out_pi_new) *out_pi_new = std::move(output_perm);
@@ -1253,6 +1209,7 @@ std::vector<const CandidateData*> SabreRouter::obtain_partition_candidates(
 
 // ---------------------------------------------------------------------------
 // prefilter_candidates
+// Disabled for routing quality: exact scoring sees every candidate.
 // ---------------------------------------------------------------------------
 
 std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
@@ -1264,39 +1221,13 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
-    if (static_cast<int>(candidates.size()) <= top_k) return candidates;
-    if (top_k <= 0) return {};
-
-    using Pair = std::pair<double, const CandidateData*>;
-    std::vector<Pair> estimated;
-    estimated.reserve(candidates.size());
-    for (const auto* cand : candidates) {
-        const auto approx_output = estimate_candidate_output_layout(
-            *cand, pi, reverse);
-        const double est = routing_objective(
-            static_cast<double>(estimate_swap_count(*cand, pi, reverse)),
-            cand->cnot_count
-        ) + future_context_cost(
-            cand->partition_idx, approx_output, F_snapshot, E, reverse,
-            canonical_data);
-        estimated.push_back({est, cand});
-    }
-
-    std::nth_element(
-        estimated.begin(),
-        estimated.begin() + top_k,
-        estimated.end(),
-        [](const Pair& a, const Pair& b) {
-            return a.first < b.first;
-        }
-    );
-
-    std::vector<const CandidateData*> result;
-    result.reserve(top_k);
-    for (int i = 0; i < top_k; i++) {
-        result.push_back(estimated[i].second);
-    }
-    return result;
+    (void)pi;
+    (void)top_k;
+    (void)F_snapshot;
+    (void)E;
+    (void)reverse;
+    (void)canonical_data;
+    return candidates;
 }
 
 // ---------------------------------------------------------------------------
@@ -1439,25 +1370,6 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             all_candidates, pi, config_.prefilter_top_k, F, E, reverse,
             canonical_data);
 
-        // Pre-resolve canonical entries for F and E once per F-step
-        std::vector<ResolvedEntry> resolved_F;
-        resolved_F.reserve(F.size());
-        for (int p_idx : F) {
-            auto it = canonical_data.find(p_idx);
-            const CanonicalEntry* ent = (it != canonical_data.end()) ? &it->second : nullptr;
-            resolved_F.push_back({p_idx, ent, 1.0});
-        }
-        std::vector<ResolvedEntry> resolved_E;
-        resolved_E.reserve(E.size());
-        for (auto [p_idx, depth] : E) {
-            auto it = canonical_data.find(p_idx);
-            const CanonicalEntry* ent = (it != canonical_data.end()) ? &it->second : nullptr;
-            const double alpha = (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
-                ? alpha_weights_[depth]
-                : std::pow(config_.E_alpha, depth);
-            resolved_E.push_back({p_idx, ent, alpha});
-        }
-
         // Group candidates by partition_idx so build_neighbor_info is shared
         std::vector<size_t> order(candidates.size());
         std::iota(order.begin(), order.end(), 0);
@@ -1483,7 +1395,6 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
                 F, pi, E, reverse, canonical_data,
                 &swap_cache, &decay,
                 &cached_swaps[ci], &cached_pi[ci],
-                &resolved_F, &resolved_E,
                 &cached_ni
             );
         }
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index efef16bcb..e1b2abb02 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1359,38 +1359,21 @@ def _prefilter_candidates(
         alpha=1.0,
         canonical_data=None,
     ):
-        """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
-        if len(partition_candidates) <= top_k:
-            return partition_candidates
-        cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
-        estimates = np.array([
-            (
-                self._routing_objective(
-                    pc.estimate_swap_count(pi, D, reverse=reverse),
-                    pc.cnot_count,
-                    cnot_cost,
-                )
-                + self._future_context_cost(
-                    pc.partition_idx,
-                    self._estimate_candidate_output_layout(
-                        pc, pi, reverse=reverse
-                    ),
-                    F or (),
-                    E or (),
-                    D,
-                    candidate_cache,
-                    reverse=reverse,
-                    cnot_cost=cnot_cost,
-                    W=W,
-                    alpha=alpha,
-                    layout_partitions=layout_partitions,
-                    canonical_data=canonical_data,
-                )
-            )
-            for pc in partition_candidates
-        ])
-        top_k_indices = np.argpartition(estimates, top_k)[:top_k]
-        return [partition_candidates[i] for i in top_k_indices]
+        """Return all candidates; cheap prefiltering is disabled for routing quality."""
+        del (
+            pi,
+            D,
+            top_k,
+            F,
+            E,
+            candidate_cache,
+            layout_partitions,
+            reverse,
+            W,
+            alpha,
+            canonical_data,
+        )
+        return partition_candidates
 
     @staticmethod
     def _decay_factor_for_swaps(swaps, decay):
@@ -1642,7 +1625,6 @@ def _build_neighbor_info(
         alpha=0.9,
         layout_partitions=None,
     ):
-        del canonical_data
         if weight <= 0 or layout_partitions is None:
             return None
 
@@ -1654,6 +1636,19 @@ def add_edges(target_idx, edge_weight):
                 return
             if target_idx >= len(layout_partitions):
                 return
+            entry = canonical_data.get(target_idx) if canonical_data else None
+            if entry is not None and entry.get("edges_u") is not None:
+                for u, v in zip(entry["edges_u"], entry["edges_v"]):
+                    u = int(u)
+                    v = int(v)
+                    qubits.add(u)
+                    qubits.add(v)
+                    key = (u, v) if u <= v else (v, u)
+                    edge_weights[key] = (
+                        edge_weights.get(key, 0.0) + edge_weight
+                    )
+                return
+
             involved = qgd_Partition_Aware_Mapping._partition_involved_qbits(
                 layout_partitions[target_idx]
             )
@@ -2414,33 +2409,13 @@ def obtain_partition_candidates(
     # ------------------------------------------------------------------------
         
     def get_initial_layer(self, IDAG, N, optimized_partitions):
-        initial_layer = []
-        active_qbits = set(range(N))
-        for idx in range(len(IDAG)):
-            if len(IDAG[idx]) == 0:
-                initial_layer.append(idx)
-                for qbit in self._partition_involved_qbits(
-                    optimized_partitions[idx]
-                ):
-                    active_qbits.discard(qbit)
-            if not active_qbits:
-                break
-        return initial_layer
+        del N, optimized_partitions
+        return [idx for idx in range(len(IDAG)) if not IDAG[idx]]
 
 
     def get_final_layer(self, DAG, N, optimized_partitions):
-        final_layer = []
-        active_qbits = set(range(N))
-        for idx in range(len(DAG) - 1, -1, -1):
-            if len(DAG[idx]) == 0:
-                final_layer.append(idx)
-                for qbit in self._partition_involved_qbits(
-                    optimized_partitions[idx]
-                ):
-                    active_qbits.discard(qbit)
-            if not active_qbits:
-                break
-        return final_layer
+        del N, optimized_partitions
+        return [idx for idx in range(len(DAG) - 1, -1, -1) if not DAG[idx]]
                 
     def construct_DAG_and_IDAG(self, optimized_partitions):
         DAG = []
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 90658a152..591722d5f 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -118,22 +118,31 @@ def neighbor_heuristic(n_pos):
     # Paths are reconstructed via a parent-pointer dict to avoid copying lists
     # on every heap push (which would be O(depth²) total).
     counter = 0  # tiebreak counter so tuples never compare paths
-    parent = {}  # state → (parent_state, swap) for path reconstruction
-    parent[initial_positions] = None
+    # When the neighbor tie-breaker is active, the full search state must
+    # include the tracked future-qubit positions.  Otherwise two equal-length
+    # paths to the same partition positions but different bystander layouts
+    # collapse into one visited entry, defeating the downstream-layout signal.
+    initial_state = (
+        (initial_positions, initial_n_pos) if use_neighbor else initial_positions
+    )
+    parent = {}  # state_key → (parent_state_key, swap) for path reconstruction
+    parent[initial_state] = None
 
     h0 = heuristic(initial_positions)
     nh0 = n_weight * neighbor_heuristic(initial_n_pos) if use_neighbor else 0.0
     heap = []
     heapq.heappush(heap, (h0 + nh0, 0, counter, initial_positions, initial_n_pos))
-    visited = {initial_positions: 0}
+    visited = {initial_state: 0}
 
     while heap:
         f, g, _, positions, n_pos = heapq.heappop(heap)
 
+        state_key = (positions, n_pos) if use_neighbor else positions
+
         if positions == target_positions:
             # Reconstruct swap path via parent pointers
             path = []
-            state = positions
+            state = state_key
             while parent[state] is not None:
                 prev_state, swap = parent[state]
                 path.append(swap)
@@ -151,7 +160,7 @@ def neighbor_heuristic(n_pos):
                 final_v2p[q1], final_v2p[q2] = P2, P1
             return path, final_v2p
 
-        if visited.get(positions, float('inf')) < g:
+        if visited.get(state_key, float('inf')) < g:
             continue
 
         # Quick lookup: physical position → index within partition_qubits list
@@ -173,9 +182,6 @@ def neighbor_heuristic(n_pos):
                 new_positions = tuple(new_positions)
 
                 new_g = g + 1
-                if visited.get(new_positions, float('inf')) <= new_g:
-                    continue
-
                 # Bug B fix: update neighbor positions for BOTH sides of the swap.
                 # A neighbor qubit at nb gets displaced to p, AND a neighbor qubit
                 # at p (if it's also tracked, e.g. overlaps with a partition qubit)
@@ -192,9 +198,15 @@ def neighbor_heuristic(n_pos):
                     new_n_pos = n_pos
                     new_nh = 0.0
 
-                visited[new_positions] = new_g
+                new_state_key = (
+                    (new_positions, new_n_pos) if use_neighbor else new_positions
+                )
+                if visited.get(new_state_key, float('inf')) <= new_g:
+                    continue
+
+                visited[new_state_key] = new_g
                 swap_key = (min(p, nb), max(p, nb))
-                parent[new_positions] = (positions, swap_key)
+                parent[new_state_key] = (state_key, swap_key)
                 counter += 1
                 heapq.heappush(heap, (new_g + heuristic(new_positions) + new_nh,
                                       new_g, counter, new_positions, new_n_pos))

From c2a85ae74ae44597cc79bdb4f8a125023df10ac4 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 25 Apr 2026 20:50:50 +0200
Subject: [PATCH 160/232] Add 3q exit routing penalty

---
 benchmark_PartAM.py                           | 176 ++++++++++++++++++
 .../sabre_router/include/sabre_router.hpp     |   1 +
 .../src-cpp/sabre_router/sabre_router.cpp     |   6 +-
 squander/synthesis/PartAM.py                  |  19 +-
 squander/synthesis/bindings.cpp               |   3 +-
 5 files changed, 201 insertions(+), 4 deletions(-)
 create mode 100644 benchmark_PartAM.py

diff --git a/benchmark_PartAM.py b/benchmark_PartAM.py
new file mode 100644
index 000000000..249a5bfd4
--- /dev/null
+++ b/benchmark_PartAM.py
@@ -0,0 +1,176 @@
+"""
+Benchmark PartAM cleanup phase per circuit.
+
+Runs each circuit 5 times with PartAM (cleanup=True) and records:
+  - qubit count
+  - initial CNOT count (original QASM circuit)
+  - CNOT count before cleanup (post-synthesis, pre-cleanup)
+  - CNOT count after cleanup (final)
+  - decomposition error
+  - compilation time (seconds)
+
+Results are exported to benchmark_PartAM.csv.
+
+Usage:
+    conda activate qgd
+    python benchmark_PartAM.py
+"""
+
+import numpy as np
+import time
+import os
+import glob
+import csv
+import random
+
+from squander import Partition_Aware_Mapping
+from squander import utils
+from squander import Circuit
+
+N_RUNS = 3
+OUTPUT_CSV = "benchmark_PartAM_layout.csv"
+
+
+def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
+    num_qubits = circ.get_Qbit_Num()
+    matrix_size = 1 << num_qubits
+    rng = np.random.RandomState(0)
+    initial_state = rng.uniform(-1, 1, (matrix_size,)) + 1j * rng.uniform(-1, 1, (matrix_size,))
+    initial_state /= np.linalg.norm(initial_state)
+
+    original_state = initial_state.copy()
+    circ_orig.apply_to(parameters_orig, original_state)
+
+    circ_Final = Circuit(num_qubits)
+    output_perm_T = [0] * num_qubits
+    for i, j in enumerate(output_perm):
+        output_perm_T[j] = i
+    circ_Final.add_Permutation([int(x) for x in input_perm])
+    circ_Final.add_Circuit(circ)
+    circ_Final.add_Permutation(output_perm_T)
+
+    state = initial_state.copy()
+    circ_Final.apply_to(params, state)
+    return 1 - abs(np.vdot(state, original_state))
+
+
+def make_linear_topology(n_qubits):
+    return [(i, i + 1) for i in range(n_qubits - 1)]
+
+
+def run_once(circ_orig, parameters_orig, topology):
+    config = {
+        'strategy': "TreeSearch",
+        'test_subcircuits': False,
+        'test_final_circuit': False,
+        'max_partition_size': 3,
+        'progressbar': False,
+        'topology': topology,
+        'verbosity': 0,
+        'cleanup': True,
+        'sabre_iterations':5,
+        'n_layout_trials':64,
+        'random_seed':random.randint(1,100),
+        # Diagnostic routing mode: score every candidate exactly. The current
+        # PartAM router bypasses prefiltering, so this is kept only for older
+        # code paths/config visibility.
+        'prefilter_top_k': 5000,
+        # Rank every layout trial by actual constructed routing, not only by
+        # the heuristic trial cost.
+        'actual_routing_rank_top_k': None,
+        'top_k_pi': 1,
+        'cnot_cost': 1.0 / 3.0,  # old: swap_cost=15, local_cost_weight=1.0 -> 15:1 swap:cnot
+        'cleanup_top_k': 3,
+        "parallel_layout_trials": True,
+        "layout_trial_workers": 0,
+        'max_E_size': 40,
+        'max_lookahead': 6,
+        'E_weight': 0.7,
+        'E_alpha': 1.0,        # LightSABRE-style uniform lookahead (no per-depth decay)
+        # Disable extra routing heuristics while diagnosing 3-qubit partition
+        # quality.
+        'decay_delta': 0.0,
+        'swap_burst_budget': 0,
+        'path_tiebreak_weight': 0.0,
+        'three_qubit_exit_weight': 2.0,
+    }
+
+    # Clean the initial circuit using the same config pattern as in PartAM.py
+    from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
+    cleanup_config = dict(config)
+    cleanup_config['topology'] = None
+    cleanup_config['routed'] = False
+    cleanup_config['test_subcircuits'] = False
+    cleanup_config['test_final_circuit'] = False
+    cleanup_config['global_min'] = True
+    cleanup_config['pre-opt-strategy'] = 'TreeSearch'
+
+    wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+    circ_orig, parameters_orig = wco.OptimizeWideCircuit(circ_orig.get_Flat_Circuit(), parameters_orig)
+
+    start = time.time()
+    pam = Partition_Aware_Mapping(config)
+    circ, params, pi_in, pi_out = pam.Partition_Aware_Mapping(circ_orig.get_Flat_Circuit(), parameters_orig)
+    elapsed = time.time() - start
+    routing_time = pam._routing_time
+    cnot_before_cleanup = pam._cnot_pre_cleanup
+    cnot_after_cleanup = circ.get_Gate_Nums().get('CNOT', 0)
+    error = validate_result(circ_orig, parameters_orig, circ, params, pi_in, pi_out)
+
+    return cnot_before_cleanup, cnot_after_cleanup, error, elapsed, routing_time
+
+
+if __name__ == '__main__':
+    circs_dir = "circs"
+    qasm_files = sorted(glob.glob(os.path.join(circs_dir, "*.qasm")))
+
+    if not qasm_files:
+        print(f"No .qasm files found in {circs_dir}/")
+        exit(1)
+
+    print(f"Found {len(qasm_files)} circuits in {circs_dir}/")
+    print(f"Running {N_RUNS} times per circuit (cleanup=True)\n")
+
+    fieldnames = [
+        'circuit', 'n_qubits', 'run',
+        'initial_cnot', 'cnot_pre_cleanup', 'cnot_post_cleanup',
+        'error', 'time_s','routing_time_s'
+    ]
+
+    # Open CSV once and flush after each circuit so partial results are never lost
+    with open(OUTPUT_CSV, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+
+        for filepath in qasm_files:
+            name = os.path.basename(filepath)
+            print(f"{'='*70}")
+            print(f"Circuit: {name}")
+
+            circ_orig, parameters_orig, _ = utils.qasm_to_squander_circuit(filepath)
+            n_qubits = circ_orig.get_Qbit_Num()
+            topology = make_linear_topology(n_qubits)
+
+            initial_cnot = circ_orig.get_Gate_Nums().get('CNOT', 0)
+            print(f"Qubits: {n_qubits}, Initial CNOTs: {initial_cnot}")
+            print(f"{'Run':>4} {'Pre-cleanup':>12} {'Post-cleanup':>12} {'Error':>12} {'Time(s)':>10} {'Routing time(s)':>10}")
+
+            for run_idx in range(N_RUNS):
+                cnot_pre, cnot_post, error, elapsed, routing_time = run_once(circ_orig, parameters_orig, topology)
+                print(f"{run_idx:>4} {cnot_pre:>12} {cnot_post:>12} {error:>12.2e} {elapsed:>10.1f} {routing_time:>10.1f}")
+                writer.writerow({
+                    'circuit': name,
+                    'n_qubits': n_qubits,
+                    'run': run_idx,
+                    'initial_cnot': initial_cnot,
+                    'cnot_pre_cleanup': cnot_pre,
+                    'cnot_post_cleanup': cnot_post,
+                    'error': error,
+                    'time_s': round(elapsed, 3),
+                    'routing_time_s': round(routing_time,3)
+                })
+                f.flush()
+
+            print()
+
+    print(f"Results saved to {OUTPUT_CSV}")
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 4ba58aa1d..bfc7ac6a5 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -89,6 +89,7 @@ struct SabreConfig {
     double decay_delta = 0.001; // Qiskit LightSABRE DECAY_RATE
     int swap_burst_budget = 5; // Qiskit LightSABRE DECAY_RESET_INTERVAL
     double path_tiebreak_weight = 0.2;
+    double three_qubit_exit_weight = 1.0;
 };
 
 struct RouteStep {
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 779dfab72..168df19f7 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1176,7 +1176,7 @@ double SabreRouter::score_candidate(
     );
 
     const int cand_idx = cand.partition_idx;
-    score += future_context_cost(
+    double future_score = future_context_cost(
         cand_idx,
         output_perm,
         F_snapshot,
@@ -1184,6 +1184,10 @@ double SabreRouter::score_candidate(
         reverse,
         canonical_data
     );
+    if (cand.involved_qbits.size() >= 3) {
+        future_score *= config_.three_qubit_exit_weight;
+    }
+    score += future_score;
 
     if (out_swaps) *out_swaps = std::move(swaps);
     if (out_pi_new) *out_pi_new = std::move(output_perm);
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index e1b2abb02..451348cca 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -160,6 +160,7 @@ def __init__(self, config):
             self.config['path_tiebreak_weight'] = 0.49
         self.config.setdefault('cnot_cost', 1.0 / 3.0)  # 1 SWAP = 3 CNOTs
         self.config.setdefault('overlap_tiebreak', True)
+        self.config.setdefault('three_qubit_exit_weight', 1.0)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -932,6 +933,10 @@ def _run_layout_trials_cpp(
         cfg.path_tiebreak_weight = self.config.get(
             'path_tiebreak_weight', 0.2
         )
+        if hasattr(cfg, 'three_qubit_exit_weight'):
+            cfg.three_qubit_exit_weight = self.config.get(
+                'three_qubit_exit_weight', 1.0
+            )
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -1861,6 +1866,9 @@ def Heuristic_Search(
                     candidate_cache=candidate_cache,
                     layout_partitions=optimized_partitions,
                     return_transforms=True,
+                    three_qubit_exit_weight=self.config.get(
+                        "three_qubit_exit_weight", 1.0
+                    ),
                 )
                 scores[ci] = score
                 cached_swaps[ci] = swaps
@@ -2080,6 +2088,9 @@ def _heuristic_search_layout_only(
                     candidate_cache=candidate_cache,
                     layout_partitions=optimized_partitions,
                     return_transforms=True,
+                    three_qubit_exit_weight=self.config.get(
+                        "three_qubit_exit_weight", 1.0
+                    ),
                 )
                 scores[ci] = score
                 cached_swaps[ci] = swaps
@@ -2228,7 +2239,8 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   cached_neighbor_info=None,
                                   candidate_cache=None,
                                   layout_partitions=None,
-                                  return_transforms=False):
+                                  return_transforms=False,
+                                  three_qubit_exit_weight=1.0):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = |swaps|
@@ -2276,7 +2288,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             return score
 
         cand_idx = partition_candidate.partition_idx
-        score += qgd_Partition_Aware_Mapping._future_context_cost(
+        future_score = qgd_Partition_Aware_Mapping._future_context_cost(
             cand_idx,
             output_perm,
             F,
@@ -2290,6 +2302,9 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             layout_partitions=layout_partitions,
             canonical_data=canonical_data,
         )
+        if len(partition_candidate.involved_qbits) >= 3:
+            future_score *= three_qubit_exit_weight
+        score += future_score
 
         if return_transforms:
             return score, swaps, output_perm
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 070a3dc5a..7103819cc 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -131,7 +131,8 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("random_seed", &SabreConfig::random_seed)
         .def_readwrite("decay_delta", &SabreConfig::decay_delta)
         .def_readwrite("swap_burst_budget", &SabreConfig::swap_burst_budget)
-        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight);
+        .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
+        .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From 94c20f4539b924447272d0081bafc17a31a9e5cd Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 25 Apr 2026 21:20:47 +0200
Subject: [PATCH 161/232] more sabre iter

---
 benchmark_PartAM.py                           | 20 ++++----
 .../src-cpp/sabre_router/sabre_router.cpp     | 41 +++++++++++++---
 squander/synthesis/PartAM.py                  | 49 +++++++++++++------
 3 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/benchmark_PartAM.py b/benchmark_PartAM.py
index 249a5bfd4..31398e013 100644
--- a/benchmark_PartAM.py
+++ b/benchmark_PartAM.py
@@ -68,31 +68,29 @@ def run_once(circ_orig, parameters_orig, topology):
         'topology': topology,
         'verbosity': 0,
         'cleanup': True,
-        'sabre_iterations':5,
-        'n_layout_trials':64,
+        'sabre_iterations':20,
+        'n_layout_trials':128,
         'random_seed':random.randint(1,100),
-        # Diagnostic routing mode: score every candidate exactly. The current
-        # PartAM router bypasses prefiltering, so this is kept only for older
-        # code paths/config visibility.
-        'prefilter_top_k': 5000,
+        # Cheap candidate prefilter before full A* scoring.
+        'prefilter_top_k': 50,
         # Rank every layout trial by actual constructed routing, not only by
         # the heuristic trial cost.
         'actual_routing_rank_top_k': None,
         'top_k_pi': 1,
-        'cnot_cost': 1.0 / 3.0,  # old: swap_cost=15, local_cost_weight=1.0 -> 15:1 swap:cnot
+        'cnot_cost': 0.5 / 3.0,  # old: swap_cost=15, local_cost_weight=1.0 -> 15:1 swap:cnot
         'cleanup_top_k': 3,
         "parallel_layout_trials": True,
         "layout_trial_workers": 0,
         'max_E_size': 40,
         'max_lookahead': 6,
-        'E_weight': 0.7,
+        'E_weight': 0.3,
         'E_alpha': 1.0,        # LightSABRE-style uniform lookahead (no per-depth decay)
         # Disable extra routing heuristics while diagnosing 3-qubit partition
         # quality.
         'decay_delta': 0.0,
-        'swap_burst_budget': 0,
+        'swap_burst_budget': 5,
         'path_tiebreak_weight': 0.0,
-        'three_qubit_exit_weight': 2.0,
+        'three_qubit_exit_weight': 1.5,
     }
 
     # Clean the initial circuit using the same config pattern as in PartAM.py
@@ -106,7 +104,7 @@ def run_once(circ_orig, parameters_orig, topology):
     cleanup_config['pre-opt-strategy'] = 'TreeSearch'
 
     wco = qgd_Wide_Circuit_Optimization(cleanup_config)
-    circ_orig, parameters_orig = wco.OptimizeWideCircuit(circ_orig.get_Flat_Circuit(), parameters_orig)
+    #circ_orig, parameters_orig = wco.OptimizeWideCircuit(circ_orig.get_Flat_Circuit(), parameters_orig)
 
     start = time.time()
     pam = Partition_Aware_Mapping(config)
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 168df19f7..c283484d3 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1213,7 +1213,6 @@ std::vector<const CandidateData*> SabreRouter::obtain_partition_candidates(
 
 // ---------------------------------------------------------------------------
 // prefilter_candidates
-// Disabled for routing quality: exact scoring sees every candidate.
 // ---------------------------------------------------------------------------
 
 std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
@@ -1225,13 +1224,39 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
-    (void)pi;
-    (void)top_k;
-    (void)F_snapshot;
-    (void)E;
-    (void)reverse;
-    (void)canonical_data;
-    return candidates;
+    if (static_cast<int>(candidates.size()) <= top_k) return candidates;
+    if (top_k <= 0) return {};
+
+    using Pair = std::pair<double, const CandidateData*>;
+    std::vector<Pair> estimated;
+    estimated.reserve(candidates.size());
+    for (const auto* cand : candidates) {
+        const auto approx_output = estimate_candidate_output_layout(
+            *cand, pi, reverse);
+        const double est = routing_objective(
+            static_cast<double>(estimate_swap_count(*cand, pi, reverse)),
+            cand->cnot_count
+        ) + future_context_cost(
+            cand->partition_idx, approx_output, F_snapshot, E, reverse,
+            canonical_data);
+        estimated.push_back({est, cand});
+    }
+
+    std::nth_element(
+        estimated.begin(),
+        estimated.begin() + top_k,
+        estimated.end(),
+        [](const Pair& a, const Pair& b) {
+            return a.first < b.first;
+        }
+    );
+
+    std::vector<const CandidateData*> result;
+    result.reserve(top_k);
+    for (int i = 0; i < top_k; i++) {
+        result.push_back(estimated[i].second);
+    }
+    return result;
 }
 
 // ---------------------------------------------------------------------------
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 451348cca..22aad9471 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1364,21 +1364,40 @@ def _prefilter_candidates(
         alpha=1.0,
         canonical_data=None,
     ):
-        """Return all candidates; cheap prefiltering is disabled for routing quality."""
-        del (
-            pi,
-            D,
-            top_k,
-            F,
-            E,
-            candidate_cache,
-            layout_partitions,
-            reverse,
-            W,
-            alpha,
-            canonical_data,
-        )
-        return partition_candidates
+        """Pre-filter candidates using cheap swap-count estimate before full A* scoring."""
+        if top_k <= 0:
+            return []
+        if len(partition_candidates) <= top_k:
+            return partition_candidates
+        cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
+        estimates = np.array([
+            (
+                self._routing_objective(
+                    pc.estimate_swap_count(pi, D, reverse=reverse),
+                    pc.cnot_count,
+                    cnot_cost,
+                )
+                + self._future_context_cost(
+                    pc.partition_idx,
+                    self._estimate_candidate_output_layout(
+                        pc, pi, reverse=reverse
+                    ),
+                    F or (),
+                    E or (),
+                    D,
+                    candidate_cache,
+                    reverse=reverse,
+                    cnot_cost=cnot_cost,
+                    W=W,
+                    alpha=alpha,
+                    layout_partitions=layout_partitions,
+                    canonical_data=canonical_data,
+                )
+            )
+            for pc in partition_candidates
+        ])
+        top_k_indices = np.argpartition(estimates, top_k)[:top_k]
+        return [partition_candidates[i] for i in top_k_indices]
 
     @staticmethod
     def _decay_factor_for_swaps(swaps, decay):

From 7e9e95d637b45251ea3834475a3f37409f3df478 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 27 Apr 2026 11:28:43 +0200
Subject: [PATCH 162/232] new partition weights

---
 squander/synthesis/PartAM.py | 103 ++++++++++++++++++++++++++---------
 1 file changed, 76 insertions(+), 27 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 22aad9471..33313013c 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -24,7 +24,6 @@
     _get_topo_order,
     topo_sort_partitions,
     ilp_global_optimal,
-    parts_to_overlap_scores,
 )
 # Module-level globals for pool workers (set via Pool initializer)
 _worker_config = None
@@ -159,8 +158,9 @@ def __init__(self, config):
             )
             self.config['path_tiebreak_weight'] = 0.49
         self.config.setdefault('cnot_cost', 1.0 / 3.0)  # 1 SWAP = 3 CNOTs
-        self.config.setdefault('overlap_tiebreak', True)
         self.config.setdefault('three_qubit_exit_weight', 1.0)
+        self.config.setdefault('size_density_weight', False)
+        self.config.setdefault('sparse_penalty', 3.0)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -203,6 +203,46 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
     # Static Synthesis Helpers (extracted from SynthesizeWideCircuit)
     # ------------------------------------------------------------------------
 
+    @staticmethod
+    def _parts_to_density_weights(allparts, gate_dict, sparse_penalty=3.0):
+        """Per-part ILP weights that penalise sparse 3-qubit partitions.
+
+        Penalty by active-pair count for a 3q partition:
+          1 pair  -> sparse_penalty        (e.g. 3 → total ILP cost 4)
+          2 pairs -> sparse_penalty / 3    (e.g. 1 → total ILP cost 2)
+          3 pairs -> 0                     (no penalty)
+        For 2q (or 1q) partitions the weight is always 0.
+        """
+        N = max(len(allparts), 1)
+        weights = []
+        for part in allparts:
+            qubits_in_part = set()
+            for gate_idx in part:
+                gate = gate_dict.get(gate_idx)
+                if gate is not None:
+                    qubits_in_part.update(gate.get_Involved_Qbits())
+            if len(qubits_in_part) != 3:
+                weights.append(0.0)
+                continue
+            active_pairs = set()
+            for gate_idx in part:
+                gate = gate_dict.get(gate_idx)
+                if gate is None:
+                    continue
+                qbs = list(gate.get_Involved_Qbits())
+                for a in range(len(qbs)):
+                    for b in range(a + 1, len(qbs)):
+                        active_pairs.add((min(qbs[a], qbs[b]), max(qbs[a], qbs[b])))
+            n_pairs = len(active_pairs)
+            if n_pairs >= 3:
+                penalty = 0.0
+            elif n_pairs == 2:
+                penalty = sparse_penalty / 3.0
+            else:
+                penalty = sparse_penalty
+            weights.append(penalty / N)
+        return weights
+
     @staticmethod
     def _topo_key(mini_topology):
         return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
@@ -496,11 +536,13 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         # absorb routing SWAPs. Overlap-based tie-breaker (when enabled)
         # picks deterministically among min-count covers, preferring covers
         # whose parts share more logical qubits with their DAG successors.
-        if self.config['overlap_tiebreak']:
-            tb_weights = parts_to_overlap_scores(allparts, g, gate_to_qubit)
-            L_parts, _ = ilp_global_optimal(allparts, g, weights=tb_weights)
-        else:
-            L_parts, _ = ilp_global_optimal(allparts, g)
+        ilp_weights = None
+        if self.config.get('size_density_weight', False):
+            sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
+            ilp_weights = self._parts_to_density_weights(
+                allparts, gate_dict, sparse_penalty=sparse_penalty
+            )
+        L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----
         selected_surrounded_starts = set()
@@ -612,11 +654,13 @@ def _run_parallel_synthesis(self, partition_meta):
                     meta['qbit_map'],
                 )
 
-            # ---- Stage 1: fix random P_o, sweep all P_i ----
+            # ---- Stage 1: sweep all P_i (and all P_o for N==2 partitions) ----
+            # For N==2 there are only 4 permutation pairs total, so we enumerate
+            # them all here and skip Stage 2 for those partitions.
             stage1_futures = []
             stage1_cached = []
-            stage1_P_o = {}
             known_pairs = {}
+            full_enum_keys = set()  # (partition_idx, topology_idx) fully covered in S1
 
             for partition_idx, meta in enumerate(partition_meta):
                 if meta is None:
@@ -624,37 +668,40 @@ def _run_parallel_synthesis(self, partition_meta):
                 N = meta['N']
                 perms_all = list(permutations(range(N)))
                 for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
-                    P_o_initial = perms_all[np.random.choice(len(perms_all))]
-                    stage1_P_o[(partition_idx, topology_idx)] = P_o_initial
-                    for P_i in perms_all:
-                        Umtx = self._build_permuted_unitary(meta, P_i, P_o_initial)
-                        ck = self._cache_key(Umtx, mini_topology)
-                        if ck in decomp_cache:
-                            stage1_cached.append((partition_idx, topology_idx, P_i, ck))
-                        else:
-                            future = pool.apply_async(
-                                _decompose_one, (Umtx, mini_topology)
-                            )
-                            stage1_futures.append((partition_idx, topology_idx, P_i, ck, future))
+                    if N == 2:
+                        full_enum_keys.add((partition_idx, topology_idx))
+                        po_sweep = perms_all
+                    else:
+                        po_sweep = [perms_all[np.random.choice(len(perms_all))]]
+                    for P_o in po_sweep:
+                        for P_i in perms_all:
+                            Umtx = self._build_permuted_unitary(meta, P_i, P_o)
+                            ck = self._cache_key(Umtx, mini_topology)
+                            if ck in decomp_cache:
+                                stage1_cached.append((partition_idx, topology_idx, P_i, P_o, ck))
+                            else:
+                                future = pool.apply_async(
+                                    _decompose_one, (Umtx, mini_topology)
+                                )
+                                stage1_futures.append((partition_idx, topology_idx, P_i, P_o, ck, future))
 
             # Process Stage 1 cache hits immediately
-            for partition_idx, topology_idx, P_i, ck in stage1_cached:
+            for partition_idx, topology_idx, P_i, P_o, ck in stage1_cached:
                 meta = partition_meta[partition_idx]
                 N = meta['N']
-                P_o_initial = stage1_P_o[(partition_idx, topology_idx)]
                 mini_topology = meta['mini_topologies'][topology_idx]
                 synth_circuit, synth_params, synth_err = decomp_cache[ck]
                 if synth_err <= self.config['tolerance']:
                     pair_key = (partition_idx, topology_idx)
                     self._add_result_with_auts(
-                        results_map[partition_idx], (P_i, P_o_initial),
+                        results_map[partition_idx], (P_i, P_o),
                         synth_circuit, synth_params, topology_idx,
                         N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                     )
 
             # Collect Stage 1 pool results
             cache_hits_s1 = len(stage1_cached)
-            for partition_idx, topology_idx, P_i, ck, future in tqdm(
+            for partition_idx, topology_idx, P_i, P_o, ck, future in tqdm(
                 stage1_futures, desc=f"Stage 1 Synthesis ({cache_hits_s1} cached)",
                 disable=disable_pbar
             ):
@@ -662,17 +709,17 @@ def _run_parallel_synthesis(self, partition_meta):
                 decomp_cache[ck] = (synth_circuit, synth_params, synth_err)
                 meta = partition_meta[partition_idx]
                 N = meta['N']
-                P_o_initial = stage1_P_o[(partition_idx, topology_idx)]
                 mini_topology = meta['mini_topologies'][topology_idx]
                 if synth_err <= self.config['tolerance']:
                     pair_key = (partition_idx, topology_idx)
                     self._add_result_with_auts(
-                        results_map[partition_idx], (P_i, P_o_initial),
+                        results_map[partition_idx], (P_i, P_o),
                         synth_circuit, synth_params, topology_idx,
                         N, mini_topology, known_pairs, pair_key, use_auts, aut_cache
                     )
 
             # ---- Stage 2: fix top-k P_i from Stage 1, sweep all P_o ----
+            # Skipped for partitions already fully enumerated in Stage 1 (N==2).
             top_k_pi = self.config.get('top_k_pi', 1)
             stage2_futures = []
             stage2_cached = []
@@ -684,6 +731,8 @@ def _run_parallel_synthesis(self, partition_meta):
                 perms_all = list(permutations(range(N)))
                 result = results_map[partition_idx]
                 for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
+                    if (partition_idx, topology_idx) in full_enum_keys:
+                        continue
                     pair_key = (partition_idx, topology_idx)
                     kp = known_pairs.get(pair_key, set()) if use_auts else set()
                     for P_i_cand in result.get_top_k_results(topology_idx, top_k_pi):

From 993becdaf351ec4370946c9baa961122a512db90 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 27 Apr 2026 11:33:53 +0200
Subject: [PATCH 163/232] update benchmark

---
 benchmark_PartAM.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmark_PartAM.py b/benchmark_PartAM.py
index 31398e013..fa4fed3d6 100644
--- a/benchmark_PartAM.py
+++ b/benchmark_PartAM.py
@@ -85,12 +85,12 @@ def run_once(circ_orig, parameters_orig, topology):
         'max_lookahead': 6,
         'E_weight': 0.3,
         'E_alpha': 1.0,        # LightSABRE-style uniform lookahead (no per-depth decay)
-        # Disable extra routing heuristics while diagnosing 3-qubit partition
-        # quality.
-        'decay_delta': 0.0,
+        'decay_delta': 0.001,
         'swap_burst_budget': 5,
-        'path_tiebreak_weight': 0.0,
+        'path_tiebreak_weight': 0.2,
         'three_qubit_exit_weight': 1.5,
+        'size_density_weight': True,
+        'sparse_penalty': 3.0,
     }
 
     # Clean the initial circuit using the same config pattern as in PartAM.py

From fabf287c1bd761f33f8f67ef0418c335aab18edd Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 27 Apr 2026 12:00:07 +0200
Subject: [PATCH 164/232] Add back in printing

---
 squander/synthesis/PartAM.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 33313013c..6b60225b5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -565,6 +565,15 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
         n_multi = len(L_parts)
 
+        size_counts = {}
+        for gates in selected_parts_gates:
+            involved = set()
+            for g in gates:
+                involved.update(gate_dict[g].get_Involved_Qbits())
+            size = len(involved)
+            size_counts[size] = size_counts.get(size, 0) + 1
+        print(f"Selected partitions: 2-qubit={size_counts.get(2, 0)}, 3-qubit={size_counts.get(3, 0)}, total_multi={sum(size_counts.get(s, 0) for s in size_counts if s > 1)}")
+
         # ---- Phase 4: Assemble partitioned circuit from selected partitions only ----
         partitioned_circuit = Circuit(qbit_num_orig_circuit)
         params = []

From b4662bd8d76833818e974f552936804609593346 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 27 Apr 2026 14:40:40 +0200
Subject: [PATCH 165/232] Speedup

---
 .../src-cpp/sabre_router/sabre_router.cpp     | 81 ++++++++++---------
 1 file changed, 44 insertions(+), 37 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index c283484d3..189c3392c 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -689,27 +689,7 @@ SabreRouter::find_constrained_swaps(
         double h_sum;       // sum(dist(pos[i], target[i])) — twice the admissible h
         double nb_total;    // sum(edge.weight * dist(...)) — pre-scale
         int nb_arena_idx;   // -1 if !use_neighbor; else slot in nb_pos_flat
-    };
-    struct StateKey {
-        int64_t packed;
-        std::vector<int> nb_pos;
-
-        bool operator==(const StateKey& other) const {
-            return packed == other.packed && nb_pos == other.nb_pos;
-        }
-    };
-    struct StateKeyHash {
-        size_t operator()(const StateKey& key) const {
-            uint64_t h = static_cast<uint64_t>(key.packed);
-            h ^= h >> 33; h *= 0xff51afd7ed558ccdULL;
-            h ^= h >> 33; h *= 0xc4ceb9fe1a85ec53ULL;
-            h ^= h >> 33;
-            for (int v : key.nb_pos) {
-                h ^= static_cast<uint64_t>(v) + 0x9e3779b97f4a7c15ULL
-                   + (h << 6) + (h >> 2);
-            }
-            return static_cast<size_t>(h);
-        }
+        uint64_t nb_hash;   // incremental XOR hash of neighbor VQ positions
     };
     thread_local std::vector<Node> arena;
     // Flat storage for neighbor positions: slot s lives at
@@ -721,7 +701,9 @@ SabreRouter::find_constrained_swaps(
     arena.clear();
     nb_pos_flat.clear();
     arena.reserve(1024);
-    std::unordered_map<StateKey, int32_t, StateKeyHash> best_node;
+    // key = mix(packed) ^ nb_hash; no heap allocation per lookup
+    thread_local std::unordered_map<uint64_t, int32_t> best_node;
+    best_node.clear();
     best_node.reserve(2048);
 
     const int nb_stride = use_neighbor
@@ -744,19 +726,29 @@ SabreRouter::find_constrained_swaps(
         nb_scratch.resize(nb_stride);
     }
 
-    auto make_state_key = [&](int64_t packed, int nb_arena_idx) {
-        StateKey key;
-        key.packed = packed;
-        if (use_neighbor) {
-            const size_t base = static_cast<size_t>(nb_arena_idx) * nb_stride;
-            key.nb_pos.assign(
-                nb_pos_flat.begin() + static_cast<std::ptrdiff_t>(base),
-                nb_pos_flat.begin() + static_cast<std::ptrdiff_t>(base + nb_stride)
-            );
-        }
-        return key;
+    // Per-(vq_idx, phys) contribution to nb_hash; XOR-based so removals are
+    // identical to additions (self-inverse), enabling incremental updates.
+    auto slot_hash = [](int vq_idx, int phys) -> uint64_t {
+        uint64_t h = static_cast<uint64_t>(vq_idx) * 0x9e3779b97f4a7c15ULL
+                   ^ static_cast<uint64_t>(phys)   * 0x6c62272e07bb0142ULL;
+        h ^= h >> 33; h *= 0xff51afd7ed558ccdULL; h ^= h >> 33;
+        return h;
+    };
+    auto make_key = [](int64_t packed, uint64_t nb_hash) -> uint64_t {
+        uint64_t h = static_cast<uint64_t>(packed);
+        h ^= h >> 33; h *= 0xff51afd7ed558ccdULL;
+        h ^= h >> 33; h *= 0xc4ceb9fe1a85ec53ULL;
+        h ^= h >> 33;
+        return h ^ nb_hash;
     };
 
+    uint64_t initial_nb_hash = 0;
+    if (use_neighbor) {
+        for (int z = 0; z < nb_stride; z++) {
+            initial_nb_hash ^= slot_hash(z, neighbor_info->initial_pos[z]);
+        }
+    }
+
     // ---- Push initial node ----
     // Slot 0 of nb_pos_flat already holds neighbor_info->initial_pos.
     {
@@ -768,8 +760,9 @@ SabreRouter::find_constrained_swaps(
         n.h_sum = h0_sum;
         n.nb_total = initial_nb_total;
         n.nb_arena_idx = use_neighbor ? 0 : -1;
+        n.nb_hash = initial_nb_hash;
         arena.push_back(n);
-        best_node.emplace(make_state_key(initial_packed, n.nb_arena_idx), 0);
+        best_node.emplace(make_key(initial_packed, initial_nb_hash), 0);
     }
 
     // PQ entry: (f, g, counter, arena_idx)
@@ -787,12 +780,13 @@ SabreRouter::find_constrained_swaps(
         (void)f; (void)ctr;
         const int g = g_e;
         const int64_t packed = arena[idx].packed;
+        const uint64_t cur_nb_hash = arena[idx].nb_hash;
 
         // A state can be reinserted with a lower g-cost after this queue entry
         // was pushed. When the neighbor tie-breaker is active, future-qubit
         // positions are part of the state so equal-length paths with different
         // bystander layouts are not collapsed.
-        StateKey cur_key = make_state_key(packed, arena[idx].nb_arena_idx);
+        const uint64_t cur_key = make_key(packed, cur_nb_hash);
         auto cur_best = best_node.find(cur_key);
         if (cur_best == best_node.end() || cur_best->second != idx) {
             continue;
@@ -829,6 +823,7 @@ SabreRouter::find_constrained_swaps(
         const double cur_h_sum = arena[idx].h_sum;
         const double cur_nb_total = arena[idx].nb_total;
         const int cur_nb_arena_idx = arena[idx].nb_arena_idx;
+        // cur_nb_hash already read above
 
         // Expand: every SWAP that moves at least one partition qubit
         for (int i = 0; i < k; i++) {
@@ -865,6 +860,7 @@ SabreRouter::find_constrained_swaps(
                 // contributes the same dist as in the parent state.
                 double new_nb_total = cur_nb_total;
                 int new_nb_arena_idx = -1;
+                uint64_t new_nb_hash = cur_nb_hash;
                 if (use_neighbor) {
                     const size_t parent_base =
                         static_cast<size_t>(cur_nb_arena_idx) * nb_stride;
@@ -901,12 +897,22 @@ SabreRouter::find_constrained_swaps(
                         nb_pos_flat.insert(nb_pos_flat.end(),
                                            nb_scratch.begin(),
                                            nb_scratch.end());
+                        // Incremental hash: XOR out old slots, XOR in new ones
+                        if (idx_nb_vq >= 0) {
+                            new_nb_hash ^= slot_hash(idx_nb_vq, nb)
+                                         ^ slot_hash(idx_nb_vq, p);
+                        }
+                        if (idx_p_vq >= 0) {
+                            new_nb_hash ^= slot_hash(idx_p_vq, p)
+                                         ^ slot_hash(idx_p_vq, nb);
+                        }
                     } else {
                         new_nb_arena_idx = cur_nb_arena_idx;
+                        // new_nb_hash unchanged
                     }
                 }
 
-                StateKey new_key = make_state_key(new_packed, new_nb_arena_idx);
+                const uint64_t new_key = make_key(new_packed, new_nb_hash);
                 auto existing = best_node.find(new_key);
                 if (existing != best_node.end()
                     && arena[existing->second].g <= new_g
@@ -925,10 +931,11 @@ SabreRouter::find_constrained_swaps(
                 n.h_sum = new_h_sum;
                 n.nb_total = new_nb_total;
                 n.nb_arena_idx = new_nb_arena_idx;
+                n.nb_hash = new_nb_hash;
 
                 int32_t new_idx = static_cast<int32_t>(arena.size());
                 arena.push_back(n);
-                best_node[std::move(new_key)] = new_idx;
+                best_node[new_key] = new_idx;
 
                 const double f_new = static_cast<double>(new_g)
                                    + 0.5 * new_h_sum

From 8f4e73f981de48d65b3a8c1596d7c6afca350618 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 28 Apr 2026 22:08:02 +0200
Subject: [PATCH 166/232] try new things

---
 .../qgd_Wide_Circuit_Optimization.py          | 225 ++++++++++++++++++
 1 file changed, 225 insertions(+)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 9dcd81773..7e6dfaabe 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1283,6 +1283,183 @@ def gen_all_min_cnots(
 # N_Qubit_Decomposition_Guided_Tree.gen_all_min_cnots(3); assert False
 # N_Qubit_Decomposition_Guided_Tree.build_sequence(); assert False
 # print(len(list(N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS(3, [(0,1),(1,2),])))); assert False
+
+
+def generate_squander_seqpam(squander_config, block_size):
+    """Build a bqskit SeqPAM workflow using Squander as the inner synthesis engine.
+
+    Partitioning uses squander's ILP (same logic as PartAM.SynthesizeWideCircuit),
+    with optional density-penalty weights for sparse 3-qubit blocks when
+    ``squander_config['size_density_weight']`` is True.
+
+    Args:
+        squander_config: Config dict passed to SquanderSynthesisPass (bqskit-squander keys:
+            ``strategy`` ("Tree_search"/"Tabu_search"), ``verbosity``,
+            ``optimization_tolerance``, ``optimizer_engine``, etc.).
+            Also read by SquanderILPPartitioner: ``size_density_weight`` (bool),
+            ``sparse_penalty`` (float).
+        block_size: Maximum block size for ILP partitioning and SubtopologySelectionPass.
+
+    Returns:
+        bqskit Workflow implementing the two-stage permutation-aware mapping.
+    """
+    from bqskit.passes import (
+        SquanderSynthesisPass,
+        ForEachBlockPass,
+        EmbedAllPermutationsPass,
+        PAMRoutingPass,
+        PAMLayoutPass,
+        PAMVerificationSequence,
+        SubtopologySelectionPass,
+        ApplyPlacement,
+        UnfoldPass,
+        ExtractModelConnectivityPass,
+        RestoreModelConnectivityPass,
+        LogPass,
+    )
+    from bqskit.passes.control import IfThenElsePass
+    from bqskit.passes.control.predicates import NotPredicate, WidthPredicate
+    from bqskit.compiler import Workflow, BasePass
+
+    class SquanderILPPartitioner(BasePass):
+        """Partition a bqskit circuit using squander's ILP with PartAM density weights.
+
+        Mirrors the partition-selection logic of PartAM.SynthesizeWideCircuit:
+        get_all_partitions → ilp_global_optimal (with optional density weights)
+        → wrap each selected partition as a bqskit CircuitGate block.
+        """
+
+        def __init__(self, block_size, squander_config):
+            self.block_size = block_size
+            self.squander_config = squander_config
+
+        async def run(self, circuit, data):
+            from bqskit.ir import Circuit as BQCircuit
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from qiskit import QuantumCircuit as QkCircuit, qasm2 as qasm2_module
+            from squander import Qiskit_IO
+            from squander.partitioning.ilp import (
+                get_all_partitions, _get_topo_order, ilp_global_optimal,
+            )
+            from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping
+
+            # bqskit → squander via QASM; gate order is preserved
+            qasm_str = OPENQASM2Language().encode(circuit)
+            qk_circ = QkCircuit.from_qasm_str(qasm_str)
+            sqdr_circ, _ = Qiskit_IO.convert_Qiskit_to_Squander(qk_circ)
+
+            # Enumerate candidate partitions
+            allparts, g, go, rgo, sq_chains, gate_to_qubit, _ = \
+                get_all_partitions(sqdr_circ, self.block_size)
+            gate_dict = {i: gate for i, gate in enumerate(sqdr_circ.get_Gates())}
+
+            # ILP selection with optional density-penalty weights
+            ilp_weights = None
+            if self.squander_config.get('size_density_weight', False):
+                sparse_penalty = float(self.squander_config.get('sparse_penalty', 3.0))
+                ilp_weights = qgd_Partition_Aware_Mapping._parts_to_density_weights(
+                    allparts, gate_dict, sparse_penalty=sparse_penalty
+                )
+            L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
+
+            # bqskit_ops[i] corresponds to squander gate index i (QASM order preserved)
+            bqskit_ops = list(circuit.operations_with_cycles())
+
+            # Single-qubit chains that are surrounded by selected partitions
+            sqc_pre     = {x[0]: x for x in sq_chains if rgo[x[0]]}
+            sqc_post    = {x[-1]: x for x in sq_chains if go[x[-1]]}
+            sqc_prepost = {x[0]: x for x in sq_chains
+                           if x[0] in sqc_pre and x[-1] in sqc_post}
+
+            covered = set()
+            partitioned = BQCircuit(circuit.num_qudits, circuit.radixes)
+
+            for i in L_parts:
+                part = allparts[i]
+                surrounded = {
+                    t for s in part for t in go[s]
+                    if t in sqc_prepost
+                    and go[sqc_prepost[t][-1]]
+                    and next(iter(go[sqc_prepost[t][-1]])) in part
+                }
+                gate_idxs = frozenset.union(part, *(sqc_prepost[v] for v in surrounded))
+
+                global_qudits = sorted({
+                    q for gi in gate_idxs
+                    for q in gate_dict[gi].get_Involved_Qbits()
+                })
+                local_map = {gq: l for l, gq in enumerate(global_qudits)}
+
+                topo = _get_topo_order(
+                    {x: go[x] & gate_idxs for x in gate_idxs},
+                    {x: rgo[x] & gate_idxs for x in gate_idxs},
+                    gate_to_qubit,
+                )
+                sub = BQCircuit(len(global_qudits))
+                for gi in topo:
+                    _, op = bqskit_ops[gi]
+                    sub.append(op.gate, [local_map[q] for q in op.location])
+
+                partitioned.append_circuit(sub, global_qudits, as_circuit_gate=True)
+                covered.update(gate_idxs)
+
+            # Standalone single-qubit gates → 1-qudit blocks
+            for gi, (_, op) in enumerate(bqskit_ops):
+                if gi not in covered:
+                    sub_1q = BQCircuit(1)
+                    sub_1q.append(op.gate, [0])
+                    partitioned.append_circuit(sub_1q, list(op.location), as_circuit_gate=True)
+
+            circuit.become(partitioned, False)
+
+    squander    = SquanderSynthesisPass(squander_config)
+    partitioner = SquanderILPPartitioner(block_size, squander_config)
+    post_pam_seq: BasePass = PAMVerificationSequence(8)
+
+    return Workflow(
+        IfThenElsePass(
+            NotPredicate(WidthPredicate(2)),
+            [
+                LogPass("Caching permutation-aware synthesis results."),
+                ExtractModelConnectivityPass(),
+                partitioner,
+                ForEachBlockPass(
+                    EmbedAllPermutationsPass(
+                        inner_synthesis=squander,
+                        input_perm=True,
+                        output_perm=False,
+                        vary_topology=False,
+                    ),
+                ),
+                LogPass("Preoptimizing with permutation-aware mapping."),
+                PAMRoutingPass(),
+                post_pam_seq,
+                UnfoldPass(),
+                RestoreModelConnectivityPass(),
+                LogPass("Recaching permutation-aware synthesis results."),
+                SubtopologySelectionPass(block_size),
+                partitioner,
+                ForEachBlockPass(
+                    EmbedAllPermutationsPass(
+                        inner_synthesis=squander,
+                        input_perm=False,
+                        output_perm=True,
+                        vary_topology=True,
+                    ),
+                ),
+                LogPass("Performing permutation-aware mapping."),
+                ApplyPlacement(),
+                PAMLayoutPass(3),
+                PAMRoutingPass(0.1),
+                post_pam_seq,
+                ApplyPlacement(),
+                UnfoldPass(),
+            ],
+        ),
+        name="SeqPAM Mapping",
+    )
+
+
 class qgd_Wide_Circuit_Optimization:
     """Optimize wide (many-qubit) circuits via partitioning and subcircuit decomposition.
 
@@ -1313,6 +1490,7 @@ def __init__(self, config):
             "TreeGuided",
             "qiskit",
             "bqskit",
+            "seqpam_PartAM",
         ]
         if not strategy in allowed_startegies:
             raise Exception(
@@ -1994,6 +2172,53 @@ def OptimizeWideCircuit(
             self.check_compare_circuits(circ, parameters, newcirc, newparameters)
             circ, parameters = newcirc, newparameters
 
+        elif self.config["strategy"] == "seqpam_PartAM":
+            if self.config["verbosity"] >= 1:
+                print("Optimizing circuit with BQSKit SeqPAM + Squander (PartAM ILP weights)")
+            from squander import Qiskit_IO
+            from bqskit.compiler import Compiler
+            from bqskit.compiler.machine import MachineModel
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from bqskit.passes import SetModelPass
+            from qiskit import qasm2, QuantumCircuit
+
+            strategy_map = {"TreeSearch": "Tree_search", "TabuSearch": "Tabu_search"}
+            squander_config = {
+                "strategy": strategy_map.get(self.config.get("strategy", "TreeSearch"), "Tree_search"),
+                "optimization_tolerance": self.config.get("tolerance", 1e-8),
+                "verbosity": self.config.get("verbosity", 0),
+                "optimizer_engine": self.config.get("optimizer_engine", "BFGS"),
+                "Cost_Function_Variant": self.config.get("Cost_Function_Variant", 3),
+                "size_density_weight": True,
+                "sparse_penalty": self.config.get("sparse_penalty", 3.0),
+                "max_partition_size": self.max_partition_size,
+            }
+            block_size = self.max_partition_size
+
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, parameters)
+            bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
+
+            workflow = generate_squander_seqpam(squander_config, block_size)
+
+            with Compiler() as compiler:
+                routed_bqskit_circ = compiler.compile(
+                    bqskit_circ, [SetModelPass(model), workflow]
+                )
+
+            circuit_qiskit = QuantumCircuit.from_qasm_str(
+                OPENQASM2Language().encode(routed_bqskit_circ)
+            )
+            newcirc, newparameters = Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit)
+
+            qgd_Wide_Circuit_Optimization.check_valid_routing(
+                newcirc, self.config["topology"]
+            )
+            if self.config["verbosity"] >= 2:
+                print("OptimizeWideCircuit::check_compare_circuits")
+            self.check_compare_circuits(circ, parameters, newcirc, newparameters)
+            circ, parameters = newcirc, newparameters
+
         elif self.config["strategy"] == "qiskit":
             if self.config["verbosity"] >= 1:
                 print("Optimizing circuit with Qiskit")

From cf2ec052bfdfba4d9125feaabad1b0bc94615d55 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 28 Apr 2026 22:11:39 +0200
Subject: [PATCH 167/232] Fix

---
 squander/decomposition/qgd_Wide_Circuit_Optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 7e6dfaabe..8657e6139 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1412,7 +1412,7 @@ async def run(self, circuit, data):
 
             circuit.become(partitioned, False)
 
-    squander    = SquanderSynthesisPass(squander_config)
+    squander    = SquanderSynthesisPass(squander_config=squander_config)
     partitioner = SquanderILPPartitioner(block_size, squander_config)
     post_pam_seq: BasePass = PAMVerificationSequence(8)
 

From cca7911bcfe06813a2091d32f0c4b573a53d1772 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 29 Apr 2026 23:15:07 +0200
Subject: [PATCH 168/232] fix

---
 .../qgd_Wide_Circuit_Optimization.py          | 200 ++++++++++++++----
 .../sabre_router/include/sabre_router.hpp     |   2 +
 .../src-cpp/sabre_router/sabre_router.cpp     |  53 ++++-
 squander/synthesis/PartAM.py                  | 190 ++++++++++++++++-
 squander/synthesis/bindings.cpp               |   2 +
 5 files changed, 390 insertions(+), 57 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 8657e6139..2a2cd710d 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -1283,6 +1283,9 @@ def gen_all_min_cnots(
 # N_Qubit_Decomposition_Guided_Tree.gen_all_min_cnots(3); assert False
 # N_Qubit_Decomposition_Guided_Tree.build_sequence(); assert False
 # print(len(list(N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS(3, [(0,1),(1,2),])))); assert False
+def _topology_le_to_be(n_qubits, topology):
+    """Convert a topology from squander LE convention to bqskit BE convention."""
+    return [(n_qubits - 1 - i, n_qubits - 1 - j) for i, j in topology]
 
 
 def generate_squander_seqpam(squander_config, block_size):
@@ -1322,12 +1325,7 @@ def generate_squander_seqpam(squander_config, block_size):
     from bqskit.compiler import Workflow, BasePass
 
     class SquanderILPPartitioner(BasePass):
-        """Partition a bqskit circuit using squander's ILP with PartAM density weights.
-
-        Mirrors the partition-selection logic of PartAM.SynthesizeWideCircuit:
-        get_all_partitions → ilp_global_optimal (with optional density weights)
-        → wrap each selected partition as a bqskit CircuitGate block.
-        """
+        """Partition a bqskit circuit using squander's ILP with PartAM density weights."""
 
         def __init__(self, block_size, squander_config):
             self.block_size = block_size
@@ -1343,17 +1341,20 @@ async def run(self, circuit, data):
             )
             from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping
 
-            # bqskit → squander via QASM; gate order is preserved
-            qasm_str = OPENQASM2Language().encode(circuit)
+            # Unfold any CircuitGate blocks (e.g. from a prior SubtopologySelectionPass)
+            # so that bqskit op indices align 1:1 with squander gate indices after the
+            # QASM roundtrip.  unfold_all() is a no-op on already-flat circuits.
+            flat_circuit = circuit.copy()
+            flat_circuit.unfold_all()
+
+            qasm_str = OPENQASM2Language().encode(flat_circuit)
             qk_circ = QkCircuit.from_qasm_str(qasm_str)
             sqdr_circ, _ = Qiskit_IO.convert_Qiskit_to_Squander(qk_circ)
 
-            # Enumerate candidate partitions
             allparts, g, go, rgo, sq_chains, gate_to_qubit, _ = \
                 get_all_partitions(sqdr_circ, self.block_size)
             gate_dict = {i: gate for i, gate in enumerate(sqdr_circ.get_Gates())}
 
-            # ILP selection with optional density-penalty weights
             ilp_weights = None
             if self.squander_config.get('size_density_weight', False):
                 sparse_penalty = float(self.squander_config.get('sparse_penalty', 3.0))
@@ -1362,18 +1363,15 @@ async def run(self, circuit, data):
                 )
             L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
-            # bqskit_ops[i] corresponds to squander gate index i (QASM order preserved)
-            bqskit_ops = list(circuit.operations_with_cycles())
+            bqskit_ops = list(flat_circuit.operations_with_cycles())
 
-            # Single-qubit chains that are surrounded by selected partitions
             sqc_pre     = {x[0]: x for x in sq_chains if rgo[x[0]]}
             sqc_post    = {x[-1]: x for x in sq_chains if go[x[-1]]}
             sqc_prepost = {x[0]: x for x in sq_chains
                            if x[0] in sqc_pre and x[-1] in sqc_post}
 
-            covered = set()
-            partitioned = BQCircuit(circuit.num_qudits, circuit.radixes)
-
+            # Build expanded gate_idxs per ILP partition (include surrounding 1q gates)
+            expanded = {}
             for i in L_parts:
                 part = allparts[i]
                 surrounded = {
@@ -1383,38 +1381,106 @@ async def run(self, circuit, data):
                     and next(iter(go[sqc_prepost[t][-1]])) in part
                 }
                 gate_idxs = frozenset.union(part, *(sqc_prepost[v] for v in surrounded))
+                expanded[i] = gate_idxs
 
-                global_qudits = sorted({
-                    q for gi in gate_idxs
-                    for q in gate_dict[gi].get_Involved_Qbits()
-                })
-                local_map = {gq: l for l, gq in enumerate(global_qudits)}
-
-                topo = _get_topo_order(
-                    {x: go[x] & gate_idxs for x in gate_idxs},
-                    {x: rgo[x] & gate_idxs for x in gate_idxs},
-                    gate_to_qubit,
-                )
-                sub = BQCircuit(len(global_qudits))
-                for gi in topo:
-                    _, op = bqskit_ops[gi]
-                    sub.append(op.gate, [local_map[q] for q in op.location])
-
-                partitioned.append_circuit(sub, global_qudits, as_circuit_gate=True)
-                covered.update(gate_idxs)
+            # Further expand: include ALL intermediate gates on partition qubits
+            for i in L_parts:
+                gate_idxs = expanded[i]
+                part_qubits = set()
+                for gi in gate_idxs:
+                    part_qubits.update(gate_dict[gi].get_Involved_Qbits())
+                lo = min(gate_idxs)
+                hi = max(gate_idxs)
+                extra = set()
+                for gi in range(lo, hi + 1):
+                    if gi not in gate_idxs:
+                        gq = set(gate_dict[gi].get_Involved_Qbits())
+                        if gq & part_qubits:
+                            extra.add(gi)
+                if extra:
+                    expanded[i] = gate_idxs | frozenset(extra)
+
+            # Sort partitions by their minimum gate index to preserve original order
+            seen_parts = set()
+            sorted_parts = []
+            for i in L_parts:
+                gate_idxs = expanded[i]
+                part_key = min(gate_idxs)
+                if part_key not in seen_parts:
+                    seen_parts.add(part_key)
+                    sorted_parts.append((part_key, gate_idxs))
+            sorted_parts.sort(key=lambda x: x[0])
+            print(f"[ILP Partitioner] {len(sorted_parts)} partitions, expanded gate counts: {[len(gi) for _, gi in sorted_parts]}")
+
+            # Map gate_idx -> sorted partition index
+            gate_to_part = {}
+            for pidx, (_, gate_idxs) in enumerate(sorted_parts):
+                for gi in gate_idxs:
+                    gate_to_part[gi] = pidx
+
+            # Build partitioned circuit by iterating gates in original order
+            partitioned = BQCircuit(circuit.num_qudits, circuit.radixes)
+            built_parts = set()
 
-            # Standalone single-qubit gates → 1-qudit blocks
             for gi, (_, op) in enumerate(bqskit_ops):
-                if gi not in covered:
+                pidx = gate_to_part.get(gi, -1)
+
+                if pidx >= 0 and pidx not in built_parts:
+                    built_parts.add(pidx)
+                    _, gate_idxs = sorted_parts[pidx]
+                    global_qudits = sorted({
+                        q for ggi in gate_idxs
+                        for q in gate_dict[ggi].get_Involved_Qbits()
+                    })
+                    local_map = {gq: l for l, gq in enumerate(global_qudits)}
+
+                    topo = _get_topo_order(
+                        {x: go[x] & gate_idxs for x in gate_idxs},
+                        {x: rgo[x] & gate_idxs for x in gate_idxs},
+                        gate_to_qubit,
+                    )
+                    sub = BQCircuit(len(global_qudits))
+                    for ggi in topo:
+                        _, gop = bqskit_ops[ggi]
+                        sub.append_gate(gop.gate, [local_map[q] for q in gop.location], gop.params)
+                    partitioned.append_circuit(sub, global_qudits, as_circuit_gate=True)
+
+                elif pidx < 0:
                     sub_1q = BQCircuit(1)
-                    sub_1q.append(op.gate, [0])
+                    sub_1q.append_gate(op.gate, [0], op.params)
                     partitioned.append_circuit(sub_1q, list(op.location), as_circuit_gate=True)
 
+            # Sanity check: all gates from flat_circuit must appear exactly once
+            flat_check = partitioned.copy()
+            flat_check.unfold_all()
+            n_expected = len(bqskit_ops)
+            n_actual = flat_check.num_operations
+            if n_actual != n_expected:
+                print(f'[ILP Partitioner] WARNING: gate count mismatch! '
+                      f'expected={n_expected}, actual={n_actual}')
+
             circuit.become(partitioned, False)
 
+    class SetPAMInitialPlacementPass(BasePass):
+        """Set the placement used as the starting point for the final PAM layout."""
+
+        def __init__(self, placement):
+            self.placement = None if placement is None else list(placement)
+
+        async def run(self, circuit, data):
+            if self.placement is None:
+                return
+            if len(self.placement) != circuit.num_qudits:
+                raise ValueError(
+                    "PAM initial placement length must match circuit width."
+                )
+            data.placement = list(self.placement)
+
     squander    = SquanderSynthesisPass(squander_config=squander_config)
     partitioner = SquanderILPPartitioner(block_size, squander_config)
     post_pam_seq: BasePass = PAMVerificationSequence(8)
+    num_layout_passes = int(squander_config.get("num_layout_passes", 100))
+    pam_initial_placement = squander_config.get("pam_initial_placement", None)
 
     return Workflow(
         IfThenElsePass(
@@ -1427,7 +1493,7 @@ async def run(self, circuit, data):
                     EmbedAllPermutationsPass(
                         inner_synthesis=squander,
                         input_perm=True,
-                        output_perm=False,
+                        output_perm=True,
                         vary_topology=False,
                     ),
                 ),
@@ -1442,14 +1508,15 @@ async def run(self, circuit, data):
                 ForEachBlockPass(
                     EmbedAllPermutationsPass(
                         inner_synthesis=squander,
-                        input_perm=False,
+                        input_perm=True,
                         output_perm=True,
                         vary_topology=True,
                     ),
                 ),
                 LogPass("Performing permutation-aware mapping."),
                 ApplyPlacement(),
-                PAMLayoutPass(3),
+                SetPAMInitialPlacementPass(pam_initial_placement),
+                PAMLayoutPass(num_layout_passes),
                 PAMRoutingPass(0.1),
                 post_pam_seq,
                 ApplyPlacement(),
@@ -2122,7 +2189,7 @@ def OptimizeWideCircuit(
             )
 
             # Build BQSKit machine model from your topology
-            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
 
             # Convert squander circuit → qiskit → BQSKit
             # (BQSKit has a from_qiskit helper if you go via Qiskit IR)
@@ -2195,7 +2262,7 @@ def OptimizeWideCircuit(
             }
             block_size = self.max_partition_size
 
-            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
             circo = Qiskit_IO.get_Qiskit_Circuit(circ, parameters)
             bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
 
@@ -2733,7 +2800,7 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             from qiskit import qasm2, QuantumCircuit
 
             # Build BQSKit machine model from your topology
-            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
+            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
 
             # Convert squander circuit → qiskit → BQSKit
             # (BQSKit has a from_qiskit helper if you go via Qiskit IR)
@@ -2789,6 +2856,53 @@ async def run(self, circuit: BQSKitCircuit, data=None):
                 pass_data.placement[x] for x in pass_data.final_mapping
             )
 
+        elif strategy == "seqpam_partam":
+            from squander import Qiskit_IO
+            from squander.decomposition.qgd_Wide_Circuit_Optimization import generate_squander_seqpam
+            from bqskit.compiler import Compiler
+            from bqskit.compiler.machine import MachineModel
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from bqskit.passes import SetModelPass
+            from qiskit import qasm2, QuantumCircuit
+
+            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
+            bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
+
+            squander_config = {
+                'strategy': 'Tree_search',
+                'optimization_tolerance': self.config.get('tolerance', 1e-8),
+                'verbosity': self.config.get('verbosity', 0),
+                'optimizer_engine': self.config.get('optimizer_engine', 'BFGS'),
+                'size_density_weight': True,
+                'sparse_penalty': self.config.get('sparse_penalty', 3.0),
+                'max_partition_size': self.max_partition_size,
+                'use_osr':0,
+                'use_graph_search':0,
+            }
+            workflow = generate_squander_seqpam(squander_config, self.max_partition_size)
+
+            with Compiler() as compiler:
+                routed_bqskit_circ, pass_data = compiler.compile(
+                    bqskit_circ, [SetModelPass(model), workflow], True
+                )
+
+            circuit_qiskit_routed = QuantumCircuit.from_qasm_str(
+                OPENQASM2Language().encode(routed_bqskit_circ)
+            )
+            Squander_remapped_circuit, parameters_remapped_circuit = (
+                Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_routed)
+            )
+            Squander_remapped_circuit = Squander_remapped_circuit.Remap_Qbits(
+                {i: j for i, j in enumerate(pass_data.placement)}
+            )
+            self.config["initial_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.initial_mapping
+            )
+            self.config["final_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.final_mapping
+            )
+
         elif strategy == "light-sabre":
             from squander import Qiskit_IO
             from qiskit import transpile
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index bfc7ac6a5..316698b13 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -78,6 +78,8 @@ struct LayoutPartInfo {
 
 struct SabreConfig {
     int prefilter_top_k = 50;
+    int prefilter_min_per_partition = 2;
+    int prefilter_min_3q = 12;
     int max_E_size = 20;
     int max_lookahead = 4;
     double E_weight = 0.5;
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 189c3392c..a6516d98a 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1249,19 +1249,60 @@ std::vector<const CandidateData*> SabreRouter::prefilter_candidates(
         estimated.push_back({est, cand});
     }
 
-    std::nth_element(
+    std::stable_sort(
         estimated.begin(),
-        estimated.begin() + top_k,
         estimated.end(),
         [](const Pair& a, const Pair& b) {
-            return a.first < b.first;
+            if (a.first != b.first) return a.first < b.first;
+            if (a.second->partition_idx != b.second->partition_idx) {
+                return a.second->partition_idx < b.second->partition_idx;
+            }
+            return a.second->candidate_idx < b.second->candidate_idx;
         }
     );
 
+    const int min_per_partition =
+        std::max(0, config_.prefilter_min_per_partition);
+    const int min_3q = std::max(0, config_.prefilter_min_3q);
+
     std::vector<const CandidateData*> result;
-    result.reserve(top_k);
-    for (int i = 0; i < top_k; i++) {
-        result.push_back(estimated[i].second);
+    result.reserve(std::min(static_cast<int>(candidates.size()), top_k));
+    std::unordered_set<const CandidateData*> selected;
+
+    if (min_per_partition > 0 || min_3q > 0) {
+        std::unordered_map<int, int> quota_by_partition;
+        for (const auto& item : estimated) {
+            const CandidateData* cand = item.second;
+            int quota = min_per_partition;
+            if (cand->involved_qbits.size() >= 3) {
+                quota = std::max(quota, min_3q);
+            }
+            if (quota <= 0) continue;
+            auto it = quota_by_partition.find(cand->partition_idx);
+            if (it == quota_by_partition.end() || quota > it->second) {
+                quota_by_partition[cand->partition_idx] = quota;
+            }
+        }
+
+        std::unordered_map<int, int> selected_by_partition;
+        for (const auto& item : estimated) {
+            const CandidateData* cand = item.second;
+            auto quota_it = quota_by_partition.find(cand->partition_idx);
+            if (quota_it == quota_by_partition.end()) continue;
+            int& count = selected_by_partition[cand->partition_idx];
+            if (count >= quota_it->second) continue;
+            result.push_back(cand);
+            selected.insert(cand);
+            count++;
+        }
+    }
+
+    for (const auto& item : estimated) {
+        if (static_cast<int>(result.size()) >= top_k) break;
+        const CandidateData* cand = item.second;
+        if (selected.find(cand) != selected.end()) continue;
+        result.push_back(cand);
+        selected.insert(cand);
     }
     return result;
 }
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6b60225b5..a490c6d0a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -143,6 +143,8 @@ def __init__(self, config):
         self.config.setdefault('random_seed', 42)
         self.config.setdefault('cleanup', True)
         self.config.setdefault('prefilter_top_k', 50)
+        self.config.setdefault('prefilter_min_per_partition', 2)
+        self.config.setdefault('prefilter_min_3q', 12)
         self.config.setdefault('cleanup_top_k', 3)
         self.config.setdefault('decay_delta', 0.001)  # Qiskit LightSABRE DECAY_RATE
         self.config.setdefault('swap_burst_budget', 5)  # Qiskit LightSABRE DECAY_RESET_INTERVAL
@@ -159,8 +161,14 @@ def __init__(self, config):
             self.config['path_tiebreak_weight'] = 0.49
         self.config.setdefault('cnot_cost', 1.0 / 3.0)  # 1 SWAP = 3 CNOTs
         self.config.setdefault('three_qubit_exit_weight', 1.0)
+        self.config.setdefault('routing_aware_partitioning', True)
         self.config.setdefault('size_density_weight', False)
         self.config.setdefault('sparse_penalty', 3.0)
+        self.config.setdefault('two_pair_3q_penalty', None)
+        self.config.setdefault('dense_3q_penalty', None)
+        self.config.setdefault('triangle_free_3q_penalty', 1.0)
+        self.config.setdefault('three_qubit_reuse_discount', 0.15)
+        self.config.setdefault('three_qubit_reuse_discount_cap', 1.0)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -243,6 +251,114 @@ def _parts_to_density_weights(allparts, gate_dict, sparse_penalty=3.0):
             weights.append(penalty / N)
         return weights
 
+    @staticmethod
+    def _topology_has_triangle(topology):
+        """Return True when the hardware graph contains a 3-cycle."""
+        if not topology:
+            return False
+        adj = defaultdict(set)
+        for u, v in topology:
+            adj[int(u)].add(int(v))
+            adj[int(v)].add(int(u))
+        for u, neighbors in adj.items():
+            ordered = sorted(v for v in neighbors if v > u)
+            for idx, v in enumerate(ordered):
+                if any(w in adj[v] for w in ordered[idx + 1:]):
+                    return True
+        return False
+
+    @staticmethod
+    def _parts_to_routing_aware_weights(
+        allparts,
+        gate_dict,
+        topology=None,
+        sparse_penalty=3.0,
+        two_pair_3q_penalty=None,
+        dense_3q_penalty=None,
+        triangle_free_3q_penalty=1.0,
+        reuse_discount=0.15,
+        reuse_discount_cap=1.0,
+    ):
+        """Per-part ILP weights for routing-aware 3q partition selection.
+
+        ``ilp_global_optimal`` minimizes ``1 + N * weight[i]`` for each
+        selected part.  Returning ``penalty / N`` therefore makes ``penalty``
+        the extra cost of selecting that part.
+
+        The older density tie-breaker preserved minimum partition count, which
+        still lets a sparse 3q block beat the two 2q blocks it would replace.
+        This cost is intentionally allowed to cross that boundary:
+
+          one active pair  -> strongly prefer the equivalent 2q partition
+          two active pairs -> prefer two 2q partitions unless the block is reused
+          three pairs      -> keep 3q attractive, but penalize triangle-free HW
+        """
+        N = max(len(allparts), 1)
+        reuse_discount = float(reuse_discount or 0.0)
+        reuse_discount_cap = float(reuse_discount_cap or 0.0)
+        two_pair_penalty = (
+            sparse_penalty / 2.0
+            if two_pair_3q_penalty is None
+            else float(two_pair_3q_penalty)
+        )
+        triangle_free_penalty = float(triangle_free_3q_penalty or 0.0)
+        if dense_3q_penalty is None:
+            dense_penalty = (
+                triangle_free_penalty
+                if not qgd_Partition_Aware_Mapping._topology_has_triangle(topology)
+                else 0.0
+            )
+        else:
+            dense_penalty = float(dense_3q_penalty)
+
+        weights = []
+        for part in allparts:
+            qubits_in_part = set()
+            active_pairs = set()
+            multi_qubit_gate_count = 0
+
+            for gate_idx in part:
+                gate = gate_dict.get(gate_idx)
+                if gate is None:
+                    continue
+                qbs = list(gate.get_Involved_Qbits())
+                qubits_in_part.update(qbs)
+                if len(qbs) < 2:
+                    continue
+                multi_qubit_gate_count += 1
+                for a in range(len(qbs)):
+                    for b in range(a + 1, len(qbs)):
+                        active_pairs.add(
+                            (min(qbs[a], qbs[b]), max(qbs[a], qbs[b]))
+                        )
+
+            if len(qubits_in_part) != 3:
+                weights.append(0.0)
+                continue
+
+            n_pairs = len(active_pairs)
+            if n_pairs <= 1:
+                penalty = float(sparse_penalty)
+            elif n_pairs == 2:
+                penalty = two_pair_penalty
+            else:
+                penalty = dense_penalty
+
+            extra_reuse = (
+                max(0, multi_qubit_gate_count - n_pairs)
+                if n_pairs >= 2
+                else 0
+            )
+            if extra_reuse and reuse_discount > 0:
+                discount = min(
+                    float(reuse_discount_cap),
+                    float(reuse_discount) * float(extra_reuse),
+                )
+                penalty = max(0.0, penalty - discount)
+
+            weights.append(penalty / N)
+        return weights
+
     @staticmethod
     def _topo_key(mini_topology):
         return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
@@ -531,13 +647,32 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
-        # Minimize total partition count so PAM gets the largest blocks possible
-        # under max_partition_size. Larger blocks = more (P_i, P_o) freedom to
-        # absorb routing SWAPs. Overlap-based tie-breaker (when enabled)
-        # picks deterministically among min-count covers, preferring covers
-        # whose parts share more logical qubits with their DAG successors.
+        # Route-aware weights let sparse 3q blocks lose to the 2q blocks they
+        # would otherwise replace.  Without weights, the ILP minimizes selected
+        # partition count and therefore over-selects 3q partitions.
         ilp_weights = None
-        if self.config.get('size_density_weight', False):
+        if self.config.get('routing_aware_partitioning', True):
+            sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
+            ilp_weights = self._parts_to_routing_aware_weights(
+                allparts,
+                gate_dict,
+                topology=self.topology,
+                sparse_penalty=sparse_penalty,
+                two_pair_3q_penalty=self.config.get(
+                    'two_pair_3q_penalty', None
+                ),
+                dense_3q_penalty=self.config.get('dense_3q_penalty', None),
+                triangle_free_3q_penalty=self.config.get(
+                    'triangle_free_3q_penalty', 1.0
+                ),
+                reuse_discount=self.config.get(
+                    'three_qubit_reuse_discount', 0.15
+                ),
+                reuse_discount_cap=self.config.get(
+                    'three_qubit_reuse_discount_cap', 1.0
+                ),
+            )
+        elif self.config.get('size_density_weight', False):
             sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
             ilp_weights = self._parts_to_density_weights(
                 allparts, gate_dict, sparse_penalty=sparse_penalty
@@ -978,6 +1113,12 @@ def _run_layout_trials_cpp(
 
         cfg = SabreConfig()
         cfg.prefilter_top_k = self.config.get('prefilter_top_k', 50)
+        if hasattr(cfg, 'prefilter_min_per_partition'):
+            cfg.prefilter_min_per_partition = self.config.get(
+                'prefilter_min_per_partition', 2
+            )
+        if hasattr(cfg, 'prefilter_min_3q'):
+            cfg.prefilter_min_3q = self.config.get('prefilter_min_3q', 12)
         cfg.max_E_size = self.config.get('max_E_size', 20)
         cfg.max_lookahead = self.config.get('max_lookahead', 4)
         cfg.E_weight = self.config.get('E_weight', 0.5)
@@ -1454,8 +1595,41 @@ def _prefilter_candidates(
             )
             for pc in partition_candidates
         ])
-        top_k_indices = np.argpartition(estimates, top_k)[:top_k]
-        return [partition_candidates[i] for i in top_k_indices]
+        selected = set()
+        min_per_partition = int(
+            self.config.get('prefilter_min_per_partition', 0) or 0
+        )
+        min_3q = int(self.config.get('prefilter_min_3q', 0) or 0)
+        if min_per_partition > 0 or min_3q > 0:
+            by_partition = defaultdict(list)
+            for idx, pc in enumerate(partition_candidates):
+                by_partition[pc.partition_idx].append(idx)
+            for indices in by_partition.values():
+                sample = partition_candidates[indices[0]]
+                quota = min_per_partition
+                if len(sample.involved_qbits) >= 3:
+                    quota = max(quota, min_3q)
+                if quota <= 0:
+                    continue
+                ranked = sorted(indices, key=lambda i: estimates[i])
+                selected.update(ranked[:min(quota, len(ranked))])
+
+        remaining = max(0, top_k - len(selected))
+        if remaining > 0:
+            ranked_global = np.argsort(estimates)
+            for idx in ranked_global:
+                selected.add(int(idx))
+                if len(selected) >= top_k:
+                    break
+
+        if not selected:
+            top_k_indices = np.argpartition(estimates, top_k)[:top_k]
+            selected.update(int(i) for i in top_k_indices)
+
+        return [
+            partition_candidates[i]
+            for i in sorted(selected, key=lambda idx: estimates[idx])
+        ]
 
     @staticmethod
     def _decay_factor_for_swaps(swaps, decay):
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 7103819cc..fe2368fd2 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -121,6 +121,8 @@ PYBIND11_MODULE(_sabre_router, m) {
     py::class_<SabreConfig>(m, "SabreConfig")
         .def(py::init<>())
         .def_readwrite("prefilter_top_k", &SabreConfig::prefilter_top_k)
+        .def_readwrite("prefilter_min_per_partition", &SabreConfig::prefilter_min_per_partition)
+        .def_readwrite("prefilter_min_3q", &SabreConfig::prefilter_min_3q)
         .def_readwrite("max_E_size", &SabreConfig::max_E_size)
         .def_readwrite("max_lookahead", &SabreConfig::max_lookahead)
         .def_readwrite("E_weight", &SabreConfig::E_weight)

From 6f0e5a9531daf39b9b37ea136669f5fbdc9e2639 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 29 Apr 2026 23:54:50 +0200
Subject: [PATCH 169/232] try new things

---
 .../sabre_router/include/sabre_router.hpp     |  7 +++
 .../src-cpp/sabre_router/sabre_router.cpp     | 58 ++++++++++++++-----
 squander/synthesis/PartAM.py                  | 51 ++++++++++------
 3 files changed, 84 insertions(+), 32 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 316698b13..681000093 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -310,6 +310,13 @@ class SabreRouter {
         double decay_factor = 1.0
     ) const;
 
+    double future_partition_cost(
+        int partition_idx,
+        const std::vector<int>& pi,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
     void apply_decay_for_swaps(
         const std::vector<std::pair<int,int>>& swaps,
         std::vector<double>& decay
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index a6516d98a..c423d710b 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1072,6 +1072,35 @@ double SabreRouter::entry_future_cost(
     return total;
 }
 
+double SabreRouter::future_partition_cost(
+    int partition_idx,
+    const std::vector<int>& pi,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    if (
+        partition_idx >= 0
+        && partition_idx < static_cast<int>(candidate_cache_.size())
+        && !candidate_cache_[partition_idx].empty()
+        && candidate_cache_[partition_idx].front().involved_qbits.size() >= 3
+    ) {
+        double best = std::numeric_limits<double>::infinity();
+        for (const auto& cand : candidate_cache_[partition_idx]) {
+            best = std::min(
+                best,
+                static_cast<double>(estimate_swap_count(cand, pi, reverse))
+            );
+        }
+        return best;
+    }
+
+    auto it = canonical_data.find(partition_idx);
+    if (it == canonical_data.end()) {
+        return std::numeric_limits<double>::infinity();
+    }
+    return entry_future_cost(it->second, pi);
+}
+
 double SabreRouter::future_context_cost(
     int exclude_partition_idx,
     const std::vector<int>& pi,
@@ -1080,19 +1109,17 @@ double SabreRouter::future_context_cost(
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data
 ) const {
-    (void)reverse;
-
-    // BQSKit-style cost: sum max(0, dist - 1) over each canonical gate edge,
-    // with no candidate-permutation enumeration. Same shape for F and E so
-    // the future signal is monotone in distance instead of flickering with
-    // whichever candidate happens to win the lower bound.
+    // Candidate-aware lower bound: for each future partition, use the best
+    // available candidate entry cost under this layout. This lets 3q line
+    // blocks distinguish which logical qubit should sit on the path center.
     double f_sum = 0.0;
     int n_other = 0;
     for (int p_idx : F_snapshot) {
         if (p_idx == exclude_partition_idx) continue;
-        auto it = canonical_data.find(p_idx);
-        if (it == canonical_data.end()) continue;
-        f_sum += entry_future_cost(it->second, pi);
+        const double cost = future_partition_cost(
+            p_idx, pi, reverse, canonical_data);
+        if (!std::isfinite(cost)) continue;
+        f_sum += cost;
         n_other++;
     }
 
@@ -1102,17 +1129,22 @@ double SabreRouter::future_context_cost(
 
     if (!E.empty()) {
         double e_sum = 0.0;
+        int e_count = 0;
         for (auto [p_idx, depth] : E) {
             if (p_idx == exclude_partition_idx) continue;
-            auto it = canonical_data.find(p_idx);
-            if (it == canonical_data.end()) continue;
+            const double cost = future_partition_cost(
+                p_idx, pi, reverse, canonical_data);
+            if (!std::isfinite(cost)) continue;
             const double alpha =
                 (depth >= 0 && depth < static_cast<int>(alpha_weights_.size()))
                     ? alpha_weights_[depth]
                     : std::pow(config_.E_alpha, depth);
-            e_sum += alpha * entry_future_cost(it->second, pi);
+            e_sum += alpha * cost;
+            e_count++;
+        }
+        if (e_count > 0) {
+            score += config_.E_weight * e_sum / static_cast<double>(e_count);
         }
-        score += config_.E_weight * e_sum / static_cast<double>(E.size());
     }
 
     return score;
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index a490c6d0a..3e801ff8c 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1785,44 +1785,57 @@ def _future_context_cost(
         layout_partitions=None,
         canonical_data=None,
     ):
-        del candidate_cache, reverse, cnot_cost, layout_partitions
+        del cnot_cost, layout_partitions
 
-        # BQSKit-style cost: sum max(0, dist - 1) over each canonical gate edge,
-        # for both F and E. No candidate-permutation enumeration so the future
-        # signal stays monotone in distance.
+        # Candidate-aware lower bound: for each future partition, use the best
+        # available candidate entry cost under this layout.  This preserves the
+        # monotone distance signal while allowing 3q line blocks to distinguish
+        # which logical qubit should sit on the path center.
         pi_arr = np.asarray(pi, dtype=np.intp)
         D_arr = np.asarray(D)
 
+        def partition_cost(p_idx):
+            if candidate_cache is not None and 0 <= p_idx < len(candidate_cache):
+                candidates = candidate_cache[p_idx]
+                if candidates and len(candidates[0].involved_qbits) >= 3:
+                    return min(
+                        cand.estimate_swap_count(pi, D, reverse=reverse)
+                        for cand in candidates
+                    )
+            if canonical_data is None:
+                return None
+            entry = canonical_data.get(p_idx)
+            if entry is None:
+                return None
+            return qgd_Partition_Aware_Mapping._entry_future_cost(
+                entry, pi_arr, D_arr
+            )
+
         f_sum = 0.0
         n_other = 0
         for p_idx in F:
             if p_idx == exclude_partition_idx:
                 continue
-            if canonical_data is None:
+            cost = partition_cost(p_idx)
+            if cost is None:
                 continue
-            entry = canonical_data.get(p_idx)
-            if entry is None:
-                continue
-            f_sum += qgd_Partition_Aware_Mapping._entry_future_cost(
-                entry, pi_arr, D_arr
-            )
+            f_sum += cost
             n_other += 1
         score = f_sum / n_other if n_other > 0 else 0.0
 
         if E:
             e_sum = 0.0
+            e_count = 0
             for p_idx, depth in E:
                 if p_idx == exclude_partition_idx:
                     continue
-                if canonical_data is None:
-                    continue
-                entry = canonical_data.get(p_idx)
-                if entry is None:
+                cost = partition_cost(p_idx)
+                if cost is None:
                     continue
-                e_sum += (alpha ** depth) * qgd_Partition_Aware_Mapping._entry_future_cost(
-                    entry, pi_arr, D_arr
-                )
-            score += W * e_sum / len(E)
+                e_sum += (alpha ** depth) * cost
+                e_count += 1
+            if e_count:
+                score += W * e_sum / e_count
         return score
 
     def _release_valve(self, F, pi, D, canonical_data):

From 1bb85a11fd79d479e670a858e7d89288be5c8e7e Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 00:20:18 +0200
Subject: [PATCH 170/232] Boundary beam routing

---
 benchmark_PartAM.py          |  12 +-
 squander/synthesis/PartAM.py | 279 ++++++++++++++++++++++++++++++++++-
 2 files changed, 283 insertions(+), 8 deletions(-)

diff --git a/benchmark_PartAM.py b/benchmark_PartAM.py
index fa4fed3d6..6d7605781 100644
--- a/benchmark_PartAM.py
+++ b/benchmark_PartAM.py
@@ -73,10 +73,16 @@ def run_once(circ_orig, parameters_orig, topology):
         'random_seed':random.randint(1,100),
         # Cheap candidate prefilter before full A* scoring.
         'prefilter_top_k': 50,
+        'prefilter_min_per_partition': 2,
+        'prefilter_min_3q': 12,
         # Rank every layout trial by actual constructed routing, not only by
         # the heuristic trial cost.
         'actual_routing_rank_top_k': None,
         'top_k_pi': 1,
+        # Boundary-state beam routing is a Python prototype on this branch.
+        # Set width/depth to 1 to recover the greedy router.
+        'boundary_beam_width': 4,
+        'boundary_beam_depth': 3,
         'cnot_cost': 0.5 / 3.0,  # old: swap_cost=15, local_cost_weight=1.0 -> 15:1 swap:cnot
         'cleanup_top_k': 3,
         "parallel_layout_trials": True,
@@ -89,8 +95,12 @@ def run_once(circ_orig, parameters_orig, topology):
         'swap_burst_budget': 5,
         'path_tiebreak_weight': 0.2,
         'three_qubit_exit_weight': 1.5,
-        'size_density_weight': True,
+        'routing_aware_partitioning': True,
         'sparse_penalty': 3.0,
+        'two_pair_3q_penalty': 1.5,
+        'triangle_free_3q_penalty': 1.0,
+        'three_qubit_reuse_discount': 0.15,
+        'three_qubit_reuse_discount_cap': 1.0,
     }
 
     # Clean the initial circuit using the same config pattern as in PartAM.py
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 3e801ff8c..a97e3b676 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -161,6 +161,8 @@ def __init__(self, config):
             self.config['path_tiebreak_weight'] = 0.49
         self.config.setdefault('cnot_cost', 1.0 / 3.0)  # 1 SWAP = 3 CNOTs
         self.config.setdefault('three_qubit_exit_weight', 1.0)
+        self.config.setdefault('boundary_beam_width', 1)
+        self.config.setdefault('boundary_beam_depth', 1)
         self.config.setdefault('routing_aware_partitioning', True)
         self.config.setdefault('size_density_weight', False)
         self.config.setdefault('sparse_penalty', 3.0)
@@ -1038,7 +1040,11 @@ def _run_layout_trials(
         n_trials,
         random_seed,
     ):
-        use_cpp = self.config.get('use_cpp_router', True)
+        use_boundary_beam = (
+            int(self.config.get("boundary_beam_width", 1) or 1) > 1
+            and int(self.config.get("boundary_beam_depth", 1) or 1) > 1
+        )
+        use_cpp = self.config.get('use_cpp_router', True) and not use_boundary_beam
         if use_cpp:
             return self._run_layout_trials_cpp(
                 seeded_pi, DAG, IDAG, layout_partitions,
@@ -1954,6 +1960,232 @@ def add_edges(target_idx, edge_weight):
             "weight": weight,
         }
 
+    def _advance_layout_frontier(
+        self,
+        selected_partition_idx,
+        F,
+        resolved_partitions,
+        DAG,
+        IDAG,
+        optimized_partitions,
+    ):
+        """Advance a copied frontier without mutating circuits.
+
+        This mirrors the layout-only single-qubit elision logic and is used by
+        the boundary beam rollout.  It intentionally tracks only dependency
+        state and layout; final circuit construction still happens through the
+        concrete chosen route.
+        """
+        F_next = list(F)
+        resolved_next = list(resolved_partitions)
+
+        if selected_partition_idx in F_next:
+            F_next.remove(selected_partition_idx)
+        resolved_next[selected_partition_idx] = True
+
+        stack = deque(DAG[selected_partition_idx])
+        while stack:
+            child = stack.popleft()
+            if resolved_next[child] or child in F_next:
+                continue
+            if not all(resolved_next[parent] for parent in IDAG[child]):
+                continue
+            if self._partition_is_single(optimized_partitions[child]):
+                resolved_next[child] = True
+                stack.extend(DAG[child])
+            else:
+                F_next.append(child)
+
+        return tuple(F_next), tuple(resolved_next)
+
+    def _boundary_beam_select_index(
+        self,
+        partition_candidates,
+        scores,
+        cached_swaps,
+        cached_pi,
+        F_snapshot,
+        resolved_partitions,
+        DAG,
+        IDAG,
+        optimized_partitions,
+        scoring_partitions,
+        D,
+        candidate_cache,
+        canonical_data,
+        reverse=False,
+        W=0.5,
+        alpha=1.0,
+        cnot_cost=1.0 / 3.0,
+        adj=None,
+    ):
+        """Choose the next candidate by rolling out boundary-layout states.
+
+        The ordinary SABRE selector commits to the locally best candidate. This
+        keeps a small beam of possible boundary layouts across several future
+        partitions, then returns the first candidate from the best rollout.
+        """
+        beam_width = int(self.config.get("boundary_beam_width", 1) or 1)
+        beam_depth = int(self.config.get("boundary_beam_depth", 1) or 1)
+        fallback_idx = int(np.argmin(np.asarray(scores)))
+        if beam_width <= 1 or beam_depth <= 1 or len(partition_candidates) <= 1:
+            return fallback_idx
+        if not any(len(cand.involved_qbits) >= 3 for cand in partition_candidates):
+            return fallback_idx
+
+        max_E_size = self.config.get("max_E_size", 20)
+        max_lookahead = self.config.get("max_lookahead", 4)
+        top_k = self.config.get("prefilter_top_k", 50)
+        path_weight = self.config.get("path_tiebreak_weight", 0.2)
+        three_q_weight = self.config.get("three_qubit_exit_weight", 1.0)
+
+        def transition_cost(cand, swaps):
+            return self._routing_objective(
+                len(swaps or ()),
+                cand.cnot_count,
+                cnot_cost,
+            )
+
+        states = []
+        for idx, cand in enumerate(partition_candidates):
+            if cached_pi[idx] is None:
+                continue
+            trans_cost = transition_cost(cand, cached_swaps[idx])
+            F_next, resolved_next = self._advance_layout_frontier(
+                cand.partition_idx,
+                F_snapshot,
+                resolved_partitions,
+                DAG,
+                IDAG,
+                optimized_partitions,
+            )
+            states.append(
+                (
+                    float(scores[idx]),
+                    float(trans_cost),
+                    tuple(int(x) for x in cached_pi[idx]),
+                    F_next,
+                    resolved_next,
+                    idx,
+                )
+            )
+
+        if not states:
+            return fallback_idx
+
+        states.sort(key=lambda item: (item[0], item[5]))
+        states = states[:beam_width]
+
+        for _ in range(1, beam_depth):
+            expanded = []
+            for _, total_cost, pi_state, F_state, resolved_state, first_idx in states:
+                if not F_state:
+                    expanded.append(
+                        (total_cost, total_cost, pi_state, F_state, resolved_state, first_idx)
+                    )
+                    continue
+
+                resolved_list = list(resolved_state)
+                F_list = list(F_state)
+                E = self.generate_extended_set(
+                    F_list,
+                    DAG,
+                    IDAG,
+                    resolved_list,
+                    optimized_partitions,
+                    max_E_size=max_E_size,
+                    max_lookahead=max_lookahead,
+                )
+                candidates = self.obtain_partition_candidates(
+                    F_list,
+                    optimized_partitions,
+                    candidate_cache=candidate_cache,
+                )
+                if not candidates:
+                    expanded.append(
+                        (total_cost, total_cost, pi_state, F_state, resolved_state, first_idx)
+                    )
+                    continue
+                candidates = self._prefilter_candidates(
+                    candidates,
+                    list(pi_state),
+                    D,
+                    top_k,
+                    F=F_state,
+                    E=E,
+                    candidate_cache=candidate_cache,
+                    layout_partitions=optimized_partitions,
+                    reverse=reverse,
+                    W=W,
+                    alpha=alpha,
+                    canonical_data=canonical_data,
+                )
+
+                for cand in candidates:
+                    neighbor_info = self._build_neighbor_info(
+                        cand.partition_idx,
+                        F_state,
+                        E,
+                        pi_state,
+                        canonical_data,
+                        weight=path_weight,
+                        W=W,
+                        alpha=alpha,
+                        layout_partitions=optimized_partitions,
+                    )
+                    score, swaps, output_perm = self.score_partition_candidate(
+                        cand,
+                        F_state,
+                        list(pi_state),
+                        scoring_partitions,
+                        D,
+                        self._swap_cache,
+                        E=E,
+                        W=W,
+                        alpha=alpha,
+                        reverse=reverse,
+                        canonical_data=canonical_data,
+                        adj=adj,
+                        cnot_cost=cnot_cost,
+                        path_tiebreak_weight=path_weight,
+                        cached_neighbor_info=neighbor_info,
+                        candidate_cache=candidate_cache,
+                        layout_partitions=optimized_partitions,
+                        return_transforms=True,
+                        three_qubit_exit_weight=three_q_weight,
+                    )
+                    trans_cost = transition_cost(cand, swaps)
+                    future_cost = float(score) - trans_cost
+                    new_total = total_cost + trans_cost
+                    rank_cost = new_total + future_cost
+                    F_next, resolved_next = self._advance_layout_frontier(
+                        cand.partition_idx,
+                        F_state,
+                        resolved_state,
+                        DAG,
+                        IDAG,
+                        optimized_partitions,
+                    )
+                    expanded.append(
+                        (
+                            rank_cost,
+                            new_total,
+                            tuple(int(x) for x in output_perm),
+                            F_next,
+                            resolved_next,
+                            first_idx,
+                        )
+                    )
+
+            if not expanded:
+                break
+            expanded.sort(key=lambda item: (item[0], item[5]))
+            states = expanded[:beam_width]
+
+        if not states:
+            return fallback_idx
+        return int(min(states, key=lambda item: (item[0], item[5]))[5])
+
     def Heuristic_Search(
         self,
         F,
@@ -2138,10 +2370,26 @@ def Heuristic_Search(
                 cached_swaps[ci] = swaps
                 cached_pi[ci] = output_perm
 
-            min_partition_candidate = self._select_best_candidate(
-                partition_candidates, scores
+            best_idx = self._boundary_beam_select_index(
+                partition_candidates,
+                scores,
+                cached_swaps,
+                cached_pi,
+                F_snapshot,
+                resolved_partitions,
+                DAG,
+                IDAG,
+                optimized_partitions,
+                scoring_partitions,
+                D,
+                candidate_cache,
+                canonical_data,
+                W=E_W,
+                alpha=E_alpha,
+                cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0),
+                adj=self._adj,
             )
-            best_idx = partition_candidates.index(min_partition_candidate)
+            min_partition_candidate = partition_candidates[best_idx]
 
             F.remove(min_partition_candidate.partition_idx)
             resolved_partitions[min_partition_candidate.partition_idx] = True
@@ -2360,10 +2608,27 @@ def _heuristic_search_layout_only(
                 cached_swaps[ci] = swaps
                 cached_pi[ci] = output_perm
 
-            best = self._select_best_candidate(
-                partition_candidates, scores, rng=rng
+            best_idx = self._boundary_beam_select_index(
+                partition_candidates,
+                scores,
+                cached_swaps,
+                cached_pi,
+                F_snapshot,
+                resolved_partitions,
+                DAG,
+                IDAG,
+                optimized_partitions,
+                scoring_partitions,
+                D,
+                candidate_cache,
+                canonical_data,
+                reverse=reverse,
+                W=E_W,
+                alpha=E_alpha,
+                cnot_cost=cnot_cost,
+                adj=self._adj,
             )
-            best_idx = partition_candidates.index(best)
+            best = partition_candidates[best_idx]
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 

From 8d71cd3c1b938a69b70d626746b31e606866857a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 12:45:19 +0200
Subject: [PATCH 171/232] new partitioning

---
 .../sabre_router/include/sabre_router.hpp     |  24 ++
 .../src-cpp/sabre_router/sabre_router.cpp     | 286 +++++++++++++++++-
 squander/synthesis/PartAM.py                  | 261 ++++++----------
 squander/synthesis/bindings.cpp               |   4 +-
 4 files changed, 391 insertions(+), 184 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 681000093..7a0154e23 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -92,6 +92,8 @@ struct SabreConfig {
     int swap_burst_budget = 5; // Qiskit LightSABRE DECAY_RESET_INTERVAL
     double path_tiebreak_weight = 0.2;
     double three_qubit_exit_weight = 1.0;
+    int boundary_beam_width = 1;
+    int boundary_beam_depth = 1;
 };
 
 struct RouteStep {
@@ -362,6 +364,28 @@ class SabreRouter {
         std::mt19937* rng
     ) const;
 
+    std::pair<std::vector<int>, std::vector<uint8_t>> advance_layout_frontier(
+        int selected_partition_idx,
+        const std::vector<int>& F,
+        const std::vector<uint8_t>& resolved,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph
+    ) const;
+
+    size_t boundary_beam_select_index(
+        const std::vector<const CandidateData*>& candidates,
+        const std::vector<double>& scores,
+        const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
+        const std::vector<std::vector<int>>& cached_pi,
+        const std::vector<int>& F_snapshot,
+        const std::vector<uint8_t>& resolved,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::vector<std::vector<int>>& parents_graph,
+        bool reverse,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data,
+        SwapCache* swap_cache
+    ) const;
+
     // Check if partition is single-qubit
     inline bool partition_is_single(int partition_idx) const {
         return layout_partitions_[partition_idx].is_single;
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index c423d710b..d1444a26d 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1363,6 +1363,268 @@ const CandidateData& SabreRouter::select_best_candidate(
     return *candidates[min_idx];
 }
 
+// ---------------------------------------------------------------------------
+// Boundary beam search helpers
+// ---------------------------------------------------------------------------
+
+std::pair<std::vector<int>, std::vector<uint8_t>>
+SabreRouter::advance_layout_frontier(
+    int selected_partition_idx,
+    const std::vector<int>& F,
+    const std::vector<uint8_t>& resolved,
+    const std::vector<std::vector<int>>& children_graph,
+    const std::vector<std::vector<int>>& parents_graph
+) const {
+    std::vector<int> F_next(F);
+    std::vector<uint8_t> resolved_next(resolved);
+
+    F_next.erase(
+        std::remove(F_next.begin(), F_next.end(), selected_partition_idx),
+        F_next.end()
+    );
+    if (
+        selected_partition_idx >= 0
+        && selected_partition_idx < static_cast<int>(resolved_next.size())
+    ) {
+        resolved_next[selected_partition_idx] = 1;
+    }
+
+    std::deque<int> stack;
+    for (int child : children_graph[selected_partition_idx]) {
+        stack.push_back(child);
+    }
+
+    while (!stack.empty()) {
+        const int child = stack.front();
+        stack.pop_front();
+
+        if (resolved_next[child]) continue;
+        if (std::find(F_next.begin(), F_next.end(), child) != F_next.end()) {
+            continue;
+        }
+
+        bool parents_ok = true;
+        for (int parent : parents_graph[child]) {
+            if (!resolved_next[parent]) {
+                parents_ok = false;
+                break;
+            }
+        }
+        if (!parents_ok) continue;
+
+        if (layout_partitions_[child].is_single) {
+            resolved_next[child] = 1;
+            for (int grandchild : children_graph[child]) {
+                stack.push_back(grandchild);
+            }
+        } else {
+            F_next.push_back(child);
+        }
+    }
+
+    return {std::move(F_next), std::move(resolved_next)};
+}
+
+size_t SabreRouter::boundary_beam_select_index(
+    const std::vector<const CandidateData*>& candidates,
+    const std::vector<double>& scores,
+    const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
+    const std::vector<std::vector<int>>& cached_pi,
+    const std::vector<int>& F_snapshot,
+    const std::vector<uint8_t>& resolved,
+    const std::vector<std::vector<int>>& children_graph,
+    const std::vector<std::vector<int>>& parents_graph,
+    bool reverse,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data,
+    SwapCache* swap_cache
+) const {
+    size_t fallback_idx = 0;
+    for (size_t i = 1; i < scores.size(); i++) {
+        if (scores[i] < scores[fallback_idx]) {
+            fallback_idx = i;
+        }
+    }
+
+    const int beam_width = std::max(1, config_.boundary_beam_width);
+    const int beam_depth = std::max(1, config_.boundary_beam_depth);
+    if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) {
+        return fallback_idx;
+    }
+
+    bool has_three_qubit_candidate = false;
+    for (const auto* cand : candidates) {
+        if (cand->involved_qbits.size() >= 3) {
+            has_three_qubit_candidate = true;
+            break;
+        }
+    }
+    if (!has_three_qubit_candidate) {
+        return fallback_idx;
+    }
+
+    struct BeamState {
+        double rank_cost;
+        double total_cost;
+        std::vector<int> pi;
+        std::vector<int> F;
+        std::vector<uint8_t> resolved;
+        size_t first_idx;
+    };
+
+    auto transition_cost = [&](const CandidateData& cand, size_t idx) {
+        return routing_objective(
+            static_cast<double>(cached_swaps[idx].size()),
+            cand.cnot_count
+        );
+    };
+
+    auto sort_states = [](const BeamState& a, const BeamState& b) {
+        if (a.rank_cost != b.rank_cost) return a.rank_cost < b.rank_cost;
+        return a.first_idx < b.first_idx;
+    };
+
+    std::vector<BeamState> states;
+    states.reserve(candidates.size());
+    for (size_t idx = 0; idx < candidates.size(); idx++) {
+        if (cached_pi[idx].empty()) continue;
+        const auto& cand = *candidates[idx];
+        auto [F_next, resolved_next] = advance_layout_frontier(
+            cand.partition_idx,
+            F_snapshot,
+            resolved,
+            children_graph,
+            parents_graph
+        );
+        const double trans_cost = transition_cost(cand, idx);
+        states.push_back(BeamState{
+            scores[idx],
+            trans_cost,
+            cached_pi[idx],
+            std::move(F_next),
+            std::move(resolved_next),
+            idx
+        });
+    }
+
+    if (states.empty()) {
+        return fallback_idx;
+    }
+    std::sort(states.begin(), states.end(), sort_states);
+    if (static_cast<int>(states.size()) > beam_width) {
+        states.resize(beam_width);
+    }
+
+    for (int depth = 1; depth < beam_depth; depth++) {
+        std::vector<BeamState> expanded;
+
+        for (const auto& state : states) {
+            if (state.F.empty()) {
+                expanded.push_back(BeamState{
+                    state.total_cost,
+                    state.total_cost,
+                    state.pi,
+                    state.F,
+                    state.resolved,
+                    state.first_idx
+                });
+                continue;
+            }
+
+            auto E = generate_extended_set(
+                state.F,
+                state.resolved,
+                children_graph,
+                parents_graph
+            );
+
+            auto rollout_candidates = obtain_partition_candidates(state.F);
+            if (rollout_candidates.empty()) {
+                expanded.push_back(BeamState{
+                    state.total_cost,
+                    state.total_cost,
+                    state.pi,
+                    state.F,
+                    state.resolved,
+                    state.first_idx
+                });
+                continue;
+            }
+
+            rollout_candidates = prefilter_candidates(
+                rollout_candidates,
+                state.pi,
+                config_.prefilter_top_k,
+                state.F,
+                E,
+                reverse,
+                canonical_data
+            );
+
+            for (const CandidateData* cand : rollout_candidates) {
+                NeighborInfo neighbor_info = build_neighbor_info(
+                    cand->partition_idx,
+                    state.F,
+                    E,
+                    state.pi,
+                    canonical_data
+                );
+                std::vector<std::pair<int,int>> swaps;
+                std::vector<int> output_perm;
+                const double score = score_candidate(
+                    *cand,
+                    state.F,
+                    state.pi,
+                    E,
+                    reverse,
+                    canonical_data,
+                    swap_cache,
+                    nullptr,
+                    &swaps,
+                    &output_perm,
+                    &neighbor_info
+                );
+                const double trans_cost = routing_objective(
+                    static_cast<double>(swaps.size()),
+                    cand->cnot_count
+                );
+                const double future_cost = score - trans_cost;
+                const double new_total = state.total_cost + trans_cost;
+                const double rank_cost = new_total + future_cost;
+
+                auto [F_next, resolved_next] = advance_layout_frontier(
+                    cand->partition_idx,
+                    state.F,
+                    state.resolved,
+                    children_graph,
+                    parents_graph
+                );
+                expanded.push_back(BeamState{
+                    rank_cost,
+                    new_total,
+                    std::move(output_perm),
+                    std::move(F_next),
+                    std::move(resolved_next),
+                    state.first_idx
+                });
+            }
+        }
+
+        if (expanded.empty()) {
+            break;
+        }
+        std::sort(expanded.begin(), expanded.end(), sort_states);
+        if (static_cast<int>(expanded.size()) > beam_width) {
+            expanded.resize(beam_width);
+        }
+        states = std::move(expanded);
+    }
+
+    if (states.empty()) {
+        return fallback_idx;
+    }
+    return std::min_element(states.begin(), states.end(), sort_states)->first_idx;
+}
+
 // ---------------------------------------------------------------------------
 // ---------------------------------------------------------------------------
 // heuristic_search (main loop)
@@ -1378,6 +1640,8 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
     const std::vector<std::vector<int>>& pg,
     ForwardRouteResult* route_trace
 ) const {
+    (void)rng;
+
     std::vector<int> F;
     std::vector<int> queue;
     std::vector<uint8_t> resolved(num_partitions_, 0);
@@ -1508,13 +1772,21 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             );
         }
 
-        // Select best
-        const auto& best = select_best_candidate(candidates, scores, rng);
-        // Find selected index to retrieve cached transform
-        size_t best_ci = 0;
-        for (size_t ci = 0; ci < candidates.size(); ci++) {
-            if (candidates[ci] == &best) { best_ci = ci; break; }
-        }
+        // Select best, optionally using boundary-layout beam rollout
+        const size_t best_ci = boundary_beam_select_index(
+            candidates,
+            scores,
+            cached_swaps,
+            cached_pi,
+            F,
+            resolved,
+            cg,
+            pg,
+            reverse,
+            canonical_data,
+            &swap_cache
+        );
+        const auto& best = *candidates[best_ci];
 
         // Remove from F and mark resolved
         F.erase(std::remove(F.begin(), F.end(), best.partition_idx), F.end());
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index a97e3b676..792884a4d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -164,13 +164,7 @@ def __init__(self, config):
         self.config.setdefault('boundary_beam_width', 1)
         self.config.setdefault('boundary_beam_depth', 1)
         self.config.setdefault('routing_aware_partitioning', True)
-        self.config.setdefault('size_density_weight', False)
-        self.config.setdefault('sparse_penalty', 3.0)
-        self.config.setdefault('two_pair_3q_penalty', None)
-        self.config.setdefault('dense_3q_penalty', None)
-        self.config.setdefault('triangle_free_3q_penalty', 1.0)
-        self.config.setdefault('three_qubit_reuse_discount', 0.15)
-        self.config.setdefault('three_qubit_reuse_discount_cap', 1.0)
+        self.config.setdefault('embeddable_3q_partitions_only', True)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -214,152 +208,78 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def _parts_to_density_weights(allparts, gate_dict, sparse_penalty=3.0):
-        """Per-part ILP weights that penalise sparse 3-qubit partitions.
-
-        Penalty by active-pair count for a 3q partition:
-          1 pair  -> sparse_penalty        (e.g. 3 → total ILP cost 4)
-          2 pairs -> sparse_penalty / 3    (e.g. 1 → total ILP cost 2)
-          3 pairs -> 0                     (no penalty)
-        For 2q (or 1q) partitions the weight is always 0.
-        """
-        N = max(len(allparts), 1)
-        weights = []
-        for part in allparts:
-            qubits_in_part = set()
-            for gate_idx in part:
-                gate = gate_dict.get(gate_idx)
-                if gate is not None:
-                    qubits_in_part.update(gate.get_Involved_Qbits())
-            if len(qubits_in_part) != 3:
-                weights.append(0.0)
-                continue
-            active_pairs = set()
-            for gate_idx in part:
-                gate = gate_dict.get(gate_idx)
-                if gate is None:
-                    continue
-                qbs = list(gate.get_Involved_Qbits())
-                for a in range(len(qbs)):
-                    for b in range(a + 1, len(qbs)):
-                        active_pairs.add((min(qbs[a], qbs[b]), max(qbs[a], qbs[b])))
-            n_pairs = len(active_pairs)
-            if n_pairs >= 3:
-                penalty = 0.0
-            elif n_pairs == 2:
-                penalty = sparse_penalty / 3.0
-            else:
-                penalty = sparse_penalty
-            weights.append(penalty / N)
-        return weights
-
-    @staticmethod
-    def _topology_has_triangle(topology):
-        """Return True when the hardware graph contains a 3-cycle."""
+    def _max_edges_in_three_qubit_subtopologies(topology):
+        """Maximum edge count of any connected 3-node hardware subtopology."""
         if not topology:
-            return False
-        adj = defaultdict(set)
-        for u, v in topology:
-            adj[int(u)].add(int(v))
-            adj[int(v)].add(int(u))
-        for u, neighbors in adj.items():
-            ordered = sorted(v for v in neighbors if v > u)
-            for idx, v in enumerate(ordered):
-                if any(w in adj[v] for w in ordered[idx + 1:]):
-                    return True
-        return False
+            return 0
+        return max(
+            (
+                len({tuple(sorted(edge)) for edge in mini_topology})
+                for mini_topology in get_unique_subtopologies(topology, 3)
+            ),
+            default=0,
+        )
 
     @staticmethod
-    def _parts_to_routing_aware_weights(
-        allparts,
-        gate_dict,
-        topology=None,
-        sparse_penalty=3.0,
-        two_pair_3q_penalty=None,
-        dense_3q_penalty=None,
-        triangle_free_3q_penalty=1.0,
-        reuse_discount=0.15,
-        reuse_discount_cap=1.0,
-    ):
-        """Per-part ILP weights for routing-aware 3q partition selection.
-
-        ``ilp_global_optimal`` minimizes ``1 + N * weight[i]`` for each
-        selected part.  Returning ``penalty / N`` therefore makes ``penalty``
-        the extra cost of selecting that part.
+    def _part_support_and_active_pairs(part, gate_dict):
+        qubits_in_part = set()
+        active_pairs = set()
+        for gate_idx in part:
+            gate = gate_dict.get(gate_idx)
+            if gate is None:
+                continue
+            qbs = list(gate.get_Involved_Qbits())
+            qubits_in_part.update(qbs)
+            if len(qbs) < 2:
+                continue
+            for a in range(len(qbs)):
+                for b in range(a + 1, len(qbs)):
+                    active_pairs.add(
+                        (min(qbs[a], qbs[b]), max(qbs[a], qbs[b]))
+                    )
+        return qubits_in_part, active_pairs
 
-        The older density tie-breaker preserved minimum partition count, which
-        still lets a sparse 3q block beat the two 2q blocks it would replace.
-        This cost is intentionally allowed to cross that boundary:
+    @staticmethod
+    def _filter_embeddable_three_qubit_parts(allparts, gate_dict, topology):
+        """Keep only 3q parts whose active-pair graph fits hardware shape.
 
-          one active pair  -> strongly prefer the equivalent 2q partition
-          two active pairs -> prefer two 2q partitions unless the block is reused
-          three pairs      -> keep 3q attractive, but penalize triangle-free HW
+        On a line, connected 3q hardware subtopologies have two edges, so
+        only connected two-edge 3q interaction graphs are legal 3q merge
+        candidates. Logical triangles stay as separate 2q partitions.
         """
-        N = max(len(allparts), 1)
-        reuse_discount = float(reuse_discount or 0.0)
-        reuse_discount_cap = float(reuse_discount_cap or 0.0)
-        two_pair_penalty = (
-            sparse_penalty / 2.0
-            if two_pair_3q_penalty is None
-            else float(two_pair_3q_penalty)
+        max_edges = qgd_Partition_Aware_Mapping._max_edges_in_three_qubit_subtopologies(
+            topology
         )
-        triangle_free_penalty = float(triangle_free_3q_penalty or 0.0)
-        if dense_3q_penalty is None:
-            dense_penalty = (
-                triangle_free_penalty
-                if not qgd_Partition_Aware_Mapping._topology_has_triangle(topology)
-                else 0.0
-            )
-        else:
-            dense_penalty = float(dense_3q_penalty)
-
-        weights = []
+        support_and_pairs = (
+            qgd_Partition_Aware_Mapping._part_support_and_active_pairs
+        )
+        if max_edges <= 0:
+            filtered = []
+            for part in allparts:
+                qubits_in_part, _ = support_and_pairs(part, gate_dict)
+                if len(qubits_in_part) != 3:
+                    filtered.append(part)
+            return filtered
+
+        filtered = []
         for part in allparts:
-            qubits_in_part = set()
-            active_pairs = set()
-            multi_qubit_gate_count = 0
-
-            for gate_idx in part:
-                gate = gate_dict.get(gate_idx)
-                if gate is None:
-                    continue
-                qbs = list(gate.get_Involved_Qbits())
-                qubits_in_part.update(qbs)
-                if len(qbs) < 2:
-                    continue
-                multi_qubit_gate_count += 1
-                for a in range(len(qbs)):
-                    for b in range(a + 1, len(qbs)):
-                        active_pairs.add(
-                            (min(qbs[a], qbs[b]), max(qbs[a], qbs[b]))
-                        )
-
-            if len(qubits_in_part) != 3:
-                weights.append(0.0)
+            qubits_in_part, active_pairs = support_and_pairs(part, gate_dict)
+            if len(qubits_in_part) == 3 and (
+                len(active_pairs) < 2 or len(active_pairs) > max_edges
+            ):
                 continue
+            filtered.append(part)
+        return filtered
 
-            n_pairs = len(active_pairs)
-            if n_pairs <= 1:
-                penalty = float(sparse_penalty)
-            elif n_pairs == 2:
-                penalty = two_pair_penalty
-            else:
-                penalty = dense_penalty
-
-            extra_reuse = (
-                max(0, multi_qubit_gate_count - n_pairs)
-                if n_pairs >= 2
-                else 0
-            )
-            if extra_reuse and reuse_discount > 0:
-                discount = min(
-                    float(reuse_discount_cap),
-                    float(reuse_discount) * float(extra_reuse),
-                )
-                penalty = max(0.0, penalty - discount)
+    @staticmethod
+    def _parts_to_embeddable_edge_weights(allparts):
+        """Uniform conceptual cost: every allowed partition has weight 1.
 
-            weights.append(penalty / N)
-        return weights
+        ``ilp_global_optimal`` turns stored weight ``w`` into objective cost
+        ``1 + N*w``.  Returning zero therefore gives every already-filtered
+        2q part and embeddable 3q part the same conceptual cost of one.
+        """
+        return [0.0] * len(allparts)
 
     @staticmethod
     def _topo_key(mini_topology):
@@ -643,42 +563,27 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
         qbit_num_orig_circuit = working_circ.get_Qbit_Num()
         gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
+        if self.config.get('embeddable_3q_partitions_only', True):
+            allparts = self._filter_embeddable_three_qubit_parts(
+                allparts,
+                gate_dict,
+                self.topology,
+            )
 
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
-        # Route-aware weights let sparse 3q blocks lose to the 2q blocks they
-        # would otherwise replace.  Without weights, the ILP minimizes selected
-        # partition count and therefore over-selects 3q partitions.
+        # Invalid 3q merge candidates were removed above.  The remaining
+        # candidates all have conceptual cost 1: 2q parts cost 1, and 3q
+        # parts cost 1 only when their active-pair graph is embeddable.
         ilp_weights = None
-        if self.config.get('routing_aware_partitioning', True):
-            sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
-            ilp_weights = self._parts_to_routing_aware_weights(
-                allparts,
-                gate_dict,
-                topology=self.topology,
-                sparse_penalty=sparse_penalty,
-                two_pair_3q_penalty=self.config.get(
-                    'two_pair_3q_penalty', None
-                ),
-                dense_3q_penalty=self.config.get('dense_3q_penalty', None),
-                triangle_free_3q_penalty=self.config.get(
-                    'triangle_free_3q_penalty', 1.0
-                ),
-                reuse_discount=self.config.get(
-                    'three_qubit_reuse_discount', 0.15
-                ),
-                reuse_discount_cap=self.config.get(
-                    'three_qubit_reuse_discount_cap', 1.0
-                ),
-            )
-        elif self.config.get('size_density_weight', False):
-            sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
-            ilp_weights = self._parts_to_density_weights(
-                allparts, gate_dict, sparse_penalty=sparse_penalty
-            )
+        if (
+            self.config.get('routing_aware_partitioning', True)
+            and self.config.get('embeddable_3q_partitions_only', True)
+        ):
+            ilp_weights = self._parts_to_embeddable_edge_weights(allparts)
         L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----
@@ -1040,11 +945,7 @@ def _run_layout_trials(
         n_trials,
         random_seed,
     ):
-        use_boundary_beam = (
-            int(self.config.get("boundary_beam_width", 1) or 1) > 1
-            and int(self.config.get("boundary_beam_depth", 1) or 1) > 1
-        )
-        use_cpp = self.config.get('use_cpp_router', True) and not use_boundary_beam
+        use_cpp = self.config.get('use_cpp_router', True)
         if use_cpp:
             return self._run_layout_trials_cpp(
                 seeded_pi, DAG, IDAG, layout_partitions,
@@ -1142,6 +1043,14 @@ def _run_layout_trials_cpp(
             cfg.three_qubit_exit_weight = self.config.get(
                 'three_qubit_exit_weight', 1.0
             )
+        if hasattr(cfg, 'boundary_beam_width'):
+            cfg.boundary_beam_width = self.config.get(
+                'boundary_beam_width', 1
+            )
+        if hasattr(cfg, 'boundary_beam_depth'):
+            cfg.boundary_beam_depth = self.config.get(
+                'boundary_beam_depth', 1
+            )
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index fe2368fd2..95e8d4630 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -134,7 +134,9 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("decay_delta", &SabreConfig::decay_delta)
         .def_readwrite("swap_burst_budget", &SabreConfig::swap_burst_budget)
         .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
-        .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight);
+        .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight)
+        .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width)
+        .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From f2e90f4cab73134f259776376fecf04c0aeaa72f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 12:54:07 +0200
Subject: [PATCH 172/232] rework partitioning

---
 squander/synthesis/PartAM.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 792884a4d..09ddbaaab 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -272,14 +272,25 @@ def _filter_embeddable_three_qubit_parts(allparts, gate_dict, topology):
         return filtered
 
     @staticmethod
-    def _parts_to_embeddable_edge_weights(allparts):
-        """Uniform conceptual cost: every allowed partition has weight 1.
+    def _parts_to_active_edge_weights(allparts, gate_dict):
+        """Use active logical two-qubit edges as the structural part cost.
 
         ``ilp_global_optimal`` turns stored weight ``w`` into objective cost
-        ``1 + N*w``.  Returning zero therefore gives every already-filtered
-        2q part and embeddable 3q part the same conceptual cost of one.
+        ``1 + N*w``.  A 2q partition therefore has stored weight zero
+        because its active-edge cost is one.  A path-shaped 3q partition has
+        cost two, matching the two 2q edges it represents instead of making
+        the merge artificially cheaper.
         """
-        return [0.0] * len(allparts)
+        N = max(len(allparts), 1)
+        weights = []
+        for part in allparts:
+            _, active_pairs = (
+                qgd_Partition_Aware_Mapping
+                ._part_support_and_active_pairs(part, gate_dict)
+            )
+            cost = max(1, len(active_pairs))
+            weights.append((cost - 1) / N)
+        return weights
 
     @staticmethod
     def _topo_key(mini_topology):
@@ -576,14 +587,17 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
         # ---- Phase 2: ILP partition selection ----
         # Invalid 3q merge candidates were removed above.  The remaining
-        # candidates all have conceptual cost 1: 2q parts cost 1, and 3q
-        # parts cost 1 only when their active-pair graph is embeddable.
+        # candidates are priced by active logical two-qubit edges, so a valid
+        # 3q path is no cheaper than the two 2q interactions it represents.
         ilp_weights = None
         if (
             self.config.get('routing_aware_partitioning', True)
             and self.config.get('embeddable_3q_partitions_only', True)
         ):
-            ilp_weights = self._parts_to_embeddable_edge_weights(allparts)
+            ilp_weights = self._parts_to_active_edge_weights(
+                allparts,
+                gate_dict,
+            )
         L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----

From ee0987fb422e2741d59d128c0edf8b25c05099fa Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 13:23:22 +0200
Subject: [PATCH 173/232] revert partitioning

---
 squander/synthesis/PartAM.py | 152 ++++++++++++-----------------------
 1 file changed, 51 insertions(+), 101 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 09ddbaaab..b30a03e36 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -163,8 +163,8 @@ def __init__(self, config):
         self.config.setdefault('three_qubit_exit_weight', 1.0)
         self.config.setdefault('boundary_beam_width', 1)
         self.config.setdefault('boundary_beam_depth', 1)
-        self.config.setdefault('routing_aware_partitioning', True)
-        self.config.setdefault('embeddable_3q_partitions_only', True)
+        self.config.setdefault('size_density_weight', False)
+        self.config.setdefault('sparse_penalty', 3.0)
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -208,88 +208,43 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
     # ------------------------------------------------------------------------
 
     @staticmethod
-    def _max_edges_in_three_qubit_subtopologies(topology):
-        """Maximum edge count of any connected 3-node hardware subtopology."""
-        if not topology:
-            return 0
-        return max(
-            (
-                len({tuple(sorted(edge)) for edge in mini_topology})
-                for mini_topology in get_unique_subtopologies(topology, 3)
-            ),
-            default=0,
-        )
-
-    @staticmethod
-    def _part_support_and_active_pairs(part, gate_dict):
-        qubits_in_part = set()
-        active_pairs = set()
-        for gate_idx in part:
-            gate = gate_dict.get(gate_idx)
-            if gate is None:
-                continue
-            qbs = list(gate.get_Involved_Qbits())
-            qubits_in_part.update(qbs)
-            if len(qbs) < 2:
-                continue
-            for a in range(len(qbs)):
-                for b in range(a + 1, len(qbs)):
-                    active_pairs.add(
-                        (min(qbs[a], qbs[b]), max(qbs[a], qbs[b]))
-                    )
-        return qubits_in_part, active_pairs
-
-    @staticmethod
-    def _filter_embeddable_three_qubit_parts(allparts, gate_dict, topology):
-        """Keep only 3q parts whose active-pair graph fits hardware shape.
-
-        On a line, connected 3q hardware subtopologies have two edges, so
-        only connected two-edge 3q interaction graphs are legal 3q merge
-        candidates. Logical triangles stay as separate 2q partitions.
-        """
-        max_edges = qgd_Partition_Aware_Mapping._max_edges_in_three_qubit_subtopologies(
-            topology
-        )
-        support_and_pairs = (
-            qgd_Partition_Aware_Mapping._part_support_and_active_pairs
-        )
-        if max_edges <= 0:
-            filtered = []
-            for part in allparts:
-                qubits_in_part, _ = support_and_pairs(part, gate_dict)
-                if len(qubits_in_part) != 3:
-                    filtered.append(part)
-            return filtered
-
-        filtered = []
-        for part in allparts:
-            qubits_in_part, active_pairs = support_and_pairs(part, gate_dict)
-            if len(qubits_in_part) == 3 and (
-                len(active_pairs) < 2 or len(active_pairs) > max_edges
-            ):
-                continue
-            filtered.append(part)
-        return filtered
-
-    @staticmethod
-    def _parts_to_active_edge_weights(allparts, gate_dict):
-        """Use active logical two-qubit edges as the structural part cost.
-
-        ``ilp_global_optimal`` turns stored weight ``w`` into objective cost
-        ``1 + N*w``.  A 2q partition therefore has stored weight zero
-        because its active-edge cost is one.  A path-shaped 3q partition has
-        cost two, matching the two 2q edges it represents instead of making
-        the merge artificially cheaper.
+    def _parts_to_density_weights(allparts, gate_dict, sparse_penalty=3.0):
+        """Per-part ILP weights that penalise sparse 3-qubit partitions.
+
+        Penalty by active-pair count for a 3q partition:
+          1 pair  -> sparse_penalty        (e.g. 3 -> total ILP cost 4)
+          2 pairs -> sparse_penalty / 3    (e.g. 1 -> total ILP cost 2)
+          3 pairs -> 0                     (no penalty)
+        For 2q (or 1q) partitions the weight is always 0.
         """
         N = max(len(allparts), 1)
         weights = []
         for part in allparts:
-            _, active_pairs = (
-                qgd_Partition_Aware_Mapping
-                ._part_support_and_active_pairs(part, gate_dict)
-            )
-            cost = max(1, len(active_pairs))
-            weights.append((cost - 1) / N)
+            qubits_in_part = set()
+            for gate_idx in part:
+                gate = gate_dict.get(gate_idx)
+                if gate is not None:
+                    qubits_in_part.update(gate.get_Involved_Qbits())
+            if len(qubits_in_part) != 3:
+                weights.append(0.0)
+                continue
+            active_pairs = set()
+            for gate_idx in part:
+                gate = gate_dict.get(gate_idx)
+                if gate is None:
+                    continue
+                qbs = list(gate.get_Involved_Qbits())
+                for a in range(len(qbs)):
+                    for b in range(a + 1, len(qbs)):
+                        active_pairs.add((min(qbs[a], qbs[b]), max(qbs[a], qbs[b])))
+            n_pairs = len(active_pairs)
+            if n_pairs >= 3:
+                penalty = 0.0
+            elif n_pairs == 2:
+                penalty = sparse_penalty / 3.0
+            else:
+                penalty = sparse_penalty
+            weights.append(penalty / N)
         return weights
 
     @staticmethod
@@ -574,29 +529,22 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
         qbit_num_orig_circuit = working_circ.get_Qbit_Num()
         gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
-        if self.config.get('embeddable_3q_partitions_only', True):
-            allparts = self._filter_embeddable_three_qubit_parts(
-                allparts,
-                gate_dict,
-                self.topology,
-            )
 
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
         single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]}
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
-        # Invalid 3q merge candidates were removed above.  The remaining
-        # candidates are priced by active logical two-qubit edges, so a valid
-        # 3q path is no cheaper than the two 2q interactions it represents.
+        # Minimize total partition count so PAM gets the largest blocks possible
+        # under max_partition_size. Larger blocks = more (P_i, P_o) freedom to
+        # absorb routing SWAPs. Overlap-based tie-breaker (when enabled)
+        # picks deterministically among min-count covers, preferring covers
+        # whose parts share more logical qubits with their DAG successors.
         ilp_weights = None
-        if (
-            self.config.get('routing_aware_partitioning', True)
-            and self.config.get('embeddable_3q_partitions_only', True)
-        ):
-            ilp_weights = self._parts_to_active_edge_weights(
-                allparts,
-                gate_dict,
+        if self.config.get('size_density_weight', False):
+            sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
+            ilp_weights = self._parts_to_density_weights(
+                allparts, gate_dict, sparse_penalty=sparse_penalty
             )
         L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
@@ -719,9 +667,10 @@ def _run_parallel_synthesis(self, partition_meta):
                     meta['qbit_map'],
                 )
 
-            # ---- Stage 1: sweep all P_i (and all P_o for N==2 partitions) ----
-            # For N==2 there are only 4 permutation pairs total, so we enumerate
-            # them all here and skip Stage 2 for those partitions.
+            # ---- Stage 1: sweep all boundary permutations for small partitions ----
+            # For N<=3 the full (P_i, P_o) space is at most 36 pairs.  Routing
+            # needs that complete boundary-state set; otherwise 3q partitions
+            # expose less layout freedom than 2q partitions.
             stage1_futures = []
             stage1_cached = []
             known_pairs = {}
@@ -733,7 +682,7 @@ def _run_parallel_synthesis(self, partition_meta):
                 N = meta['N']
                 perms_all = list(permutations(range(N)))
                 for topology_idx, mini_topology in enumerate(meta['mini_topologies']):
-                    if N == 2:
+                    if N <= 3:
                         full_enum_keys.add((partition_idx, topology_idx))
                         po_sweep = perms_all
                     else:
@@ -784,7 +733,8 @@ def _run_parallel_synthesis(self, partition_meta):
                     )
 
             # ---- Stage 2: fix top-k P_i from Stage 1, sweep all P_o ----
-            # Skipped for partitions already fully enumerated in Stage 1 (N==2).
+            # Skipped for partitions already fully enumerated in Stage 1
+            # (currently all N<=3 partitions).
             top_k_pi = self.config.get('top_k_pi', 1)
             stage2_futures = []
             stage2_cached = []

From 64787d9d5fe84eb3d9e3370091b6e9c7dd7fd4d5 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 13:25:43 +0200
Subject: [PATCH 174/232] more config stuff

---
 squander/synthesis/PartAM.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index b30a03e36..719b1a4b5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -576,6 +576,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 involved.update(gate_dict[g].get_Involved_Qbits())
             size = len(involved)
             size_counts[size] = size_counts.get(size, 0) + 1
+        self._selected_partition_counts = dict(size_counts)
         print(f"Selected partitions: 2-qubit={size_counts.get(2, 0)}, 3-qubit={size_counts.get(3, 0)}, total_multi={sum(size_counts.get(s, 0) for s in size_counts if s > 1)}")
 
         # ---- Phase 4: Assemble partitioned circuit from selected partitions only ----

From a2bd2c03fe0e3f539e6a1c6fc3faf6ac2f2ef04b Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 16:13:46 +0200
Subject: [PATCH 175/232] new partitioning

---
 squander/synthesis/PartAM.py | 187 +++++++++++++++++++++++++++++++++--
 1 file changed, 180 insertions(+), 7 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 719b1a4b5..ff34d2908 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -165,6 +165,7 @@ def __init__(self, config):
         self.config.setdefault('boundary_beam_depth', 1)
         self.config.setdefault('size_density_weight', False)
         self.config.setdefault('sparse_penalty', 3.0)
+        self.config.setdefault('partition_weight_model', 'density')
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -247,6 +248,128 @@ def _parts_to_density_weights(allparts, gate_dict, sparse_penalty=3.0):
             weights.append(penalty / N)
         return weights
 
+    @staticmethod
+    def _part_support_and_active_pairs(part, gate_dict):
+        qubits_in_part = set()
+        active_pairs = set()
+        for gate_idx in part:
+            gate = gate_dict.get(gate_idx)
+            if gate is None:
+                continue
+            qbs = list(gate.get_Involved_Qbits())
+            qubits_in_part.update(qbs)
+            if len(qbs) < 2:
+                continue
+            for a in range(len(qbs)):
+                for b in range(a + 1, len(qbs)):
+                    active_pairs.add(
+                        (min(qbs[a], qbs[b]), max(qbs[a], qbs[b]))
+                    )
+        return frozenset(qubits_in_part), frozenset(active_pairs)
+
+    @staticmethod
+    def _turnover_between_supports(support_a, support_b):
+        if len(support_a) < 2 or len(support_b) < 2:
+            return None
+        return min(len(support_a), len(support_b)) - len(support_a & support_b)
+
+    @staticmethod
+    def _average_turnover(part_idx, part, neighbor_gate_sets,
+                          gate_to_parts, allparts, supports):
+        turnovers = []
+        for gate_set in neighbor_gate_sets:
+            for gate_idx in gate_set - part:
+                for other_idx in gate_to_parts.get(gate_idx, ()):
+                    if other_idx == part_idx:
+                        continue
+                    other_part = allparts[other_idx]
+                    if part & other_part:
+                        continue
+                    turnover = (
+                        qgd_Partition_Aware_Mapping._turnover_between_supports(
+                            supports[part_idx],
+                            supports[other_idx],
+                        )
+                    )
+                    if turnover is not None:
+                        turnovers.append(turnover)
+        if not turnovers:
+            return None
+        return sum(turnovers) / len(turnovers)
+
+    @staticmethod
+    def _parts_to_window_turnover_weights(allparts, gate_dict, g):
+        """Linear ILP weights for 3q window continuity.
+
+        Dense 3q blocks are only routing-friendly when their local qubit window
+        persists into adjacent work.  A block like (0, i, j) followed by
+        (0, k, l) replaces two qubits in the 3q window, which is exactly the
+        expensive pattern on a line.  This cost keeps 2q parts at conceptual
+        cost one and charges 3q parts for active-pair count plus average
+        predecessor/successor window turnover.
+        """
+        N = max(len(allparts), 1)
+        supports = []
+        active_pairs_by_part = []
+        for part in allparts:
+            support, active_pairs = (
+                qgd_Partition_Aware_Mapping._part_support_and_active_pairs(
+                    part,
+                    gate_dict,
+                )
+            )
+            supports.append(support)
+            active_pairs_by_part.append(active_pairs)
+
+        gate_to_parts = defaultdict(list)
+        for part_idx, part in enumerate(allparts):
+            for gate_idx in part:
+                gate_to_parts[gate_idx].append(part_idx)
+
+        rg = defaultdict(set)
+        for src, dsts in g.items():
+            for dst in dsts:
+                rg[dst].add(src)
+
+        weights = []
+        for part_idx, part in enumerate(allparts):
+            support = supports[part_idx]
+            active_pairs = active_pairs_by_part[part_idx]
+            if len(support) < 3:
+                weights.append(0.0)
+                continue
+
+            succ_gate_sets = [g.get(gate_idx, set()) for gate_idx in part]
+            pred_gate_sets = [rg.get(gate_idx, set()) for gate_idx in part]
+            succ_turnover = qgd_Partition_Aware_Mapping._average_turnover(
+                part_idx,
+                part,
+                succ_gate_sets,
+                gate_to_parts,
+                allparts,
+                supports,
+            )
+            pred_turnover = qgd_Partition_Aware_Mapping._average_turnover(
+                part_idx,
+                part,
+                pred_gate_sets,
+                gate_to_parts,
+                allparts,
+                supports,
+            )
+            boundary_turnover = len(support)
+            if succ_turnover is None:
+                succ_turnover = boundary_turnover
+            if pred_turnover is None:
+                pred_turnover = boundary_turnover
+            conceptual_cost = (
+                max(len(support), len(active_pairs), 1)
+                + succ_turnover
+                + pred_turnover
+            )
+            weights.append((conceptual_cost - 1.0) / N)
+        return weights
+
     @staticmethod
     def _topo_key(mini_topology):
         return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
@@ -535,13 +658,21 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
-        # Minimize total partition count so PAM gets the largest blocks possible
-        # under max_partition_size. Larger blocks = more (P_i, P_o) freedom to
-        # absorb routing SWAPs. Overlap-based tie-breaker (when enabled)
-        # picks deterministically among min-count covers, preferring covers
-        # whose parts share more logical qubits with their DAG successors.
+        # By default this minimizes partition count. Optional weight models can
+        # replace that unit cost with a routing-oriented conceptual cost while
+        # preserving a linear ILP objective.
         ilp_weights = None
-        if self.config.get('size_density_weight', False):
+        partition_weight_model = self.config.get(
+            'partition_weight_model',
+            'density',
+        )
+        if partition_weight_model == 'window_turnover':
+            ilp_weights = self._parts_to_window_turnover_weights(
+                allparts,
+                gate_dict,
+                g,
+            )
+        elif self.config.get('size_density_weight', False):
             sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
             ilp_weights = self._parts_to_density_weights(
                 allparts, gate_dict, sparse_penalty=sparse_penalty
@@ -577,7 +708,16 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             size = len(involved)
             size_counts[size] = size_counts.get(size, 0) + 1
         self._selected_partition_counts = dict(size_counts)
-        print(f"Selected partitions: 2-qubit={size_counts.get(2, 0)}, 3-qubit={size_counts.get(3, 0)}, total_multi={sum(size_counts.get(s, 0) for s in size_counts if s > 1)}")
+        if self.config.get('verbosity', 0) > 0:
+            selected_multi = sum(
+                count for size, count in size_counts.items() if size > 1
+            )
+            print(
+                "Selected partitions: "
+                f"2-qubit={size_counts.get(2, 0)}, "
+                f"3-qubit={size_counts.get(3, 0)}, "
+                f"total_multi={selected_multi}"
+            )
 
         # ---- Phase 4: Assemble partitioned circuit from selected partitions only ----
         partitioned_circuit = Circuit(qbit_num_orig_circuit)
@@ -1103,6 +1243,19 @@ def _restore_single_qubit_circuits(optimized_partitions, saved_circuits):
         for idx, orig in saved_circuits.items():
             optimized_partitions[idx].circuit = orig.copy()
 
+    @staticmethod
+    def _partition_order_cnot_breakdown(partition_order):
+        routing_cnot = 0
+        partition_cnot = 0
+        for part in partition_order:
+            if isinstance(part, Circuit):
+                routing_cnot += part.get_Gate_Nums().get('CNOT', 0)
+            elif isinstance(part, SingleQubitPartitionResult):
+                continue
+            else:
+                partition_cnot += int(getattr(part, 'cnot_count', 0))
+        return routing_cnot, partition_cnot
+
     def _partition_order_from_cpp_steps(
         self, steps, optimized_partitions, candidate_cache, N
     ):
@@ -1228,6 +1381,8 @@ def Partition_Aware_Mapping(
         do_cleanup = self.config.get('cleanup', True)
 
         routing_start = time.time()
+        routing_swap_cnot = 0
+        partition_body_cnot = 0
 
         if n_iterations == 0:
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
@@ -1244,6 +1399,9 @@ def Partition_Aware_Mapping(
             final_circuit, final_parameters = self.Construct_circuit_from_HS(
                 partition_order, optimized_partitions, N
             )
+            routing_swap_cnot, partition_body_cnot = (
+                self._partition_order_cnot_breakdown(partition_order)
+            )
 
         else:
             trial_results = self._run_layout_trials(
@@ -1302,6 +1460,8 @@ def Partition_Aware_Mapping(
                 best_pi = None
                 best_cost = float('inf')
                 best_pre_cleanup = None
+                best_routing_swap_cnot = 0
+                best_partition_body_cnot = 0
                 cleanup_total = 0.0
 
                 for _, trial_pi, _, trace_pi_init, route_steps in top_layouts:
@@ -1335,6 +1495,9 @@ def Partition_Aware_Mapping(
                     pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get(
                         'CNOT', 0
                     )
+                    trial_routing_cnot, trial_partition_cnot = (
+                        self._partition_order_cnot_breakdown(partition_order)
+                    )
 
                     cleanup_t0 = time.time()
                     cleaned_circuit, cleaned_params = wco.OptimizeWideCircuit(
@@ -1353,11 +1516,15 @@ def Partition_Aware_Mapping(
                         best_params = cleaned_params
                         best_pi_init = pi_init
                         best_pi = pi_out
+                        best_routing_swap_cnot = trial_routing_cnot
+                        best_partition_body_cnot = trial_partition_cnot
 
                 final_circuit = best_circuit
                 final_parameters = best_params
                 pi_initial = best_pi_init
                 pi = best_pi
+                routing_swap_cnot = best_routing_swap_cnot
+                partition_body_cnot = best_partition_body_cnot
 
             else:
                 _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
@@ -1389,6 +1556,9 @@ def Partition_Aware_Mapping(
                 final_circuit, final_parameters = self.Construct_circuit_from_HS(
                     partition_order, optimized_partitions, N
                 )
+                routing_swap_cnot, partition_body_cnot = (
+                    self._partition_order_cnot_breakdown(partition_order)
+                )
 
         if do_cleanup and n_iterations > 0:
             self._routing_time = time.time() - routing_start - cleanup_total
@@ -1416,6 +1586,9 @@ def Partition_Aware_Mapping(
                     final_circuit.get_Flat_Circuit(), final_parameters
                 )
 
+        self._routing_swap_cnot = routing_swap_cnot
+        self._partition_body_cnot = partition_body_cnot
+
         return final_circuit, final_parameters, pi_initial, pi
 
     # ------------------------------------------------------------------------

From 1f765633d9f0ec6c821e69596f3befe330d692c1 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 17:27:12 +0200
Subject: [PATCH 176/232] new weights

---
 squander/synthesis/PartAM.py | 178 ++++++++++++++++++++++++++++++++++-
 1 file changed, 177 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ff34d2908..75111db52 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -177,6 +177,7 @@ def __init__(self, config):
         self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
         self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
         self._adj = None          # Precomputed adjacency list (built by compute_distances_bfs)
+        self._decomp_cache = {}   # {(rounded unitary bytes, topology): synthesis result}
 
     # ------------------------------------------------------------------------
     # Caching Methods
@@ -370,6 +371,171 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g):
             weights.append((conceptual_cost - 1.0) / N)
         return weights
 
+    @staticmethod
+    def _subcircuit_from_gate_set(gates, gate_dict, parameters, go, rgo,
+                                  gate_to_qubit, qbit_num):
+        subcircuit = Circuit(qbit_num)
+        subparams = []
+        ordered_gates = _get_topo_order(
+            {gate_idx: go[gate_idx] & gates for gate_idx in gates},
+            {gate_idx: rgo[gate_idx] & gates for gate_idx in gates},
+            gate_to_qubit,
+        )
+        for gate_idx in ordered_gates:
+            gate = gate_dict[gate_idx]
+            subcircuit.add_Gate(gate)
+            start = gate.get_Parameter_Start_Index()
+            stop = start + gate.get_Parameter_Num()
+            subparams.append(parameters[start:stop])
+        return subcircuit, np.concatenate(subparams, axis=0)
+
+    def _meta_from_gate_set(self, gates, gate_dict, parameters, go, rgo,
+                            gate_to_qubit, qbit_num):
+        subcircuit, subparams = self._subcircuit_from_gate_set(
+            gates,
+            gate_dict,
+            parameters,
+            go,
+            rgo,
+            gate_to_qubit,
+            qbit_num,
+        )
+        involved_qbits = subcircuit.get_Qbits()
+        qbit_num_sub = len(involved_qbits)
+        qbit_map = {
+            involved_qbits[idx]: idx for idx in range(len(involved_qbits))
+        }
+        remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num_sub)
+        return {
+            'N': qbit_num_sub,
+            'circuit': remapped_subcircuit,
+            'params': subparams,
+            'mini_topologies': get_unique_subtopologies(
+                self.topology,
+                qbit_num_sub,
+            ),
+            'involved_qbits': involved_qbits,
+            'qbit_map': qbit_map,
+            'original_cnot_count': subcircuit.get_Gate_Nums().get('CNOT', 0),
+        }
+
+    @staticmethod
+    def _synthesis_score_pairs(N):
+        identity = tuple(range(N))
+        pairs = []
+        seen = set()
+        for perm in permutations(range(N)):
+            for pair in ((identity, tuple(perm)), (tuple(perm), identity)):
+                if pair in seen:
+                    continue
+                seen.add(pair)
+                pairs.append(pair)
+        return pairs
+
+    def _synthesis_score_fallback(self, meta):
+        best_cost = float('inf')
+        for mini_topology in meta['mini_topologies']:
+            fb_circuit, _ = self._qiskit_routing_fallback(meta, mini_topology)
+            if fb_circuit is not None:
+                best_cost = min(
+                    best_cost,
+                    fb_circuit.get_Gate_Nums().get('CNOT', 0),
+                )
+        if best_cost < float('inf'):
+            return best_cost
+        return max(1, int(meta.get('original_cnot_count', 1)))
+
+    def _parts_to_synthesis_cnot_weights(self, allparts, gate_dict, parameters,
+                                         go, rgo, gate_to_qubit, qbit_num):
+        """Linear ILP weights from measured SeqPAM CNOT cost.
+
+        Each candidate partition is synthesized over the input-identity and
+        output-identity boundary sweeps, i.e. up to 2*N! local decompositions
+        per local topology.  The selected partitions are later fully
+        enumerated by _run_parallel_synthesis, reusing the shared decomposition
+        cache populated here.
+        """
+        N_parts = max(len(allparts), 1)
+        metas = []
+        scores = [None] * len(allparts)
+
+        for part_idx, part in enumerate(allparts):
+            meta = self._meta_from_gate_set(
+                part,
+                gate_dict,
+                parameters,
+                go,
+                rgo,
+                gate_to_qubit,
+                qbit_num,
+            )
+            metas.append(meta)
+            if meta['N'] < 2:
+                scores[part_idx] = 0
+
+        disable_pbar = self.config.get('progressbar', 0) == False
+        futures = []
+        cached = []
+        n_cpus = _available_cpus()
+
+        with Pool(processes=n_cpus, initializer=_init_decompose_worker,
+                  initargs=(self.config,)) as pool:
+            for part_idx, meta in enumerate(metas):
+                if scores[part_idx] is not None:
+                    continue
+                pairs = self._synthesis_score_pairs(meta['N'])
+                for topology_idx, mini_topology in enumerate(
+                    meta['mini_topologies']
+                ):
+                    for P_i, P_o in pairs:
+                        Umtx = self._build_permuted_unitary(meta, P_i, P_o)
+                        ck = self._cache_key(Umtx, mini_topology)
+                        if ck in self._decomp_cache:
+                            cached.append((part_idx, ck))
+                        else:
+                            future = pool.apply_async(
+                                _decompose_one,
+                                (Umtx, mini_topology),
+                            )
+                            futures.append((part_idx, ck, future))
+
+            for part_idx, ck in cached:
+                _, _, synth_err = self._decomp_cache[ck]
+                if synth_err <= self.config['tolerance']:
+                    synth_circuit, _, _ = self._decomp_cache[ck]
+                    cnot_count = synth_circuit.get_Gate_Nums().get('CNOT', 0)
+                    if scores[part_idx] is None:
+                        scores[part_idx] = cnot_count
+                    else:
+                        scores[part_idx] = min(scores[part_idx], cnot_count)
+
+            for part_idx, ck, future in tqdm(
+                futures,
+                desc="Partition Weight Synthesis",
+                disable=disable_pbar,
+            ):
+                synth_circuit, synth_params, synth_err = future.get()
+                self._decomp_cache[ck] = (
+                    synth_circuit,
+                    synth_params,
+                    synth_err,
+                )
+                if synth_err <= self.config['tolerance']:
+                    cnot_count = synth_circuit.get_Gate_Nums().get('CNOT', 0)
+                    if scores[part_idx] is None:
+                        scores[part_idx] = cnot_count
+                    else:
+                        scores[part_idx] = min(scores[part_idx], cnot_count)
+
+        for part_idx, score in enumerate(scores):
+            if score is None:
+                scores[part_idx] = self._synthesis_score_fallback(
+                    metas[part_idx],
+                )
+
+        self._partition_synthesis_cnot_scores = list(scores)
+        return [(float(score) - 1.0) / N_parts for score in scores]
+
     @staticmethod
     def _topo_key(mini_topology):
         return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
@@ -672,6 +838,16 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 gate_dict,
                 g,
             )
+        elif partition_weight_model == 'synthesis_cnot':
+            ilp_weights = self._parts_to_synthesis_cnot_weights(
+                allparts,
+                gate_dict,
+                working_parameters,
+                go,
+                rgo,
+                gate_to_qubit,
+                qbit_num_orig_circuit,
+            )
         elif self.config.get('size_density_weight', False):
             sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
             ilp_weights = self._parts_to_density_weights(
@@ -794,7 +970,7 @@ def _run_parallel_synthesis(self, partition_meta):
         use_auts = self.config.get('use_automorphisms', True)
         disable_pbar = self.config.get('progressbar', 0) == False
         aut_cache = {}
-        decomp_cache = {}
+        decomp_cache = self._decomp_cache
 
         with Pool(processes=n_cpus, initializer=_init_decompose_worker,
                   initargs=(self.config,)) as pool:

From 2068a0902d4c9b4e32063ffde4402f234435bc73 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 18:55:06 +0200
Subject: [PATCH 177/232] better cost

---
 squander/synthesis/PartAM.py | 111 ++++++++++++++++++++++++++++++++++-
 1 file changed, 109 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 75111db52..072d4c013 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -371,6 +371,93 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g):
             weights.append((conceptual_cost - 1.0) / N)
         return weights
 
+    @staticmethod
+    def _side_window_turnover_cnot_cost(support, neighbor_support):
+        if len(support) < 3 or len(neighbor_support) < 2:
+            return None
+        entering_or_leaving = len(support - neighbor_support)
+        if entering_or_leaving == 0:
+            return 0.0
+
+        # A new qubit in a 3q window implies at least one SWAP on a line.
+        # If both sides are 3q candidates the boundary is seen from both
+        # candidate scores, so each side pays half of the 3-CNOT SWAP cost.
+        cnot_per_window_qubit = 1.5 if len(neighbor_support) >= 3 else 3.0
+        return cnot_per_window_qubit * entering_or_leaving
+
+    @staticmethod
+    def _average_window_cnot_cost(part_idx, part, neighbor_gate_sets,
+                                  gate_to_parts, allparts, supports):
+        costs = []
+        support = supports[part_idx]
+        turnover_cost = (
+            qgd_Partition_Aware_Mapping._side_window_turnover_cnot_cost
+        )
+        for gate_set in neighbor_gate_sets:
+            for gate_idx in gate_set - part:
+                for other_idx in gate_to_parts.get(gate_idx, ()):
+                    if other_idx == part_idx:
+                        continue
+                    other_part = allparts[other_idx]
+                    if part & other_part:
+                        continue
+                    cost = turnover_cost(support, supports[other_idx])
+                    if cost is not None:
+                        costs.append(cost)
+        if not costs:
+            return 0.0
+        return sum(costs) / len(costs)
+
+    @staticmethod
+    def _parts_to_window_turnover_cnot_costs(allparts, gate_dict, g):
+        supports = []
+        for part in allparts:
+            support, _ = (
+                qgd_Partition_Aware_Mapping._part_support_and_active_pairs(
+                    part,
+                    gate_dict,
+                )
+            )
+            supports.append(support)
+
+        gate_to_parts = defaultdict(list)
+        for part_idx, part in enumerate(allparts):
+            for gate_idx in part:
+                gate_to_parts[gate_idx].append(part_idx)
+
+        rg = defaultdict(set)
+        for src, dsts in g.items():
+            for dst in dsts:
+                rg[dst].add(src)
+
+        costs = []
+        for part_idx, part in enumerate(allparts):
+            support = supports[part_idx]
+            if len(support) < 3:
+                costs.append(0.0)
+                continue
+            succ_gate_sets = [g.get(gate_idx, set()) for gate_idx in part]
+            pred_gate_sets = [rg.get(gate_idx, set()) for gate_idx in part]
+            costs.append(
+                qgd_Partition_Aware_Mapping._average_window_cnot_cost(
+                    part_idx,
+                    part,
+                    pred_gate_sets,
+                    gate_to_parts,
+                    allparts,
+                    supports,
+                )
+                + qgd_Partition_Aware_Mapping._average_window_cnot_cost(
+                    part_idx,
+                    part,
+                    succ_gate_sets,
+                    gate_to_parts,
+                    allparts,
+                    supports,
+                )
+            )
+        return costs
+
     @staticmethod
     def _subcircuit_from_gate_set(gates, gate_dict, parameters, go, rgo,
                                   gate_to_qubit, qbit_num):
@@ -446,7 +533,9 @@ def _synthesis_score_fallback(self, meta):
         return max(1, int(meta.get('original_cnot_count', 1)))
 
     def _parts_to_synthesis_cnot_weights(self, allparts, gate_dict, parameters,
-                                         go, rgo, gate_to_qubit, qbit_num):
+                                         go, rgo, gate_to_qubit, qbit_num,
+                                         gate_dag=None,
+                                         include_window_route_cost=False):
         """Linear ILP weights from measured SeqPAM CNOT cost.
 
         Each candidate partition is synthesized over the input-identity and
@@ -533,6 +622,17 @@ def _parts_to_synthesis_cnot_weights(self, allparts, gate_dict, parameters,
                     metas[part_idx],
                 )
 
+        if include_window_route_cost and gate_dag is not None:
+            window_route_costs = self._parts_to_window_turnover_cnot_costs(
+                allparts,
+                gate_dict,
+                gate_dag,
+            )
+            scores = [
+                float(score) + window_route_costs[part_idx]
+                for part_idx, score in enumerate(scores)
+            ]
+
         self._partition_synthesis_cnot_scores = list(scores)
         return [(float(score) - 1.0) / N_parts for score in scores]
 
@@ -838,7 +938,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 gate_dict,
                 g,
             )
-        elif partition_weight_model == 'synthesis_cnot':
+        elif partition_weight_model in (
+            'synthesis_cnot',
+            'synthesis_route_cnot',
+        ):
             ilp_weights = self._parts_to_synthesis_cnot_weights(
                 allparts,
                 gate_dict,
@@ -847,6 +950,10 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
                 rgo,
                 gate_to_qubit,
                 qbit_num_orig_circuit,
+                gate_dag=g,
+                include_window_route_cost=(
+                    partition_weight_model == 'synthesis_route_cnot'
+                ),
             )
         elif self.config.get('size_density_weight', False):
             sparse_penalty = float(self.config.get('sparse_penalty', 3.0))

From e7b0638c4c054655a21ca2d352218a1aa3adf60f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 30 Apr 2026 21:16:10 +0200
Subject: [PATCH 178/232] Fix

---
 squander/synthesis/PartAM.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 072d4c013..2cc2255a3 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -543,6 +543,11 @@ def _parts_to_synthesis_cnot_weights(self, allparts, gate_dict, parameters,
         per local topology.  The selected partitions are later fully
         enumerated by _run_parallel_synthesis, reusing the shared decomposition
         cache populated here.
+
+        The ILP objective always keeps the original one-unit partition cost.
+        The measured CNOT score is an additional cost, not a replacement for
+        partition count; otherwise small local CNOT savings can fragment the
+        circuit into many routing boundaries.
         """
         N_parts = max(len(allparts), 1)
         metas = []
@@ -634,7 +639,7 @@ def _parts_to_synthesis_cnot_weights(self, allparts, gate_dict, parameters,
             ]
 
         self._partition_synthesis_cnot_scores = list(scores)
-        return [(float(score) - 1.0) / N_parts for score in scores]
+        return [float(score) / N_parts for score in scores]
 
     @staticmethod
     def _topo_key(mini_topology):

From 0589f0dd0b02050b2676f4621f6e6a5cce6041ff Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 01:13:27 +0200
Subject: [PATCH 179/232] hot swapping

---
 benchmark_PartAM.py                           | 184 ----------------
 .../sabre_router/include/sabre_router.hpp     |  22 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 175 +++++++++++++--
 squander/synthesis/PartAM.py                  | 199 +++++++++++++++++-
 squander/synthesis/bindings.cpp               |   3 +
 5 files changed, 379 insertions(+), 204 deletions(-)
 delete mode 100644 benchmark_PartAM.py

diff --git a/benchmark_PartAM.py b/benchmark_PartAM.py
deleted file mode 100644
index 6d7605781..000000000
--- a/benchmark_PartAM.py
+++ /dev/null
@@ -1,184 +0,0 @@
-"""
-Benchmark PartAM cleanup phase per circuit.
-
-Runs each circuit 5 times with PartAM (cleanup=True) and records:
-  - qubit count
-  - initial CNOT count (original QASM circuit)
-  - CNOT count before cleanup (post-synthesis, pre-cleanup)
-  - CNOT count after cleanup (final)
-  - decomposition error
-  - compilation time (seconds)
-
-Results are exported to benchmark_PartAM.csv.
-
-Usage:
-    conda activate qgd
-    python benchmark_PartAM.py
-"""
-
-import numpy as np
-import time
-import os
-import glob
-import csv
-import random
-
-from squander import Partition_Aware_Mapping
-from squander import utils
-from squander import Circuit
-
-N_RUNS = 3
-OUTPUT_CSV = "benchmark_PartAM_layout.csv"
-
-
-def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
-    num_qubits = circ.get_Qbit_Num()
-    matrix_size = 1 << num_qubits
-    rng = np.random.RandomState(0)
-    initial_state = rng.uniform(-1, 1, (matrix_size,)) + 1j * rng.uniform(-1, 1, (matrix_size,))
-    initial_state /= np.linalg.norm(initial_state)
-
-    original_state = initial_state.copy()
-    circ_orig.apply_to(parameters_orig, original_state)
-
-    circ_Final = Circuit(num_qubits)
-    output_perm_T = [0] * num_qubits
-    for i, j in enumerate(output_perm):
-        output_perm_T[j] = i
-    circ_Final.add_Permutation([int(x) for x in input_perm])
-    circ_Final.add_Circuit(circ)
-    circ_Final.add_Permutation(output_perm_T)
-
-    state = initial_state.copy()
-    circ_Final.apply_to(params, state)
-    return 1 - abs(np.vdot(state, original_state))
-
-
-def make_linear_topology(n_qubits):
-    return [(i, i + 1) for i in range(n_qubits - 1)]
-
-
-def run_once(circ_orig, parameters_orig, topology):
-    config = {
-        'strategy': "TreeSearch",
-        'test_subcircuits': False,
-        'test_final_circuit': False,
-        'max_partition_size': 3,
-        'progressbar': False,
-        'topology': topology,
-        'verbosity': 0,
-        'cleanup': True,
-        'sabre_iterations':20,
-        'n_layout_trials':128,
-        'random_seed':random.randint(1,100),
-        # Cheap candidate prefilter before full A* scoring.
-        'prefilter_top_k': 50,
-        'prefilter_min_per_partition': 2,
-        'prefilter_min_3q': 12,
-        # Rank every layout trial by actual constructed routing, not only by
-        # the heuristic trial cost.
-        'actual_routing_rank_top_k': None,
-        'top_k_pi': 1,
-        # Boundary-state beam routing is a Python prototype on this branch.
-        # Set width/depth to 1 to recover the greedy router.
-        'boundary_beam_width': 4,
-        'boundary_beam_depth': 3,
-        'cnot_cost': 0.5 / 3.0,  # old: swap_cost=15, local_cost_weight=1.0 -> 15:1 swap:cnot
-        'cleanup_top_k': 3,
-        "parallel_layout_trials": True,
-        "layout_trial_workers": 0,
-        'max_E_size': 40,
-        'max_lookahead': 6,
-        'E_weight': 0.3,
-        'E_alpha': 1.0,        # LightSABRE-style uniform lookahead (no per-depth decay)
-        'decay_delta': 0.001,
-        'swap_burst_budget': 5,
-        'path_tiebreak_weight': 0.2,
-        'three_qubit_exit_weight': 1.5,
-        'routing_aware_partitioning': True,
-        'sparse_penalty': 3.0,
-        'two_pair_3q_penalty': 1.5,
-        'triangle_free_3q_penalty': 1.0,
-        'three_qubit_reuse_discount': 0.15,
-        'three_qubit_reuse_discount_cap': 1.0,
-    }
-
-    # Clean the initial circuit using the same config pattern as in PartAM.py
-    from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
-    cleanup_config = dict(config)
-    cleanup_config['topology'] = None
-    cleanup_config['routed'] = False
-    cleanup_config['test_subcircuits'] = False
-    cleanup_config['test_final_circuit'] = False
-    cleanup_config['global_min'] = True
-    cleanup_config['pre-opt-strategy'] = 'TreeSearch'
-
-    wco = qgd_Wide_Circuit_Optimization(cleanup_config)
-    #circ_orig, parameters_orig = wco.OptimizeWideCircuit(circ_orig.get_Flat_Circuit(), parameters_orig)
-
-    start = time.time()
-    pam = Partition_Aware_Mapping(config)
-    circ, params, pi_in, pi_out = pam.Partition_Aware_Mapping(circ_orig.get_Flat_Circuit(), parameters_orig)
-    elapsed = time.time() - start
-    routing_time = pam._routing_time
-    cnot_before_cleanup = pam._cnot_pre_cleanup
-    cnot_after_cleanup = circ.get_Gate_Nums().get('CNOT', 0)
-    error = validate_result(circ_orig, parameters_orig, circ, params, pi_in, pi_out)
-
-    return cnot_before_cleanup, cnot_after_cleanup, error, elapsed, routing_time
-
-
-if __name__ == '__main__':
-    circs_dir = "circs"
-    qasm_files = sorted(glob.glob(os.path.join(circs_dir, "*.qasm")))
-
-    if not qasm_files:
-        print(f"No .qasm files found in {circs_dir}/")
-        exit(1)
-
-    print(f"Found {len(qasm_files)} circuits in {circs_dir}/")
-    print(f"Running {N_RUNS} times per circuit (cleanup=True)\n")
-
-    fieldnames = [
-        'circuit', 'n_qubits', 'run',
-        'initial_cnot', 'cnot_pre_cleanup', 'cnot_post_cleanup',
-        'error', 'time_s','routing_time_s'
-    ]
-
-    # Open CSV once and flush after each circuit so partial results are never lost
-    with open(OUTPUT_CSV, 'w', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=fieldnames)
-        writer.writeheader()
-
-        for filepath in qasm_files:
-            name = os.path.basename(filepath)
-            print(f"{'='*70}")
-            print(f"Circuit: {name}")
-
-            circ_orig, parameters_orig, _ = utils.qasm_to_squander_circuit(filepath)
-            n_qubits = circ_orig.get_Qbit_Num()
-            topology = make_linear_topology(n_qubits)
-
-            initial_cnot = circ_orig.get_Gate_Nums().get('CNOT', 0)
-            print(f"Qubits: {n_qubits}, Initial CNOTs: {initial_cnot}")
-            print(f"{'Run':>4} {'Pre-cleanup':>12} {'Post-cleanup':>12} {'Error':>12} {'Time(s)':>10} {'Routing time(s)':>10}")
-
-            for run_idx in range(N_RUNS):
-                cnot_pre, cnot_post, error, elapsed, routing_time = run_once(circ_orig, parameters_orig, topology)
-                print(f"{run_idx:>4} {cnot_pre:>12} {cnot_post:>12} {error:>12.2e} {elapsed:>10.1f} {routing_time:>10.1f}")
-                writer.writerow({
-                    'circuit': name,
-                    'n_qubits': n_qubits,
-                    'run': run_idx,
-                    'initial_cnot': initial_cnot,
-                    'cnot_pre_cleanup': cnot_pre,
-                    'cnot_post_cleanup': cnot_post,
-                    'error': error,
-                    'time_s': round(elapsed, 3),
-                    'routing_time_s': round(routing_time,3)
-                })
-                f.flush()
-
-            print()
-
-    print(f"Results saved to {OUTPUT_CSV}")
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 7a0154e23..14a11c25a 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -92,6 +92,9 @@ struct SabreConfig {
     int swap_burst_budget = 5; // Qiskit LightSABRE DECAY_RESET_INTERVAL
     double path_tiebreak_weight = 0.2;
     double three_qubit_exit_weight = 1.0;
+    double hot_qubit_swap_weight = 0.15;
+    double hot_qubit_active_discount = 0.35;
+    double hot_qubit_depth_decay = 0.7;
     int boundary_beam_width = 1;
     int boundary_beam_depth = 1;
 };
@@ -279,7 +282,8 @@ class SabreRouter {
         const std::vector<double>* decay = nullptr,
         std::vector<std::pair<int,int>>* out_swaps = nullptr,
         std::vector<int>* out_pi_new = nullptr,
-        const NeighborInfo* cached_neighbor_info = nullptr
+        const NeighborInfo* cached_neighbor_info = nullptr,
+        const std::vector<double>* hot_qubit_weights = nullptr
     ) const;
 
     // Route and update layout for a candidate (port of transform_pi)
@@ -312,6 +316,18 @@ class SabreRouter {
         double decay_factor = 1.0
     ) const;
 
+    std::vector<double> build_hot_qubit_weights(
+        const std::vector<int>& F_snapshot,
+        const std::vector<std::pair<int,int>>& E
+    ) const;
+
+    double hot_qubit_swap_tax(
+        const std::vector<std::pair<int,int>>& swaps,
+        const std::vector<int>& pi,
+        const CandidateData& cand,
+        const std::vector<double>& hot_qubit_weights
+    ) const;
+
     double future_partition_cost(
         int partition_idx,
         const std::vector<int>& pi,
@@ -377,13 +393,15 @@ class SabreRouter {
         const std::vector<double>& scores,
         const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
         const std::vector<std::vector<int>>& cached_pi,
+        const std::vector<int>& pi,
         const std::vector<int>& F_snapshot,
         const std::vector<uint8_t>& resolved,
         const std::vector<std::vector<int>>& children_graph,
         const std::vector<std::vector<int>>& parents_graph,
         bool reverse,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
-        SwapCache* swap_cache
+        SwapCache* swap_cache,
+        const std::vector<double>& hot_qubit_weights
     ) const;
 
     // Check if partition is single-qubit
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index d1444a26d..fc91d8f84 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -399,6 +399,108 @@ double SabreRouter::routing_objective(
     );
 }
 
+std::vector<double> SabreRouter::build_hot_qubit_weights(
+    const std::vector<int>& F_snapshot,
+    const std::vector<std::pair<int,int>>& E
+) const {
+    std::vector<double> weights(N_, 0.0);
+    if (config_.hot_qubit_swap_weight <= 0.0) {
+        return weights;
+    }
+
+    auto add_partition = [&](int partition_idx, double weight) {
+        if (
+            weight <= 0.0
+            || partition_idx < 0
+            || partition_idx >= static_cast<int>(layout_partitions_.size())
+            || layout_partitions_[partition_idx].is_single
+        ) {
+            return;
+        }
+        for (int q : layout_partitions_[partition_idx].involved_qbits) {
+            if (q >= 0 && q < N_) {
+                weights[q] += weight;
+            }
+        }
+    };
+
+    for (int partition_idx : F_snapshot) {
+        add_partition(partition_idx, 1.0);
+    }
+
+    const double depth_decay = std::max(0.0, config_.hot_qubit_depth_decay);
+    for (auto [partition_idx, depth] : E) {
+        add_partition(partition_idx, std::pow(depth_decay, std::max(0, depth)));
+    }
+
+    const double max_weight = *std::max_element(weights.begin(), weights.end());
+    if (max_weight <= 0.0) {
+        return weights;
+    }
+    for (double& weight : weights) {
+        weight /= max_weight;
+    }
+    return weights;
+}
+
+double SabreRouter::hot_qubit_swap_tax(
+    const std::vector<std::pair<int,int>>& swaps,
+    const std::vector<int>& pi,
+    const CandidateData& cand,
+    const std::vector<double>& hot_qubit_weights
+) const {
+    if (
+        swaps.empty()
+        || config_.hot_qubit_swap_weight <= 0.0
+        || hot_qubit_weights.empty()
+    ) {
+        return 0.0;
+    }
+
+    std::vector<int> p2v(N_, -1);
+    for (int q = 0; q < static_cast<int>(pi.size()); q++) {
+        const int p = pi[q];
+        if (p >= 0 && p < N_) {
+            p2v[p] = q;
+        }
+    }
+
+    std::vector<uint8_t> active(N_, 0);
+    for (int q : cand.involved_qbits) {
+        if (q >= 0 && q < N_) {
+            active[q] = 1;
+        }
+    }
+    const double active_discount = std::min(
+        1.0,
+        std::max(0.0, config_.hot_qubit_active_discount)
+    );
+
+    double tax = 0.0;
+    auto add_q = [&](int q) {
+        if (q < 0 || q >= static_cast<int>(hot_qubit_weights.size())) {
+            return;
+        }
+        double contribution = hot_qubit_weights[q];
+        if (q < static_cast<int>(active.size()) && active[q]) {
+            contribution *= active_discount;
+        }
+        tax += contribution;
+    };
+
+    for (auto [p1, p2] : swaps) {
+        if (p1 < 0 || p1 >= N_ || p2 < 0 || p2 >= N_) {
+            continue;
+        }
+        const int q1 = p2v[p1];
+        const int q2 = p2v[p2];
+        add_q(q1);
+        add_q(q2);
+        std::swap(p2v[p1], p2v[p2]);
+    }
+    return tax;
+}
+
 void SabreRouter::apply_decay_for_swaps(
     const std::vector<std::pair<int,int>>& swaps,
     std::vector<double>& decay
@@ -1184,7 +1286,8 @@ double SabreRouter::score_candidate(
     const std::vector<double>* decay,
     std::vector<std::pair<int,int>>* out_swaps,
     std::vector<int>* out_pi_new,
-    const NeighborInfo* cached_neighbor_info
+    const NeighborInfo* cached_neighbor_info,
+    const std::vector<double>* hot_qubit_weights
 ) const {
     NeighborInfo local_neighbor_info;
     const NeighborInfo* neighbor_ptr;
@@ -1213,6 +1316,14 @@ double SabreRouter::score_candidate(
         1.0,
         decay_factor
     );
+    if (hot_qubit_weights != nullptr && config_.hot_qubit_swap_weight > 0.0) {
+        score += config_.hot_qubit_swap_weight * hot_qubit_swap_tax(
+            swaps,
+            pi,
+            cand,
+            *hot_qubit_weights
+        );
+    }
 
     const int cand_idx = cand.partition_idx;
     double future_score = future_context_cost(
@@ -1430,13 +1541,15 @@ size_t SabreRouter::boundary_beam_select_index(
     const std::vector<double>& scores,
     const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
     const std::vector<std::vector<int>>& cached_pi,
+    const std::vector<int>& pi,
     const std::vector<int>& F_snapshot,
     const std::vector<uint8_t>& resolved,
     const std::vector<std::vector<int>>& children_graph,
     const std::vector<std::vector<int>>& parents_graph,
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
-    SwapCache* swap_cache
+    SwapCache* swap_cache,
+    const std::vector<double>& hot_qubit_weights
 ) const {
     size_t fallback_idx = 0;
     for (size_t i = 1; i < scores.size(); i++) {
@@ -1471,11 +1584,25 @@ size_t SabreRouter::boundary_beam_select_index(
         size_t first_idx;
     };
 
-    auto transition_cost = [&](const CandidateData& cand, size_t idx) {
-        return routing_objective(
-            static_cast<double>(cached_swaps[idx].size()),
+    auto transition_cost = [&](
+        const CandidateData& cand,
+        const std::vector<std::pair<int,int>>& swaps,
+        const std::vector<int>& pi_state,
+        const std::vector<double>& hot_weights
+    ) {
+        double cost = routing_objective(
+            static_cast<double>(swaps.size()),
             cand.cnot_count
         );
+        if (config_.hot_qubit_swap_weight > 0.0) {
+            cost += config_.hot_qubit_swap_weight * hot_qubit_swap_tax(
+                swaps,
+                pi_state,
+                cand,
+                hot_weights
+            );
+        }
+        return cost;
     };
 
     auto sort_states = [](const BeamState& a, const BeamState& b) {
@@ -1495,7 +1622,12 @@ size_t SabreRouter::boundary_beam_select_index(
             children_graph,
             parents_graph
         );
-        const double trans_cost = transition_cost(cand, idx);
+        const double trans_cost = transition_cost(
+            cand,
+            cached_swaps[idx],
+            pi,
+            hot_qubit_weights
+        );
         states.push_back(BeamState{
             scores[idx],
             trans_cost,
@@ -1536,6 +1668,10 @@ size_t SabreRouter::boundary_beam_select_index(
                 children_graph,
                 parents_graph
             );
+            auto rollout_hot_qubit_weights = build_hot_qubit_weights(
+                state.F,
+                E
+            );
 
             auto rollout_candidates = obtain_partition_candidates(state.F);
             if (rollout_candidates.empty()) {
@@ -1581,11 +1717,14 @@ size_t SabreRouter::boundary_beam_select_index(
                     nullptr,
                     &swaps,
                     &output_perm,
-                    &neighbor_info
+                    &neighbor_info,
+                    &rollout_hot_qubit_weights
                 );
-                const double trans_cost = routing_objective(
-                    static_cast<double>(swaps.size()),
-                    cand->cnot_count
+                const double trans_cost = transition_cost(
+                    *cand,
+                    swaps,
+                    state.pi,
+                    rollout_hot_qubit_weights
                 );
                 const double future_cost = score - trans_cost;
                 const double new_total = state.total_cost + trans_cost;
@@ -1737,6 +1876,7 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
 
         // Generate extended set
         auto E = generate_extended_set(F, resolved, cg, pg);
+        auto hot_qubit_weights = build_hot_qubit_weights(F, E);
 
         // Prefilter with a cheap estimate of the candidate's future context.
         auto candidates = prefilter_candidates(
@@ -1768,7 +1908,8 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
                 F, pi, E, reverse, canonical_data,
                 &swap_cache, &decay,
                 &cached_swaps[ci], &cached_pi[ci],
-                &cached_ni
+                &cached_ni,
+                &hot_qubit_weights
             );
         }
 
@@ -1778,13 +1919,15 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             scores,
             cached_swaps,
             cached_pi,
+            pi,
             F,
             resolved,
             cg,
             pg,
             reverse,
             canonical_data,
-            &swap_cache
+            &swap_cache,
+            hot_qubit_weights
         );
         const auto& best = *candidates[best_ci];
 
@@ -1806,6 +1949,14 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             1.0,
             decay_factor
         );
+        if (config_.hot_qubit_swap_weight > 0.0) {
+            total_cost += config_.hot_qubit_swap_weight * hot_qubit_swap_tax(
+                swaps,
+                pi,
+                best,
+                hot_qubit_weights
+            );
+        }
         if (route_trace) {
             if (!swaps.empty()) {
                 RouteStep swap_step;
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2cc2255a3..6ce60276d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -149,6 +149,9 @@ def __init__(self, config):
         self.config.setdefault('decay_delta', 0.001)  # Qiskit LightSABRE DECAY_RATE
         self.config.setdefault('swap_burst_budget', 5)  # Qiskit LightSABRE DECAY_RESET_INTERVAL
         self.config.setdefault('path_tiebreak_weight', 0.2)
+        self.config.setdefault('hot_qubit_swap_weight', 0.15)
+        self.config.setdefault('hot_qubit_active_discount', 0.35)
+        self.config.setdefault('hot_qubit_depth_decay', 0.7)
         # The neighbor heuristic is normalized to [0, 1] and added to A*'s f-value.
         # g-deltas are integer and h-deltas are half-integer, so preserving
         # swap-count optimality requires weight < 0.5.
@@ -1436,6 +1439,18 @@ def _run_layout_trials_cpp(
             cfg.three_qubit_exit_weight = self.config.get(
                 'three_qubit_exit_weight', 1.0
             )
+        if hasattr(cfg, 'hot_qubit_swap_weight'):
+            cfg.hot_qubit_swap_weight = self.config.get(
+                'hot_qubit_swap_weight', 0.15
+            )
+        if hasattr(cfg, 'hot_qubit_active_discount'):
+            cfg.hot_qubit_active_discount = self.config.get(
+                'hot_qubit_active_discount', 0.35
+            )
+        if hasattr(cfg, 'hot_qubit_depth_decay'):
+            cfg.hot_qubit_depth_decay = self.config.get(
+                'hot_qubit_depth_decay', 0.7
+            )
         if hasattr(cfg, 'boundary_beam_width'):
             cfg.boundary_beam_width = self.config.get(
                 'boundary_beam_width', 1
@@ -2295,6 +2310,92 @@ def add_edges(target_idx, edge_weight):
             "weight": weight,
         }
 
+    @staticmethod
+    def _build_hot_qubit_weights(
+        F,
+        E,
+        layout_partitions,
+        depth_decay=0.7,
+    ):
+        """Return normalized future-use weights by logical qubit.
+
+        Front-layer partitions count at full weight.  Extended-set partitions
+        are depth-decayed, making repeatedly reused near-future qubits more
+        expensive to move as bystanders.
+        """
+        if layout_partitions is None:
+            return {}
+
+        depth_decay = max(0.0, float(depth_decay))
+        weights = defaultdict(float)
+
+        def add_partition(partition_idx, weight):
+            if (
+                weight <= 0
+                or partition_idx < 0
+                or partition_idx >= len(layout_partitions)
+            ):
+                return
+            if qgd_Partition_Aware_Mapping._partition_is_single(
+                layout_partitions[partition_idx]
+            ):
+                return
+            for qbit in qgd_Partition_Aware_Mapping._partition_involved_qbits(
+                layout_partitions[partition_idx]
+            ):
+                weights[int(qbit)] += weight
+
+        for partition_idx in F:
+            add_partition(int(partition_idx), 1.0)
+
+        if E:
+            for partition_idx, depth in E:
+                add_partition(
+                    int(partition_idx),
+                    depth_decay ** max(0, int(depth)),
+                )
+
+        if not weights:
+            return {}
+        max_weight = max(weights.values())
+        if max_weight <= 0:
+            return {}
+        return {qbit: weight / max_weight for qbit, weight in weights.items()}
+
+    @staticmethod
+    def _hot_qubit_swap_tax(
+        swaps,
+        pi,
+        hot_qubit_weights,
+        active_qbits=None,
+        active_discount=0.35,
+    ):
+        if not swaps or not hot_qubit_weights:
+            return 0.0
+
+        active_discount = min(1.0, max(0.0, float(active_discount)))
+        active = set(int(qbit) for qbit in (active_qbits or ()))
+        pi_list = [int(x) for x in pi]
+        p2v = [None] * len(pi_list)
+        for logical_q, physical_q in enumerate(pi_list):
+            p2v[int(physical_q)] = logical_q
+
+        tax = 0.0
+        for p1, p2 in swaps:
+            p1 = int(p1)
+            p2 = int(p2)
+            q1 = p2v[p1]
+            q2 = p2v[p2]
+            for qbit in (q1, q2):
+                if qbit is None:
+                    continue
+                contribution = float(hot_qubit_weights.get(int(qbit), 0.0))
+                if qbit in active:
+                    contribution *= active_discount
+                tax += contribution
+            p2v[p1], p2v[p2] = q2, q1
+        return tax
+
     def _advance_layout_frontier(
         self,
         selected_partition_idx,
@@ -2339,6 +2440,7 @@ def _boundary_beam_select_index(
         scores,
         cached_swaps,
         cached_pi,
+        pi,
         F_snapshot,
         resolved_partitions,
         DAG,
@@ -2353,6 +2455,7 @@ def _boundary_beam_select_index(
         alpha=1.0,
         cnot_cost=1.0 / 3.0,
         adj=None,
+        hot_qubit_weights=None,
     ):
         """Choose the next candidate by rolling out boundary-layout states.
 
@@ -2373,19 +2476,36 @@ def _boundary_beam_select_index(
         top_k = self.config.get("prefilter_top_k", 50)
         path_weight = self.config.get("path_tiebreak_weight", 0.2)
         three_q_weight = self.config.get("three_qubit_exit_weight", 1.0)
+        hot_weight = self.config.get("hot_qubit_swap_weight", 0.15)
+        hot_discount = self.config.get("hot_qubit_active_discount", 0.35)
+        hot_decay = self.config.get("hot_qubit_depth_decay", 0.7)
 
-        def transition_cost(cand, swaps):
-            return self._routing_objective(
+        def transition_cost(cand, swaps, pi_state, hot_weights):
+            route_score = self._routing_objective(
                 len(swaps or ()),
                 cand.cnot_count,
                 cnot_cost,
             )
+            if hot_weight <= 0:
+                return route_score
+            return route_score + hot_weight * self._hot_qubit_swap_tax(
+                swaps,
+                pi_state,
+                hot_weights,
+                active_qbits=cand.involved_qbits,
+                active_discount=hot_discount,
+            )
 
         states = []
         for idx, cand in enumerate(partition_candidates):
             if cached_pi[idx] is None:
                 continue
-            trans_cost = transition_cost(cand, cached_swaps[idx])
+            trans_cost = transition_cost(
+                cand,
+                cached_swaps[idx],
+                pi,
+                hot_qubit_weights,
+            )
             F_next, resolved_next = self._advance_layout_frontier(
                 cand.partition_idx,
                 F_snapshot,
@@ -2431,6 +2551,12 @@ def transition_cost(cand, swaps):
                     max_E_size=max_E_size,
                     max_lookahead=max_lookahead,
                 )
+                rollout_hot_weights = self._build_hot_qubit_weights(
+                    F_state,
+                    E,
+                    optimized_partitions,
+                    depth_decay=hot_decay,
+                )
                 candidates = self.obtain_partition_candidates(
                     F_list,
                     optimized_partitions,
@@ -2488,8 +2614,16 @@ def transition_cost(cand, swaps):
                         layout_partitions=optimized_partitions,
                         return_transforms=True,
                         three_qubit_exit_weight=three_q_weight,
+                        hot_qubit_weights=rollout_hot_weights,
+                        hot_qubit_swap_weight=hot_weight,
+                        hot_qubit_active_discount=hot_discount,
+                    )
+                    trans_cost = transition_cost(
+                        cand,
+                        swaps,
+                        pi_state,
+                        rollout_hot_weights,
                     )
-                    trans_cost = transition_cost(cand, swaps)
                     future_cost = float(score) - trans_cost
                     new_total = total_cost + trans_cost
                     rank_cost = new_total + future_cost
@@ -2627,6 +2761,12 @@ def Heuristic_Search(
                 max_E_size=max_E_size,
                 max_lookahead=max_lookahead,
             )
+            hot_qubit_weights = self._build_hot_qubit_weights(
+                F_snapshot,
+                E,
+                optimized_partitions,
+                depth_decay=self.config.get("hot_qubit_depth_decay", 0.7),
+            )
 
             partition_candidates = self.obtain_partition_candidates(
                 F,
@@ -2700,6 +2840,13 @@ def Heuristic_Search(
                     three_qubit_exit_weight=self.config.get(
                         "three_qubit_exit_weight", 1.0
                     ),
+                    hot_qubit_weights=hot_qubit_weights,
+                    hot_qubit_swap_weight=self.config.get(
+                        "hot_qubit_swap_weight", 0.15
+                    ),
+                    hot_qubit_active_discount=self.config.get(
+                        "hot_qubit_active_discount", 0.35
+                    ),
                 )
                 scores[ci] = score
                 cached_swaps[ci] = swaps
@@ -2710,6 +2857,7 @@ def Heuristic_Search(
                 scores,
                 cached_swaps,
                 cached_pi,
+                pi,
                 F_snapshot,
                 resolved_partitions,
                 DAG,
@@ -2723,6 +2871,7 @@ def Heuristic_Search(
                 alpha=E_alpha,
                 cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0),
                 adj=self._adj,
+                hot_qubit_weights=hot_qubit_weights,
             )
             min_partition_candidate = partition_candidates[best_idx]
 
@@ -2825,6 +2974,9 @@ def _heuristic_search_layout_only(
         E_alpha = self.config.get("E_alpha", 1.0)
         cnot_cost = self.config.get("cnot_cost", 1.0 / 3.0)
         swap_burst_budget = self.config.get("swap_burst_budget", 5)
+        hot_weight = self.config.get("hot_qubit_swap_weight", 0.15)
+        hot_discount = self.config.get("hot_qubit_active_discount", 0.35)
+        hot_decay = self.config.get("hot_qubit_depth_decay", 0.7)
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=reverse
@@ -2863,6 +3015,12 @@ def _heuristic_search_layout_only(
                 max_E_size=max_E_size,
                 max_lookahead=max_lookahead,
             )
+            hot_qubit_weights = self._build_hot_qubit_weights(
+                F_snapshot,
+                E,
+                optimized_partitions,
+                depth_decay=hot_decay,
+            )
 
             partition_candidates = self.obtain_partition_candidates(
                 F,
@@ -2938,6 +3096,9 @@ def _heuristic_search_layout_only(
                     three_qubit_exit_weight=self.config.get(
                         "three_qubit_exit_weight", 1.0
                     ),
+                    hot_qubit_weights=hot_qubit_weights,
+                    hot_qubit_swap_weight=hot_weight,
+                    hot_qubit_active_discount=hot_discount,
                 )
                 scores[ci] = score
                 cached_swaps[ci] = swaps
@@ -2948,6 +3109,7 @@ def _heuristic_search_layout_only(
                 scores,
                 cached_swaps,
                 cached_pi,
+                pi,
                 F_snapshot,
                 resolved_partitions,
                 DAG,
@@ -2962,12 +3124,14 @@ def _heuristic_search_layout_only(
                 alpha=E_alpha,
                 cnot_cost=cnot_cost,
                 adj=self._adj,
+                hot_qubit_weights=hot_qubit_weights,
             )
             best = partition_candidates[best_idx]
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
-            swaps, pi = cached_swaps[best_idx], cached_pi[best_idx]
+            swaps = cached_swaps[best_idx]
+            pi_next = cached_pi[best_idx]
             decay_factor = 1.0
             if swaps:
                 decay_factor = self._decay_factor_for_swaps(swaps, decay)
@@ -2977,6 +3141,15 @@ def _heuristic_search_layout_only(
                 cnot_cost,
                 decay_factor=decay_factor,
             )
+            if hot_weight > 0:
+                total_cost += hot_weight * self._hot_qubit_swap_tax(
+                    swaps,
+                    pi,
+                    hot_qubit_weights,
+                    active_qbits=best.involved_qbits,
+                    active_discount=hot_discount,
+                )
+            pi = pi_next
             if swaps:
                 self._apply_decay_for_swaps(swaps, decay)
                 swap_heavy_partitions += 1
@@ -3104,7 +3277,10 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   candidate_cache=None,
                                   layout_partitions=None,
                                   return_transforms=False,
-                                  three_qubit_exit_weight=1.0):
+                                  three_qubit_exit_weight=1.0,
+                                  hot_qubit_weights=None,
+                                  hot_qubit_swap_weight=0.0,
+                                  hot_qubit_active_discount=0.35):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = |swaps|
@@ -3145,6 +3321,17 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             cnot_cost,
             decay_factor=decay_factor,
         )
+        if hot_qubit_swap_weight > 0:
+            score += (
+                float(hot_qubit_swap_weight)
+                * qgd_Partition_Aware_Mapping._hot_qubit_swap_tax(
+                    swaps,
+                    pi,
+                    hot_qubit_weights,
+                    active_qbits=partition_candidate.involved_qbits,
+                    active_discount=hot_qubit_active_discount,
+                )
+            )
 
         if candidate_cache is None:
             if return_transforms:
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 95e8d4630..ca20b8105 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -135,6 +135,9 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("swap_burst_budget", &SabreConfig::swap_burst_budget)
         .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
         .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight)
+        .def_readwrite("hot_qubit_swap_weight", &SabreConfig::hot_qubit_swap_weight)
+        .def_readwrite("hot_qubit_active_discount", &SabreConfig::hot_qubit_active_discount)
+        .def_readwrite("hot_qubit_depth_decay", &SabreConfig::hot_qubit_depth_decay)
         .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width)
         .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth);
 

From 05b0f5c9782ed02da15f45195a27c4046d89a048 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 01:47:58 +0200
Subject: [PATCH 180/232] Revert "hot swapping"

This reverts commit 0589f0dd0b02050b2676f4621f6e6a5cce6041ff.
---
 benchmark_PartAM.py                           | 184 ++++++++++++++++
 .../sabre_router/include/sabre_router.hpp     |  22 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 175 ++-------------
 squander/synthesis/PartAM.py                  | 199 +-----------------
 squander/synthesis/bindings.cpp               |   3 -
 5 files changed, 204 insertions(+), 379 deletions(-)
 create mode 100644 benchmark_PartAM.py

diff --git a/benchmark_PartAM.py b/benchmark_PartAM.py
new file mode 100644
index 000000000..6d7605781
--- /dev/null
+++ b/benchmark_PartAM.py
@@ -0,0 +1,184 @@
+"""
+Benchmark PartAM cleanup phase per circuit.
+
+Runs each circuit 5 times with PartAM (cleanup=True) and records:
+  - qubit count
+  - initial CNOT count (original QASM circuit)
+  - CNOT count before cleanup (post-synthesis, pre-cleanup)
+  - CNOT count after cleanup (final)
+  - decomposition error
+  - compilation time (seconds)
+
+Results are exported to benchmark_PartAM.csv.
+
+Usage:
+    conda activate qgd
+    python benchmark_PartAM.py
+"""
+
+import numpy as np
+import time
+import os
+import glob
+import csv
+import random
+
+from squander import Partition_Aware_Mapping
+from squander import utils
+from squander import Circuit
+
+N_RUNS = 3
+OUTPUT_CSV = "benchmark_PartAM_layout.csv"
+
+
+def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
+    num_qubits = circ.get_Qbit_Num()
+    matrix_size = 1 << num_qubits
+    rng = np.random.RandomState(0)
+    initial_state = rng.uniform(-1, 1, (matrix_size,)) + 1j * rng.uniform(-1, 1, (matrix_size,))
+    initial_state /= np.linalg.norm(initial_state)
+
+    original_state = initial_state.copy()
+    circ_orig.apply_to(parameters_orig, original_state)
+
+    circ_Final = Circuit(num_qubits)
+    output_perm_T = [0] * num_qubits
+    for i, j in enumerate(output_perm):
+        output_perm_T[j] = i
+    circ_Final.add_Permutation([int(x) for x in input_perm])
+    circ_Final.add_Circuit(circ)
+    circ_Final.add_Permutation(output_perm_T)
+
+    state = initial_state.copy()
+    circ_Final.apply_to(params, state)
+    return 1 - abs(np.vdot(state, original_state))
+
+
+def make_linear_topology(n_qubits):
+    return [(i, i + 1) for i in range(n_qubits - 1)]
+
+
+def run_once(circ_orig, parameters_orig, topology):
+    config = {
+        'strategy': "TreeSearch",
+        'test_subcircuits': False,
+        'test_final_circuit': False,
+        'max_partition_size': 3,
+        'progressbar': False,
+        'topology': topology,
+        'verbosity': 0,
+        'cleanup': True,
+        'sabre_iterations':20,
+        'n_layout_trials':128,
+        'random_seed':random.randint(1,100),
+        # Cheap candidate prefilter before full A* scoring.
+        'prefilter_top_k': 50,
+        'prefilter_min_per_partition': 2,
+        'prefilter_min_3q': 12,
+        # Rank every layout trial by actual constructed routing, not only by
+        # the heuristic trial cost.
+        'actual_routing_rank_top_k': None,
+        'top_k_pi': 1,
+        # Boundary-state beam routing is a Python prototype on this branch.
+        # Set width/depth to 1 to recover the greedy router.
+        'boundary_beam_width': 4,
+        'boundary_beam_depth': 3,
+        'cnot_cost': 0.5 / 3.0,  # old: swap_cost=15, local_cost_weight=1.0 -> 15:1 swap:cnot
+        'cleanup_top_k': 3,
+        "parallel_layout_trials": True,
+        "layout_trial_workers": 0,
+        'max_E_size': 40,
+        'max_lookahead': 6,
+        'E_weight': 0.3,
+        'E_alpha': 1.0,        # LightSABRE-style uniform lookahead (no per-depth decay)
+        'decay_delta': 0.001,
+        'swap_burst_budget': 5,
+        'path_tiebreak_weight': 0.2,
+        'three_qubit_exit_weight': 1.5,
+        'routing_aware_partitioning': True,
+        'sparse_penalty': 3.0,
+        'two_pair_3q_penalty': 1.5,
+        'triangle_free_3q_penalty': 1.0,
+        'three_qubit_reuse_discount': 0.15,
+        'three_qubit_reuse_discount_cap': 1.0,
+    }
+
+    # Clean the initial circuit using the same config pattern as in PartAM.py
+    from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
+    cleanup_config = dict(config)
+    cleanup_config['topology'] = None
+    cleanup_config['routed'] = False
+    cleanup_config['test_subcircuits'] = False
+    cleanup_config['test_final_circuit'] = False
+    cleanup_config['global_min'] = True
+    cleanup_config['pre-opt-strategy'] = 'TreeSearch'
+
+    wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+    #circ_orig, parameters_orig = wco.OptimizeWideCircuit(circ_orig.get_Flat_Circuit(), parameters_orig)
+
+    start = time.time()
+    pam = Partition_Aware_Mapping(config)
+    circ, params, pi_in, pi_out = pam.Partition_Aware_Mapping(circ_orig.get_Flat_Circuit(), parameters_orig)
+    elapsed = time.time() - start
+    routing_time = pam._routing_time
+    cnot_before_cleanup = pam._cnot_pre_cleanup
+    cnot_after_cleanup = circ.get_Gate_Nums().get('CNOT', 0)
+    error = validate_result(circ_orig, parameters_orig, circ, params, pi_in, pi_out)
+
+    return cnot_before_cleanup, cnot_after_cleanup, error, elapsed, routing_time
+
+
+if __name__ == '__main__':
+    circs_dir = "circs"
+    qasm_files = sorted(glob.glob(os.path.join(circs_dir, "*.qasm")))
+
+    if not qasm_files:
+        print(f"No .qasm files found in {circs_dir}/")
+        exit(1)
+
+    print(f"Found {len(qasm_files)} circuits in {circs_dir}/")
+    print(f"Running {N_RUNS} times per circuit (cleanup=True)\n")
+
+    fieldnames = [
+        'circuit', 'n_qubits', 'run',
+        'initial_cnot', 'cnot_pre_cleanup', 'cnot_post_cleanup',
+        'error', 'time_s','routing_time_s'
+    ]
+
+    # Open CSV once and flush after each circuit so partial results are never lost
+    with open(OUTPUT_CSV, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=fieldnames)
+        writer.writeheader()
+
+        for filepath in qasm_files:
+            name = os.path.basename(filepath)
+            print(f"{'='*70}")
+            print(f"Circuit: {name}")
+
+            circ_orig, parameters_orig, _ = utils.qasm_to_squander_circuit(filepath)
+            n_qubits = circ_orig.get_Qbit_Num()
+            topology = make_linear_topology(n_qubits)
+
+            initial_cnot = circ_orig.get_Gate_Nums().get('CNOT', 0)
+            print(f"Qubits: {n_qubits}, Initial CNOTs: {initial_cnot}")
+            print(f"{'Run':>4} {'Pre-cleanup':>12} {'Post-cleanup':>12} {'Error':>12} {'Time(s)':>10} {'Routing time(s)':>10}")
+
+            for run_idx in range(N_RUNS):
+                cnot_pre, cnot_post, error, elapsed, routing_time = run_once(circ_orig, parameters_orig, topology)
+                print(f"{run_idx:>4} {cnot_pre:>12} {cnot_post:>12} {error:>12.2e} {elapsed:>10.1f} {routing_time:>10.1f}")
+                writer.writerow({
+                    'circuit': name,
+                    'n_qubits': n_qubits,
+                    'run': run_idx,
+                    'initial_cnot': initial_cnot,
+                    'cnot_pre_cleanup': cnot_pre,
+                    'cnot_post_cleanup': cnot_post,
+                    'error': error,
+                    'time_s': round(elapsed, 3),
+                    'routing_time_s': round(routing_time,3)
+                })
+                f.flush()
+
+            print()
+
+    print(f"Results saved to {OUTPUT_CSV}")
diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 14a11c25a..7a0154e23 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -92,9 +92,6 @@ struct SabreConfig {
     int swap_burst_budget = 5; // Qiskit LightSABRE DECAY_RESET_INTERVAL
     double path_tiebreak_weight = 0.2;
     double three_qubit_exit_weight = 1.0;
-    double hot_qubit_swap_weight = 0.15;
-    double hot_qubit_active_discount = 0.35;
-    double hot_qubit_depth_decay = 0.7;
     int boundary_beam_width = 1;
     int boundary_beam_depth = 1;
 };
@@ -282,8 +279,7 @@ class SabreRouter {
         const std::vector<double>* decay = nullptr,
         std::vector<std::pair<int,int>>* out_swaps = nullptr,
         std::vector<int>* out_pi_new = nullptr,
-        const NeighborInfo* cached_neighbor_info = nullptr,
-        const std::vector<double>* hot_qubit_weights = nullptr
+        const NeighborInfo* cached_neighbor_info = nullptr
     ) const;
 
     // Route and update layout for a candidate (port of transform_pi)
@@ -316,18 +312,6 @@ class SabreRouter {
         double decay_factor = 1.0
     ) const;
 
-    std::vector<double> build_hot_qubit_weights(
-        const std::vector<int>& F_snapshot,
-        const std::vector<std::pair<int,int>>& E
-    ) const;
-
-    double hot_qubit_swap_tax(
-        const std::vector<std::pair<int,int>>& swaps,
-        const std::vector<int>& pi,
-        const CandidateData& cand,
-        const std::vector<double>& hot_qubit_weights
-    ) const;
-
     double future_partition_cost(
         int partition_idx,
         const std::vector<int>& pi,
@@ -393,15 +377,13 @@ class SabreRouter {
         const std::vector<double>& scores,
         const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
         const std::vector<std::vector<int>>& cached_pi,
-        const std::vector<int>& pi,
         const std::vector<int>& F_snapshot,
         const std::vector<uint8_t>& resolved,
         const std::vector<std::vector<int>>& children_graph,
         const std::vector<std::vector<int>>& parents_graph,
         bool reverse,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
-        SwapCache* swap_cache,
-        const std::vector<double>& hot_qubit_weights
+        SwapCache* swap_cache
     ) const;
 
     // Check if partition is single-qubit
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index fc91d8f84..d1444a26d 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -399,108 +399,6 @@ double SabreRouter::routing_objective(
     );
 }
 
-std::vector<double> SabreRouter::build_hot_qubit_weights(
-    const std::vector<int>& F_snapshot,
-    const std::vector<std::pair<int,int>>& E
-) const {
-    std::vector<double> weights(N_, 0.0);
-    if (config_.hot_qubit_swap_weight <= 0.0) {
-        return weights;
-    }
-
-    auto add_partition = [&](int partition_idx, double weight) {
-        if (
-            weight <= 0.0
-            || partition_idx < 0
-            || partition_idx >= static_cast<int>(layout_partitions_.size())
-            || layout_partitions_[partition_idx].is_single
-        ) {
-            return;
-        }
-        for (int q : layout_partitions_[partition_idx].involved_qbits) {
-            if (q >= 0 && q < N_) {
-                weights[q] += weight;
-            }
-        }
-    };
-
-    for (int partition_idx : F_snapshot) {
-        add_partition(partition_idx, 1.0);
-    }
-
-    const double depth_decay = std::max(0.0, config_.hot_qubit_depth_decay);
-    for (auto [partition_idx, depth] : E) {
-        add_partition(partition_idx, std::pow(depth_decay, std::max(0, depth)));
-    }
-
-    const double max_weight = *std::max_element(weights.begin(), weights.end());
-    if (max_weight <= 0.0) {
-        return weights;
-    }
-    for (double& weight : weights) {
-        weight /= max_weight;
-    }
-    return weights;
-}
-
-double SabreRouter::hot_qubit_swap_tax(
-    const std::vector<std::pair<int,int>>& swaps,
-    const std::vector<int>& pi,
-    const CandidateData& cand,
-    const std::vector<double>& hot_qubit_weights
-) const {
-    if (
-        swaps.empty()
-        || config_.hot_qubit_swap_weight <= 0.0
-        || hot_qubit_weights.empty()
-    ) {
-        return 0.0;
-    }
-
-    std::vector<int> p2v(N_, -1);
-    for (int q = 0; q < static_cast<int>(pi.size()); q++) {
-        const int p = pi[q];
-        if (p >= 0 && p < N_) {
-            p2v[p] = q;
-        }
-    }
-
-    std::vector<uint8_t> active(N_, 0);
-    for (int q : cand.involved_qbits) {
-        if (q >= 0 && q < N_) {
-            active[q] = 1;
-        }
-    }
-    const double active_discount = std::min(
-        1.0,
-        std::max(0.0, config_.hot_qubit_active_discount)
-    );
-
-    double tax = 0.0;
-    auto add_q = [&](int q) {
-        if (q < 0 || q >= static_cast<int>(hot_qubit_weights.size())) {
-            return;
-        }
-        double contribution = hot_qubit_weights[q];
-        if (q < static_cast<int>(active.size()) && active[q]) {
-            contribution *= active_discount;
-        }
-        tax += contribution;
-    };
-
-    for (auto [p1, p2] : swaps) {
-        if (p1 < 0 || p1 >= N_ || p2 < 0 || p2 >= N_) {
-            continue;
-        }
-        const int q1 = p2v[p1];
-        const int q2 = p2v[p2];
-        add_q(q1);
-        add_q(q2);
-        std::swap(p2v[p1], p2v[p2]);
-    }
-    return tax;
-}
-
 void SabreRouter::apply_decay_for_swaps(
     const std::vector<std::pair<int,int>>& swaps,
     std::vector<double>& decay
@@ -1286,8 +1184,7 @@ double SabreRouter::score_candidate(
     const std::vector<double>* decay,
     std::vector<std::pair<int,int>>* out_swaps,
     std::vector<int>* out_pi_new,
-    const NeighborInfo* cached_neighbor_info,
-    const std::vector<double>* hot_qubit_weights
+    const NeighborInfo* cached_neighbor_info
 ) const {
     NeighborInfo local_neighbor_info;
     const NeighborInfo* neighbor_ptr;
@@ -1316,14 +1213,6 @@ double SabreRouter::score_candidate(
         1.0,
         decay_factor
     );
-    if (hot_qubit_weights != nullptr && config_.hot_qubit_swap_weight > 0.0) {
-        score += config_.hot_qubit_swap_weight * hot_qubit_swap_tax(
-            swaps,
-            pi,
-            cand,
-            *hot_qubit_weights
-        );
-    }
 
     const int cand_idx = cand.partition_idx;
     double future_score = future_context_cost(
@@ -1541,15 +1430,13 @@ size_t SabreRouter::boundary_beam_select_index(
     const std::vector<double>& scores,
     const std::vector<std::vector<std::pair<int,int>>>& cached_swaps,
     const std::vector<std::vector<int>>& cached_pi,
-    const std::vector<int>& pi,
     const std::vector<int>& F_snapshot,
     const std::vector<uint8_t>& resolved,
     const std::vector<std::vector<int>>& children_graph,
     const std::vector<std::vector<int>>& parents_graph,
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
-    SwapCache* swap_cache,
-    const std::vector<double>& hot_qubit_weights
+    SwapCache* swap_cache
 ) const {
     size_t fallback_idx = 0;
     for (size_t i = 1; i < scores.size(); i++) {
@@ -1584,25 +1471,11 @@ size_t SabreRouter::boundary_beam_select_index(
         size_t first_idx;
     };
 
-    auto transition_cost = [&](
-        const CandidateData& cand,
-        const std::vector<std::pair<int,int>>& swaps,
-        const std::vector<int>& pi_state,
-        const std::vector<double>& hot_weights
-    ) {
-        double cost = routing_objective(
-            static_cast<double>(swaps.size()),
+    auto transition_cost = [&](const CandidateData& cand, size_t idx) {
+        return routing_objective(
+            static_cast<double>(cached_swaps[idx].size()),
             cand.cnot_count
         );
-        if (config_.hot_qubit_swap_weight > 0.0) {
-            cost += config_.hot_qubit_swap_weight * hot_qubit_swap_tax(
-                swaps,
-                pi_state,
-                cand,
-                hot_weights
-            );
-        }
-        return cost;
     };
 
     auto sort_states = [](const BeamState& a, const BeamState& b) {
@@ -1622,12 +1495,7 @@ size_t SabreRouter::boundary_beam_select_index(
             children_graph,
             parents_graph
         );
-        const double trans_cost = transition_cost(
-            cand,
-            cached_swaps[idx],
-            pi,
-            hot_qubit_weights
-        );
+        const double trans_cost = transition_cost(cand, idx);
         states.push_back(BeamState{
             scores[idx],
             trans_cost,
@@ -1668,10 +1536,6 @@ size_t SabreRouter::boundary_beam_select_index(
                 children_graph,
                 parents_graph
             );
-            auto rollout_hot_qubit_weights = build_hot_qubit_weights(
-                state.F,
-                E
-            );
 
             auto rollout_candidates = obtain_partition_candidates(state.F);
             if (rollout_candidates.empty()) {
@@ -1717,14 +1581,11 @@ size_t SabreRouter::boundary_beam_select_index(
                     nullptr,
                     &swaps,
                     &output_perm,
-                    &neighbor_info,
-                    &rollout_hot_qubit_weights
+                    &neighbor_info
                 );
-                const double trans_cost = transition_cost(
-                    *cand,
-                    swaps,
-                    state.pi,
-                    rollout_hot_qubit_weights
+                const double trans_cost = routing_objective(
+                    static_cast<double>(swaps.size()),
+                    cand->cnot_count
                 );
                 const double future_cost = score - trans_cost;
                 const double new_total = state.total_cost + trans_cost;
@@ -1876,7 +1737,6 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
 
         // Generate extended set
         auto E = generate_extended_set(F, resolved, cg, pg);
-        auto hot_qubit_weights = build_hot_qubit_weights(F, E);
 
         // Prefilter with a cheap estimate of the candidate's future context.
         auto candidates = prefilter_candidates(
@@ -1908,8 +1768,7 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
                 F, pi, E, reverse, canonical_data,
                 &swap_cache, &decay,
                 &cached_swaps[ci], &cached_pi[ci],
-                &cached_ni,
-                &hot_qubit_weights
+                &cached_ni
             );
         }
 
@@ -1919,15 +1778,13 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             scores,
             cached_swaps,
             cached_pi,
-            pi,
             F,
             resolved,
             cg,
             pg,
             reverse,
             canonical_data,
-            &swap_cache,
-            hot_qubit_weights
+            &swap_cache
         );
         const auto& best = *candidates[best_ci];
 
@@ -1949,14 +1806,6 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             1.0,
             decay_factor
         );
-        if (config_.hot_qubit_swap_weight > 0.0) {
-            total_cost += config_.hot_qubit_swap_weight * hot_qubit_swap_tax(
-                swaps,
-                pi,
-                best,
-                hot_qubit_weights
-            );
-        }
         if (route_trace) {
             if (!swaps.empty()) {
                 RouteStep swap_step;
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6ce60276d..2cc2255a3 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -149,9 +149,6 @@ def __init__(self, config):
         self.config.setdefault('decay_delta', 0.001)  # Qiskit LightSABRE DECAY_RATE
         self.config.setdefault('swap_burst_budget', 5)  # Qiskit LightSABRE DECAY_RESET_INTERVAL
         self.config.setdefault('path_tiebreak_weight', 0.2)
-        self.config.setdefault('hot_qubit_swap_weight', 0.15)
-        self.config.setdefault('hot_qubit_active_discount', 0.35)
-        self.config.setdefault('hot_qubit_depth_decay', 0.7)
         # The neighbor heuristic is normalized to [0, 1] and added to A*'s f-value.
         # g-deltas are integer and h-deltas are half-integer, so preserving
         # swap-count optimality requires weight < 0.5.
@@ -1439,18 +1436,6 @@ def _run_layout_trials_cpp(
             cfg.three_qubit_exit_weight = self.config.get(
                 'three_qubit_exit_weight', 1.0
             )
-        if hasattr(cfg, 'hot_qubit_swap_weight'):
-            cfg.hot_qubit_swap_weight = self.config.get(
-                'hot_qubit_swap_weight', 0.15
-            )
-        if hasattr(cfg, 'hot_qubit_active_discount'):
-            cfg.hot_qubit_active_discount = self.config.get(
-                'hot_qubit_active_discount', 0.35
-            )
-        if hasattr(cfg, 'hot_qubit_depth_decay'):
-            cfg.hot_qubit_depth_decay = self.config.get(
-                'hot_qubit_depth_decay', 0.7
-            )
         if hasattr(cfg, 'boundary_beam_width'):
             cfg.boundary_beam_width = self.config.get(
                 'boundary_beam_width', 1
@@ -2310,92 +2295,6 @@ def add_edges(target_idx, edge_weight):
             "weight": weight,
         }
 
-    @staticmethod
-    def _build_hot_qubit_weights(
-        F,
-        E,
-        layout_partitions,
-        depth_decay=0.7,
-    ):
-        """Return normalized future-use weights by logical qubit.
-
-        Front-layer partitions count at full weight.  Extended-set partitions
-        are depth-decayed, making repeatedly reused near-future qubits more
-        expensive to move as bystanders.
-        """
-        if layout_partitions is None:
-            return {}
-
-        depth_decay = max(0.0, float(depth_decay))
-        weights = defaultdict(float)
-
-        def add_partition(partition_idx, weight):
-            if (
-                weight <= 0
-                or partition_idx < 0
-                or partition_idx >= len(layout_partitions)
-            ):
-                return
-            if qgd_Partition_Aware_Mapping._partition_is_single(
-                layout_partitions[partition_idx]
-            ):
-                return
-            for qbit in qgd_Partition_Aware_Mapping._partition_involved_qbits(
-                layout_partitions[partition_idx]
-            ):
-                weights[int(qbit)] += weight
-
-        for partition_idx in F:
-            add_partition(int(partition_idx), 1.0)
-
-        if E:
-            for partition_idx, depth in E:
-                add_partition(
-                    int(partition_idx),
-                    depth_decay ** max(0, int(depth)),
-                )
-
-        if not weights:
-            return {}
-        max_weight = max(weights.values())
-        if max_weight <= 0:
-            return {}
-        return {qbit: weight / max_weight for qbit, weight in weights.items()}
-
-    @staticmethod
-    def _hot_qubit_swap_tax(
-        swaps,
-        pi,
-        hot_qubit_weights,
-        active_qbits=None,
-        active_discount=0.35,
-    ):
-        if not swaps or not hot_qubit_weights:
-            return 0.0
-
-        active_discount = min(1.0, max(0.0, float(active_discount)))
-        active = set(int(qbit) for qbit in (active_qbits or ()))
-        pi_list = [int(x) for x in pi]
-        p2v = [None] * len(pi_list)
-        for logical_q, physical_q in enumerate(pi_list):
-            p2v[int(physical_q)] = logical_q
-
-        tax = 0.0
-        for p1, p2 in swaps:
-            p1 = int(p1)
-            p2 = int(p2)
-            q1 = p2v[p1]
-            q2 = p2v[p2]
-            for qbit in (q1, q2):
-                if qbit is None:
-                    continue
-                contribution = float(hot_qubit_weights.get(int(qbit), 0.0))
-                if qbit in active:
-                    contribution *= active_discount
-                tax += contribution
-            p2v[p1], p2v[p2] = q2, q1
-        return tax
-
     def _advance_layout_frontier(
         self,
         selected_partition_idx,
@@ -2440,7 +2339,6 @@ def _boundary_beam_select_index(
         scores,
         cached_swaps,
         cached_pi,
-        pi,
         F_snapshot,
         resolved_partitions,
         DAG,
@@ -2455,7 +2353,6 @@ def _boundary_beam_select_index(
         alpha=1.0,
         cnot_cost=1.0 / 3.0,
         adj=None,
-        hot_qubit_weights=None,
     ):
         """Choose the next candidate by rolling out boundary-layout states.
 
@@ -2476,36 +2373,19 @@ def _boundary_beam_select_index(
         top_k = self.config.get("prefilter_top_k", 50)
         path_weight = self.config.get("path_tiebreak_weight", 0.2)
         three_q_weight = self.config.get("three_qubit_exit_weight", 1.0)
-        hot_weight = self.config.get("hot_qubit_swap_weight", 0.15)
-        hot_discount = self.config.get("hot_qubit_active_discount", 0.35)
-        hot_decay = self.config.get("hot_qubit_depth_decay", 0.7)
 
-        def transition_cost(cand, swaps, pi_state, hot_weights):
-            route_score = self._routing_objective(
+        def transition_cost(cand, swaps):
+            return self._routing_objective(
                 len(swaps or ()),
                 cand.cnot_count,
                 cnot_cost,
             )
-            if hot_weight <= 0:
-                return route_score
-            return route_score + hot_weight * self._hot_qubit_swap_tax(
-                swaps,
-                pi_state,
-                hot_weights,
-                active_qbits=cand.involved_qbits,
-                active_discount=hot_discount,
-            )
 
         states = []
         for idx, cand in enumerate(partition_candidates):
             if cached_pi[idx] is None:
                 continue
-            trans_cost = transition_cost(
-                cand,
-                cached_swaps[idx],
-                pi,
-                hot_qubit_weights,
-            )
+            trans_cost = transition_cost(cand, cached_swaps[idx])
             F_next, resolved_next = self._advance_layout_frontier(
                 cand.partition_idx,
                 F_snapshot,
@@ -2551,12 +2431,6 @@ def transition_cost(cand, swaps, pi_state, hot_weights):
                     max_E_size=max_E_size,
                     max_lookahead=max_lookahead,
                 )
-                rollout_hot_weights = self._build_hot_qubit_weights(
-                    F_state,
-                    E,
-                    optimized_partitions,
-                    depth_decay=hot_decay,
-                )
                 candidates = self.obtain_partition_candidates(
                     F_list,
                     optimized_partitions,
@@ -2614,16 +2488,8 @@ def transition_cost(cand, swaps, pi_state, hot_weights):
                         layout_partitions=optimized_partitions,
                         return_transforms=True,
                         three_qubit_exit_weight=three_q_weight,
-                        hot_qubit_weights=rollout_hot_weights,
-                        hot_qubit_swap_weight=hot_weight,
-                        hot_qubit_active_discount=hot_discount,
-                    )
-                    trans_cost = transition_cost(
-                        cand,
-                        swaps,
-                        pi_state,
-                        rollout_hot_weights,
                     )
+                    trans_cost = transition_cost(cand, swaps)
                     future_cost = float(score) - trans_cost
                     new_total = total_cost + trans_cost
                     rank_cost = new_total + future_cost
@@ -2761,12 +2627,6 @@ def Heuristic_Search(
                 max_E_size=max_E_size,
                 max_lookahead=max_lookahead,
             )
-            hot_qubit_weights = self._build_hot_qubit_weights(
-                F_snapshot,
-                E,
-                optimized_partitions,
-                depth_decay=self.config.get("hot_qubit_depth_decay", 0.7),
-            )
 
             partition_candidates = self.obtain_partition_candidates(
                 F,
@@ -2840,13 +2700,6 @@ def Heuristic_Search(
                     three_qubit_exit_weight=self.config.get(
                         "three_qubit_exit_weight", 1.0
                     ),
-                    hot_qubit_weights=hot_qubit_weights,
-                    hot_qubit_swap_weight=self.config.get(
-                        "hot_qubit_swap_weight", 0.15
-                    ),
-                    hot_qubit_active_discount=self.config.get(
-                        "hot_qubit_active_discount", 0.35
-                    ),
                 )
                 scores[ci] = score
                 cached_swaps[ci] = swaps
@@ -2857,7 +2710,6 @@ def Heuristic_Search(
                 scores,
                 cached_swaps,
                 cached_pi,
-                pi,
                 F_snapshot,
                 resolved_partitions,
                 DAG,
@@ -2871,7 +2723,6 @@ def Heuristic_Search(
                 alpha=E_alpha,
                 cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0),
                 adj=self._adj,
-                hot_qubit_weights=hot_qubit_weights,
             )
             min_partition_candidate = partition_candidates[best_idx]
 
@@ -2974,9 +2825,6 @@ def _heuristic_search_layout_only(
         E_alpha = self.config.get("E_alpha", 1.0)
         cnot_cost = self.config.get("cnot_cost", 1.0 / 3.0)
         swap_burst_budget = self.config.get("swap_burst_budget", 5)
-        hot_weight = self.config.get("hot_qubit_swap_weight", 0.15)
-        hot_discount = self.config.get("hot_qubit_active_discount", 0.35)
-        hot_decay = self.config.get("hot_qubit_depth_decay", 0.7)
 
         canonical_data = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=reverse
@@ -3015,12 +2863,6 @@ def _heuristic_search_layout_only(
                 max_E_size=max_E_size,
                 max_lookahead=max_lookahead,
             )
-            hot_qubit_weights = self._build_hot_qubit_weights(
-                F_snapshot,
-                E,
-                optimized_partitions,
-                depth_decay=hot_decay,
-            )
 
             partition_candidates = self.obtain_partition_candidates(
                 F,
@@ -3096,9 +2938,6 @@ def _heuristic_search_layout_only(
                     three_qubit_exit_weight=self.config.get(
                         "three_qubit_exit_weight", 1.0
                     ),
-                    hot_qubit_weights=hot_qubit_weights,
-                    hot_qubit_swap_weight=hot_weight,
-                    hot_qubit_active_discount=hot_discount,
                 )
                 scores[ci] = score
                 cached_swaps[ci] = swaps
@@ -3109,7 +2948,6 @@ def _heuristic_search_layout_only(
                 scores,
                 cached_swaps,
                 cached_pi,
-                pi,
                 F_snapshot,
                 resolved_partitions,
                 DAG,
@@ -3124,14 +2962,12 @@ def _heuristic_search_layout_only(
                 alpha=E_alpha,
                 cnot_cost=cnot_cost,
                 adj=self._adj,
-                hot_qubit_weights=hot_qubit_weights,
             )
             best = partition_candidates[best_idx]
             F.remove(best.partition_idx)
             resolved_partitions[best.partition_idx] = True
 
-            swaps = cached_swaps[best_idx]
-            pi_next = cached_pi[best_idx]
+            swaps, pi = cached_swaps[best_idx], cached_pi[best_idx]
             decay_factor = 1.0
             if swaps:
                 decay_factor = self._decay_factor_for_swaps(swaps, decay)
@@ -3141,15 +2977,6 @@ def _heuristic_search_layout_only(
                 cnot_cost,
                 decay_factor=decay_factor,
             )
-            if hot_weight > 0:
-                total_cost += hot_weight * self._hot_qubit_swap_tax(
-                    swaps,
-                    pi,
-                    hot_qubit_weights,
-                    active_qbits=best.involved_qbits,
-                    active_discount=hot_discount,
-                )
-            pi = pi_next
             if swaps:
                 self._apply_decay_for_swaps(swaps, decay)
                 swap_heavy_partitions += 1
@@ -3277,10 +3104,7 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
                                   candidate_cache=None,
                                   layout_partitions=None,
                                   return_transforms=False,
-                                  three_qubit_exit_weight=1.0,
-                                  hot_qubit_weights=None,
-                                  hot_qubit_swap_weight=0.0,
-                                  hot_qubit_active_discount=0.35):
+                                  three_qubit_exit_weight=1.0):
         """LightSABRE-style relative scoring (arXiv:2409.08368, eq. 1).
 
         H = |swaps|
@@ -3321,17 +3145,6 @@ def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D,
             cnot_cost,
             decay_factor=decay_factor,
         )
-        if hot_qubit_swap_weight > 0:
-            score += (
-                float(hot_qubit_swap_weight)
-                * qgd_Partition_Aware_Mapping._hot_qubit_swap_tax(
-                    swaps,
-                    pi,
-                    hot_qubit_weights,
-                    active_qbits=partition_candidate.involved_qbits,
-                    active_discount=hot_qubit_active_discount,
-                )
-            )
 
         if candidate_cache is None:
             if return_transforms:
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index ca20b8105..95e8d4630 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -135,9 +135,6 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("swap_burst_budget", &SabreConfig::swap_burst_budget)
         .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
         .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight)
-        .def_readwrite("hot_qubit_swap_weight", &SabreConfig::hot_qubit_swap_weight)
-        .def_readwrite("hot_qubit_active_discount", &SabreConfig::hot_qubit_active_discount)
-        .def_readwrite("hot_qubit_depth_decay", &SabreConfig::hot_qubit_depth_decay)
         .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width)
         .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth);
 

From c41b8c51c8fd4935e475a857ef27ea2725fc355f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 01:59:29 +0200
Subject: [PATCH 181/232] cleanup changes

---
 squander/decomposition/qgd_Wide_Circuit_Optimization.py | 6 +-----
 squander/synthesis/PartAM.py                            | 5 +++++
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 2a2cd710d..2a2d9853e 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -2328,11 +2328,7 @@ def OptimizeWideCircuit(
             if self.config["verbosity"] >= 1:
                 print("Optimizing circuit with Squander")
             part_size_start = self.max_partition_size
-            part_size_end = self.max_partition_size
-            if self.config.get("use_osr", False) or self.config.get(
-                "use_graph_search", False
-            ):
-                part_size_end = min(3, circ.get_Qbit_Num())
+            part_size_end = self.config.get("part_size_end",self.max_partition_size)
             count = CNOTGateCount(circ, 0)
             fingerprint_dict = {}
             for max_part_size in range(part_size_start, part_size_end + 1):
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2cc2255a3..bc3521fd9 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1733,6 +1733,11 @@ def Partition_Aware_Mapping(
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
                 cleanup_config['global_min'] = True
+                cleanup_config['use_osr'] = 1
+                cleanup_config['use_graph_search'] = 1
+                cleanup_config['part_size_end'] = 4
+                cleanup_config['max_partition_size'] = 4
+
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
                 saved_sq_circuits = self._snapshot_single_qubit_circuits(

From a50133ecc451494accdd7abc50254d017507e924 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 02:01:27 +0200
Subject: [PATCH 182/232] remove benchmark

---
 benchmark_PartAM.py | 184 --------------------------------------------
 1 file changed, 184 deletions(-)
 delete mode 100644 benchmark_PartAM.py

diff --git a/benchmark_PartAM.py b/benchmark_PartAM.py
deleted file mode 100644
index 6d7605781..000000000
--- a/benchmark_PartAM.py
+++ /dev/null
@@ -1,184 +0,0 @@
-"""
-Benchmark PartAM cleanup phase per circuit.
-
-Runs each circuit 5 times with PartAM (cleanup=True) and records:
-  - qubit count
-  - initial CNOT count (original QASM circuit)
-  - CNOT count before cleanup (post-synthesis, pre-cleanup)
-  - CNOT count after cleanup (final)
-  - decomposition error
-  - compilation time (seconds)
-
-Results are exported to benchmark_PartAM.csv.
-
-Usage:
-    conda activate qgd
-    python benchmark_PartAM.py
-"""
-
-import numpy as np
-import time
-import os
-import glob
-import csv
-import random
-
-from squander import Partition_Aware_Mapping
-from squander import utils
-from squander import Circuit
-
-N_RUNS = 3
-OUTPUT_CSV = "benchmark_PartAM_layout.csv"
-
-
-def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
-    num_qubits = circ.get_Qbit_Num()
-    matrix_size = 1 << num_qubits
-    rng = np.random.RandomState(0)
-    initial_state = rng.uniform(-1, 1, (matrix_size,)) + 1j * rng.uniform(-1, 1, (matrix_size,))
-    initial_state /= np.linalg.norm(initial_state)
-
-    original_state = initial_state.copy()
-    circ_orig.apply_to(parameters_orig, original_state)
-
-    circ_Final = Circuit(num_qubits)
-    output_perm_T = [0] * num_qubits
-    for i, j in enumerate(output_perm):
-        output_perm_T[j] = i
-    circ_Final.add_Permutation([int(x) for x in input_perm])
-    circ_Final.add_Circuit(circ)
-    circ_Final.add_Permutation(output_perm_T)
-
-    state = initial_state.copy()
-    circ_Final.apply_to(params, state)
-    return 1 - abs(np.vdot(state, original_state))
-
-
-def make_linear_topology(n_qubits):
-    return [(i, i + 1) for i in range(n_qubits - 1)]
-
-
-def run_once(circ_orig, parameters_orig, topology):
-    config = {
-        'strategy': "TreeSearch",
-        'test_subcircuits': False,
-        'test_final_circuit': False,
-        'max_partition_size': 3,
-        'progressbar': False,
-        'topology': topology,
-        'verbosity': 0,
-        'cleanup': True,
-        'sabre_iterations':20,
-        'n_layout_trials':128,
-        'random_seed':random.randint(1,100),
-        # Cheap candidate prefilter before full A* scoring.
-        'prefilter_top_k': 50,
-        'prefilter_min_per_partition': 2,
-        'prefilter_min_3q': 12,
-        # Rank every layout trial by actual constructed routing, not only by
-        # the heuristic trial cost.
-        'actual_routing_rank_top_k': None,
-        'top_k_pi': 1,
-        # Boundary-state beam routing is a Python prototype on this branch.
-        # Set width/depth to 1 to recover the greedy router.
-        'boundary_beam_width': 4,
-        'boundary_beam_depth': 3,
-        'cnot_cost': 0.5 / 3.0,  # old: swap_cost=15, local_cost_weight=1.0 -> 15:1 swap:cnot
-        'cleanup_top_k': 3,
-        "parallel_layout_trials": True,
-        "layout_trial_workers": 0,
-        'max_E_size': 40,
-        'max_lookahead': 6,
-        'E_weight': 0.3,
-        'E_alpha': 1.0,        # LightSABRE-style uniform lookahead (no per-depth decay)
-        'decay_delta': 0.001,
-        'swap_burst_budget': 5,
-        'path_tiebreak_weight': 0.2,
-        'three_qubit_exit_weight': 1.5,
-        'routing_aware_partitioning': True,
-        'sparse_penalty': 3.0,
-        'two_pair_3q_penalty': 1.5,
-        'triangle_free_3q_penalty': 1.0,
-        'three_qubit_reuse_discount': 0.15,
-        'three_qubit_reuse_discount_cap': 1.0,
-    }
-
-    # Clean the initial circuit using the same config pattern as in PartAM.py
-    from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization
-    cleanup_config = dict(config)
-    cleanup_config['topology'] = None
-    cleanup_config['routed'] = False
-    cleanup_config['test_subcircuits'] = False
-    cleanup_config['test_final_circuit'] = False
-    cleanup_config['global_min'] = True
-    cleanup_config['pre-opt-strategy'] = 'TreeSearch'
-
-    wco = qgd_Wide_Circuit_Optimization(cleanup_config)
-    #circ_orig, parameters_orig = wco.OptimizeWideCircuit(circ_orig.get_Flat_Circuit(), parameters_orig)
-
-    start = time.time()
-    pam = Partition_Aware_Mapping(config)
-    circ, params, pi_in, pi_out = pam.Partition_Aware_Mapping(circ_orig.get_Flat_Circuit(), parameters_orig)
-    elapsed = time.time() - start
-    routing_time = pam._routing_time
-    cnot_before_cleanup = pam._cnot_pre_cleanup
-    cnot_after_cleanup = circ.get_Gate_Nums().get('CNOT', 0)
-    error = validate_result(circ_orig, parameters_orig, circ, params, pi_in, pi_out)
-
-    return cnot_before_cleanup, cnot_after_cleanup, error, elapsed, routing_time
-
-
-if __name__ == '__main__':
-    circs_dir = "circs"
-    qasm_files = sorted(glob.glob(os.path.join(circs_dir, "*.qasm")))
-
-    if not qasm_files:
-        print(f"No .qasm files found in {circs_dir}/")
-        exit(1)
-
-    print(f"Found {len(qasm_files)} circuits in {circs_dir}/")
-    print(f"Running {N_RUNS} times per circuit (cleanup=True)\n")
-
-    fieldnames = [
-        'circuit', 'n_qubits', 'run',
-        'initial_cnot', 'cnot_pre_cleanup', 'cnot_post_cleanup',
-        'error', 'time_s','routing_time_s'
-    ]
-
-    # Open CSV once and flush after each circuit so partial results are never lost
-    with open(OUTPUT_CSV, 'w', newline='') as f:
-        writer = csv.DictWriter(f, fieldnames=fieldnames)
-        writer.writeheader()
-
-        for filepath in qasm_files:
-            name = os.path.basename(filepath)
-            print(f"{'='*70}")
-            print(f"Circuit: {name}")
-
-            circ_orig, parameters_orig, _ = utils.qasm_to_squander_circuit(filepath)
-            n_qubits = circ_orig.get_Qbit_Num()
-            topology = make_linear_topology(n_qubits)
-
-            initial_cnot = circ_orig.get_Gate_Nums().get('CNOT', 0)
-            print(f"Qubits: {n_qubits}, Initial CNOTs: {initial_cnot}")
-            print(f"{'Run':>4} {'Pre-cleanup':>12} {'Post-cleanup':>12} {'Error':>12} {'Time(s)':>10} {'Routing time(s)':>10}")
-
-            for run_idx in range(N_RUNS):
-                cnot_pre, cnot_post, error, elapsed, routing_time = run_once(circ_orig, parameters_orig, topology)
-                print(f"{run_idx:>4} {cnot_pre:>12} {cnot_post:>12} {error:>12.2e} {elapsed:>10.1f} {routing_time:>10.1f}")
-                writer.writerow({
-                    'circuit': name,
-                    'n_qubits': n_qubits,
-                    'run': run_idx,
-                    'initial_cnot': initial_cnot,
-                    'cnot_pre_cleanup': cnot_pre,
-                    'cnot_post_cleanup': cnot_post,
-                    'error': error,
-                    'time_s': round(elapsed, 3),
-                    'routing_time_s': round(routing_time,3)
-                })
-                f.flush()
-
-            print()
-
-    print(f"Results saved to {OUTPUT_CSV}")

From c9646363fb3f9da05ce0ec683d8a977e3de6e6f2 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 12:22:28 +0200
Subject: [PATCH 183/232] fix cleanup

---
 squander/synthesis/PartAM.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index bc3521fd9..70e02a888 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1736,7 +1736,7 @@ def Partition_Aware_Mapping(
                 cleanup_config['use_osr'] = 1
                 cleanup_config['use_graph_search'] = 1
                 cleanup_config['part_size_end'] = 4
-                cleanup_config['max_partition_size'] = 4
+                cleanup_config['max_partition_size'] = 3
 
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 

From fc68b1cd189ff8770782fc70a398c1bf23665862 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 12:51:07 +0200
Subject: [PATCH 184/232] better cleanup

---
 squander/synthesis/PartAM.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 70e02a888..c885cb2f9 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1733,9 +1733,9 @@ def Partition_Aware_Mapping(
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
                 cleanup_config['global_min'] = True
-                cleanup_config['use_osr'] = 1
-                cleanup_config['use_graph_search'] = 1
-                cleanup_config['part_size_end'] = 4
+                cleanup_config['use_osr'] = 0
+                cleanup_config['use_graph_search'] = 0
+                cleanup_config['part_size_end'] = 3
                 cleanup_config['max_partition_size'] = 3
 
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
@@ -1812,8 +1812,19 @@ def Partition_Aware_Mapping(
                         best_routing_swap_cnot = trial_routing_cnot
                         best_partition_body_cnot = trial_partition_cnot
 
-                final_circuit = best_circuit
-                final_parameters = best_params
+                final_cleanup_config = dict(cleanup_config)
+                final_cleanup_config['use_osr'] = 1
+                final_cleanup_config['use_graph_search'] = 1
+                final_cleanup_config['part_size_end'] = 4
+
+                wco = qgd_Wide_Circuit_Optimization(final_cleanup_config)
+
+                cleanup_t0 = time.time()
+                final_circuit, final_parameters = wco.OptimizeWideCircuit(
+                    best_circuit.get_Flat_Circuit(),
+                    best_params,
+                )
+                cleanup_total += time.time() - cleanup_t0
                 pi_initial = best_pi_init
                 pi = best_pi
                 routing_swap_cnot = best_routing_swap_cnot

From 54a43d1682512d91915530cc759b4623784aaf2a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 13:32:32 +0200
Subject: [PATCH 185/232] fix cleanup time

---
 squander/synthesis/PartAM.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index c885cb2f9..ca7008d62 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1671,6 +1671,8 @@ def Partition_Aware_Mapping(
         routing_start = time.time()
         routing_swap_cnot = 0
         partition_body_cnot = 0
+        routing_elapsed_before_cleanup = None
+        cleanup_total = 0.0
 
         if n_iterations == 0:
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
@@ -1721,6 +1723,7 @@ def Partition_Aware_Mapping(
                 candidate_cache,
                 rank_top_k=actual_rank_top_k,
             )
+            routing_elapsed_before_cleanup = time.time() - routing_start
 
             if do_cleanup:
                 from squander.decomposition.qgd_Wide_Circuit_Optimization import (
@@ -1755,7 +1758,6 @@ def Partition_Aware_Mapping(
                 best_pre_cleanup = None
                 best_routing_swap_cnot = 0
                 best_partition_body_cnot = 0
-                cleanup_total = 0.0
 
                 for _, trial_pi, _, trace_pi_init, route_steps in top_layouts:
                     self._restore_single_qubit_circuits(
@@ -1797,10 +1799,10 @@ def Partition_Aware_Mapping(
                         trial_circuit.get_Flat_Circuit(),
                         trial_params,
                     )
-                    cleanup_total += time.time() - cleanup_t0
                     cleaned_cost = cleaned_circuit.get_Gate_Nums().get(
                         'CNOT', 0
                     )
+                    cleanup_total += time.time() - cleanup_t0
 
                     if cleaned_cost < best_cost:
                         best_cost = cleaned_cost
@@ -1865,10 +1867,12 @@ def Partition_Aware_Mapping(
                 )
 
         if do_cleanup and n_iterations > 0:
-            self._routing_time = time.time() - routing_start - cleanup_total
+            self._routing_time = routing_elapsed_before_cleanup
+            self._cleanup_time = cleanup_total
             self._cnot_pre_cleanup = best_pre_cleanup
         else:
             self._routing_time = time.time() - routing_start
+            self._cleanup_time = 0.0
             self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get(
                 'CNOT', 0
             )

From 7fb938062d184f9102bfb7947ebd360fde0973cb Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 21:54:00 +0200
Subject: [PATCH 186/232] Add in OSR compression

---
 CMakeLists.txt                                |    1 +
 .../qgd_N_Qubit_Decompositions_Wrapper.cpp    |   51 +-
 .../qgd_WideCircuitCompression.py             |  267 +++
 .../N_Qubit_Decomposition_OSR_Compression.cpp | 1628 +++++++++++++++++
 .../N_Qubit_Decomposition_OSR_Compression.h   |  182 ++
 5 files changed, 2128 insertions(+), 1 deletion(-)
 create mode 100644 squander/decomposition/qgd_WideCircuitCompression.py
 create mode 100644 squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
 create mode 100644 squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 839eeea81..6df853985 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -833,6 +833,7 @@ list(APPEND qgd_files
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/Sub_Matrix_Decomposition.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tabu_Search.cpp
+    ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/variational_quantum_eigensolver/Variational_Quantum_Eigensolver_Base.cpp  
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/variational_quantum_eigensolver/Generative_Quantum_Machine_Learning_Base.cpp  
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/density_matrix/density_matrix.cpp
diff --git a/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp b/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
index 2e36cc3f7..950137356 100644
--- a/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
+++ b/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
@@ -26,6 +26,7 @@
 #include "N_Qubit_Decomposition_adaptive.h"
 #include "N_Qubit_Decomposition_Tree_Search.h"
 #include "N_Qubit_Decomposition_Tabu_Search.h"
+#include "N_Qubit_Decomposition_OSR_Compression.h"
 #include "Gates_block.h"
 
 /**
@@ -320,6 +321,43 @@ qgd_N_Qubit_Decomposition_Tabu_Search_Wrapper_init(qgd_N_Qubit_Decomposition_Wra
     return search_wrapper_init<N_Qubit_Decomposition_Tabu_Search>(self, args, kwds);
 }
 
+static int
+qgd_N_Qubit_Decomposition_OSR_Compression_Wrapper_init(qgd_N_Qubit_Decomposition_Wrapper* self, PyObject* args, PyObject* kwds)
+{
+    static char* kwlist[] = {
+        (char*)"Umtx", (char*)"qbit_num", (char*)"config", (char*)"accelerator_num",
+        (char*)"topology", NULL
+    };
+
+    PyObject *Umtx_arg = NULL, *config_arg = NULL, *topology_arg = NULL;
+    int qbit_num = -1, accelerator_num = 0;
+
+    if (!PyArg_ParseTupleAndKeywords(
+        args, kwds, "O|iOiO", kwlist,
+        &Umtx_arg, &qbit_num, &config_arg, &accelerator_num, &topology_arg)
+    ) {
+        return -1;
+    }
+
+    try {
+        Matrix Umtx_mtx = extract_matrix(Umtx_arg, &self->Umtx);
+        if (qbit_num == -1) {
+            qbit_num = (int)std::round(std::log2(Umtx_mtx.rows));
+        }
+
+        auto config = extract_config(config_arg);
+        auto topology_cpp = extract_topology(topology_arg);
+
+        self->decomp = new N_Qubit_Decomposition_OSR_Compression(
+            Umtx_mtx, qbit_num, topology_cpp, config, accelerator_num);
+
+        return 0;
+    } catch (const std::exception& e) {
+        PyErr_SetString(PyExc_Exception, e.what());
+        return -1;
+    }
+}
+
 /**
  * @brief Deallocate decomposition instance
  */
@@ -3063,6 +3101,14 @@ static PyMethodDef qgd_N_Qubit_Decomposition_Tree_Search_methods[] = {
     {NULL}
 };
 
+/**
+@brief Method table for N_Qubit_Decomposition_OSR_Compression
+*/
+static PyMethodDef qgd_N_Qubit_Decomposition_OSR_Compression_methods[] = {
+    DECOMPOSITION_WRAPPER_BASE_METHODS
+    {NULL}
+};
+
 #define decomposition_wrapper_type_template(decomp_class) \
 static PyTypeObject qgd_##decomp_class##_Wrapper_Type = { \
     PyVarObject_HEAD_INIT(NULL, 0) \
@@ -3121,6 +3167,7 @@ decomposition_wrapper_type_template(N_Qubit_Decomposition_adaptive)
 decomposition_wrapper_type_template(N_Qubit_Decomposition_custom)
 decomposition_wrapper_type_template(N_Qubit_Decomposition_Tree_Search)
 decomposition_wrapper_type_template(N_Qubit_Decomposition_Tabu_Search)
+decomposition_wrapper_type_template(N_Qubit_Decomposition_OSR_Compression)
 
 //////////////////////////////////////////////////////////////////
 
@@ -3162,7 +3209,8 @@ PyInit_qgd_N_Qubit_Decompositions_Wrapper(void)
         PyType_Ready(&qgd_N_Qubit_Decomposition_adaptive_Wrapper_Type) < 0 ||
         PyType_Ready(&qgd_N_Qubit_Decomposition_custom_Wrapper_Type) < 0 ||
         PyType_Ready(&qgd_N_Qubit_Decomposition_Tree_Search_Wrapper_Type) < 0 ||
-        PyType_Ready(&qgd_N_Qubit_Decomposition_Tabu_Search_Wrapper_Type) < 0) {
+        PyType_Ready(&qgd_N_Qubit_Decomposition_Tabu_Search_Wrapper_Type) < 0 ||
+        PyType_Ready(&qgd_N_Qubit_Decomposition_OSR_Compression_Wrapper_Type) < 0) {
         return NULL;
     }
 
@@ -3175,6 +3223,7 @@ PyInit_qgd_N_Qubit_Decompositions_Wrapper(void)
     Py_INCREF_template(N_Qubit_Decomposition_custom);
     Py_INCREF_template(N_Qubit_Decomposition_Tree_Search);
     Py_INCREF_template(N_Qubit_Decomposition_Tabu_Search);
+    Py_INCREF_template(N_Qubit_Decomposition_OSR_Compression);
 
     return m;
 }
diff --git a/squander/decomposition/qgd_WideCircuitCompression.py b/squander/decomposition/qgd_WideCircuitCompression.py
new file mode 100644
index 000000000..ec7e32578
--- /dev/null
+++ b/squander/decomposition/qgd_WideCircuitCompression.py
@@ -0,0 +1,267 @@
+"""
+Wide-circuit compression: partition large circuits into subcircuits and run
+OSR-guided gate-structure compression on each partition.
+"""
+
+from squander import N_Qubit_Decomposition_OSR_Compression
+from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
+from squander.utils import CompareCircuits
+
+from squander.partitioning.partition import PartitionCircuit
+from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+    CNOT_COUNT_DICT,
+    CNOTGateCount,
+    extract_subtopology,
+    qgd_Wide_Circuit_Optimization,
+)
+
+import numpy as np
+import multiprocessing as mp
+from multiprocessing import Pool, parent_process
+import os, contextlib, time
+
+from typing import List, Tuple, Optional, cast
+
+
+class qgd_WideCircuitCompression:
+    """Optimize wide circuits by partitioning and per-partition OSR compression.
+
+    Each partition is treated as a fixed gate structure. The OSR compression
+    decomposer attempts to remove entangling gates while still reproducing the
+    partition's unitary within the configured tolerance. If compression fails,
+    the original partition is kept.
+    """
+
+    def __init__(self, config):
+        config.setdefault("parallel", 0)
+        config.setdefault("verbosity", 0)
+        config.setdefault("tolerance", 1e-8)
+        config.setdefault("test_subcircuits", False)
+        config.setdefault("test_final_circuit", True)
+        config.setdefault("max_partition_size", 3)
+        config.setdefault("partition_strategy", "ilp")
+        config.setdefault("topology", None)
+
+        if config["parallel"] not in (0, 1, 2):
+            raise Exception(
+                f"The parallel configuration should be either of [0, 1, 2], got {config['parallel']}."
+            )
+        if not isinstance(config["verbosity"], int):
+            raise Exception("The verbosity parameter should be an integer.")
+        if not isinstance(config["tolerance"], float):
+            raise Exception("The tolerance parameter should be a float.")
+        if not isinstance(config["test_subcircuits"], bool):
+            raise Exception("The test_subcircuits parameter should be a bool.")
+        if not isinstance(config["test_final_circuit"], bool):
+            raise Exception("The test_final_circuit parameter should be a bool.")
+        if not isinstance(config["max_partition_size"], int):
+            raise Exception("The max_partition_size parameter should be an integer.")
+
+        self.config = config
+        self.max_partition_size = config["max_partition_size"]
+
+    @staticmethod
+    def CompressPartition(
+        subcircuit: Circuit,
+        subcircuit_parameters: np.ndarray,
+        config: dict,
+    ) -> Tuple[Circuit, np.ndarray]:
+        """Run OSR compression on a single partition subcircuit.
+
+        Returns the compressed circuit (remapped to the original wide register)
+        and its parameters. Falls back to the original subcircuit on failure.
+        """
+        qbit_num_orig = subcircuit.get_Qbit_Num()
+        involved = subcircuit.get_Qbits()
+        qbit_num = len(involved)
+
+        qbit_map = {q: i for i, q in enumerate(involved)}
+        remapped = subcircuit.Remap_Qbits(qbit_map, qbit_num)
+
+        # restrict OSR mutations to topology edges that survive the partition
+        local_config = dict(config)
+        if config.get("topology") is not None:
+            mini_topology = extract_subtopology(involved, qbit_map, config)
+            local_config.setdefault("osr_compression_mutate_full_topology", 0)
+        else:
+            mini_topology = None
+
+        # the partition unitary is the OSR target
+        unitary = remapped.get_Matrix(subcircuit_parameters)
+
+        cDecompose = N_Qubit_Decomposition_OSR_Compression(
+            unitary.conj().T,
+            qbit_num=qbit_num,
+            config=local_config,
+            accelerator_num=0,
+            topology=mini_topology,
+        )
+        cDecompose.set_Verbose(config["verbosity"])
+        cDecompose.set_Cost_Function_Variant(3)
+        cDecompose.set_Optimization_Tolerance(config["tolerance"])
+        cDecompose.set_Optimizer("BFGS")
+
+        # supply the existing structure + warm-start parameters
+        cDecompose.set_Gate_Structure(remapped)
+        cDecompose.set_Optimized_Parameters(subcircuit_parameters)
+
+        try:
+            cDecompose.Start_Decomposition()
+        except Exception:
+            return subcircuit, subcircuit_parameters
+
+        new_circ = cDecompose.get_Circuit()
+        new_params = cDecompose.get_Optimized_Parameters()
+        err = cDecompose.get_Decomposition_Error()
+
+        if err > config["tolerance"]:
+            return subcircuit, subcircuit_parameters
+
+        inverse_map = {v: k for k, v in qbit_map.items()}
+        new_circ = new_circ.Remap_Qbits(inverse_map, qbit_num_orig).get_Flat_Circuit()
+
+        if config["test_subcircuits"]:
+            CompareCircuits(
+                subcircuit,
+                subcircuit_parameters,
+                new_circ,
+                new_params,
+                parallel=config["parallel"],
+            )
+
+        return new_circ, new_params
+
+    def InnerCompressWideCircuit(
+        self, circ: Circuit, parameters: np.ndarray
+    ) -> Tuple[Circuit, np.ndarray]:
+        """Single pass: partition ``circ``, OSR-compress each partition, stitch."""
+        from squander.utils import circuit_to_CNOT_basis
+
+        circ, parameters = circuit_to_CNOT_basis(circ, parameters)
+
+        partitioned_circuit, parameters, _ = PartitionCircuit(
+            circ,
+            parameters,
+            self.max_partition_size,
+            strategy=self.config["partition_strategy"],
+        )
+        subcircuits = partitioned_circuit.get_Gates()
+
+        in_parent = parent_process() is not None
+        if not in_parent and self.config["verbosity"] >= 1:
+            print(len(subcircuits), "partitions to compress")
+
+        optimized_subcircuits: List[Optional[Circuit]] = [None] * len(subcircuits)
+        optimized_parameter_list: List[Optional[np.ndarray]] = [None] * len(subcircuits)
+
+        max_gates = sum(
+            y for x, y in circ.get_Gate_Nums().items() if x not in CNOT_COUNT_DICT
+        )
+
+        slices = []
+        for sub in subcircuits:
+            start = sub.get_Parameter_Start_Index()
+            slices.append(parameters[start : start + sub.get_Parameter_Num()])
+
+        nproc = (
+            len(os.sched_getaffinity(0))
+            if hasattr(os, "sched_getaffinity")
+            else mp.cpu_count()
+        )
+        with (
+            contextlib.nullcontext() if in_parent else Pool(processes=nproc)
+        ) as pool:
+            async_results = []
+            for idx, sub in enumerate(subcircuits):
+                args = (sub, slices[idx], self.config)
+                if in_parent:
+                    async_results.append(args)
+                else:
+                    async_results.append(pool.apply_async(self.CompressPartition, args))
+
+            for idx, ar in enumerate(async_results):
+                if in_parent:
+                    new_sub, new_p = self.CompressPartition(*ar)
+                else:
+                    new_sub, new_p = ar.get(timeout=None)
+
+                orig_score = CNOTGateCount(subcircuits[idx], max_gates)
+                new_score = CNOTGateCount(new_sub, max_gates)
+                if new_score < orig_score:
+                    optimized_subcircuits[idx] = new_sub
+                    optimized_parameter_list[idx] = new_p
+                    if self.config["verbosity"] >= 2:
+                        print(
+                            f"partition {idx}: {subcircuits[idx].get_Gate_Nums()} -> {new_sub.get_Gate_Nums()}"
+                        )
+                else:
+                    optimized_subcircuits[idx] = subcircuits[idx]
+                    optimized_parameter_list[idx] = slices[idx]
+
+                if self.config["verbosity"] >= 1 and (idx + 1) % 100 == 0:
+                    print(idx + 1, "partitions compressed")
+
+        wide_parameters = np.concatenate(
+            cast(List[np.ndarray], optimized_parameter_list), axis=0
+        )
+        wide_circuit = Circuit(circ.get_Qbit_Num())
+        for c in cast(List[Circuit], optimized_subcircuits):
+            wide_circuit.add_Circuit(c)
+
+        assert wide_circuit.get_Parameter_Num() == wide_parameters.size, (
+            f"Mismatch in parameter counts: "
+            f"{wide_circuit.get_Parameter_Num()} vs {wide_parameters.size}"
+        )
+
+        if not in_parent and self.config["verbosity"] >= 1:
+            print("original circuit:   ", circ.get_Gate_Nums())
+            print("compressed circuit: ", wide_circuit.get_Gate_Nums())
+
+        qgd_Wide_Circuit_Optimization.check_valid_routing(
+            wide_circuit, self.config["topology"]
+        )
+        if self.config["verbosity"] >= 2:
+            print("InnerCompressWideCircuit: check_compare_circuits")
+        if self.config["test_final_circuit"]:
+            CompareCircuits(circ, parameters, wide_circuit, wide_parameters)
+
+        return wide_circuit, wide_parameters
+
+    def CompressWideCircuit(
+        self, circ: Circuit, parameters: np.ndarray
+    ) -> Tuple[Circuit, np.ndarray]:
+        """Top-level: sweep partition sizes, repeat each pass until no improvement.
+
+        Mirrors the outer loop of ``qgd_Wide_Circuit_Optimization.OptimizeWideCircuit``
+        for the Squander branch. Records ``self.config['compression_time']``.
+        Requires the input circuit to already respect ``config['topology']``;
+        no routing is performed.
+        """
+        if not qgd_Wide_Circuit_Optimization.is_valid_routing(
+            circ, self.config["topology"]
+        ):
+            raise Exception(
+                "Input circuit does not respect the configured topology; "
+                "qgd_WideCircuitCompression does not perform routing."
+            )
+
+        start_time = time.time()
+        part_size_start = self.max_partition_size
+        part_size_end = self.config.get("part_size_end", self.max_partition_size)
+
+        count = CNOTGateCount(circ, 0)
+        for max_part_size in range(part_size_start, part_size_end + 1):
+            inner = qgd_WideCircuitCompression(
+                {**self.config, "max_partition_size": max_part_size}
+            )
+            while True:
+                circ_flat, parameters = inner.InnerCompressWideCircuit(circ, parameters)
+                circ = circ_flat.get_Flat_Circuit()
+                newcount = CNOTGateCount(circ, 0)
+                no_improve = newcount >= count
+                count = newcount
+                if no_improve:
+                    break
+
+        self.config["compression_time"] = time.time() - start_time
+        return circ, parameters
diff --git a/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp b/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
new file mode 100644
index 000000000..ea2505447
--- /dev/null
+++ b/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
@@ -0,0 +1,1628 @@
+/*
+Created on Sat May 02 2026
+Copyright 2026
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+/*! \file N_Qubit_Decomposition_OSR_Compression.cpp
+    \brief OSR-guided top-down compression for an existing gate structure.
+*/
+
+#include "N_Qubit_Decomposition_OSR_Compression.h"
+#include "N_Qubit_Decomposition_Cost_Function.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <set>
+#include <sstream>
+#include <utility>
+
+namespace {
+
+struct CompressionCandidate {
+    std::vector<int> removed_ids;
+    Matrix_real initial_parameters;
+    N_Qubit_Decomposition_OSR_Compression_Score score;
+    int entangling_gate_num;
+    std::shared_ptr<Gates_block> gate_structure;
+    std::string key;
+};
+
+static bool is_entangling_gate_type(gate_type type) {
+    switch (type) {
+    case CNOT_OPERATION:
+    case CZ_OPERATION:
+    case CH_OPERATION:
+    case SYC_OPERATION:
+    case CRY_OPERATION:
+    case CRX_OPERATION:
+    case CRZ_OPERATION:
+    case CP_OPERATION:
+    case CR_OPERATION:
+    case CROT_OPERATION:
+    case CZ_NU_OPERATION:
+    case CU_OPERATION:
+    case ADAPTIVE_OPERATION:
+    case RXX_OPERATION:
+    case RYY_OPERATION:
+    case RZZ_OPERATION:
+    case SWAP_OPERATION:
+    case CSWAP_OPERATION:
+    case CCX_OPERATION:
+        return true;
+    default:
+        return false;
+    }
+}
+
+static bool is_entangling_gate(Gate* gate) {
+    if (gate == NULL || gate->get_type() == BLOCK_OPERATION) {
+        return false;
+    }
+    if (is_entangling_gate_type(gate->get_type())) {
+        return true;
+    }
+    return gate->get_involved_qubits().size() > 1;
+}
+
+static void collect_entangling_gate_paths(Gates_block* block,
+                                          std::vector<int>& prefix,
+                                          std::vector<OSRGatePath>& out) {
+    if (block == NULL) {
+        return;
+    }
+
+    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
+        Gate* gate = block->get_gate(idx);
+        prefix.push_back(idx);
+
+        if (is_entangling_gate(gate)) {
+            OSRGatePath path;
+            path.indices = prefix;
+            out.push_back(path);
+        }
+
+        if (gate != NULL && gate->get_type() == BLOCK_OPERATION) {
+            collect_entangling_gate_paths(static_cast<Gates_block*>(gate), prefix, out);
+        }
+
+        prefix.pop_back();
+    }
+}
+
+static std::vector<OSRGatePath> collect_entangling_gate_paths(Gates_block* block) {
+    std::vector<OSRGatePath> ret;
+    std::vector<int> prefix;
+    collect_entangling_gate_paths(block, prefix, ret);
+    return ret;
+}
+
+static void append_int_vector_signature(std::stringstream& sstream,
+                                        const std::vector<int>& values) {
+    sstream << "[";
+    for (size_t idx = 0; idx < values.size(); ++idx) {
+        if (idx > 0) {
+            sstream << ",";
+        }
+        sstream << values[idx];
+    }
+    sstream << "]";
+}
+
+static void append_gate_structure_signature(Gates_block* block,
+                                            std::stringstream& sstream) {
+    if (block == NULL) {
+        sstream << "NULL";
+        return;
+    }
+
+    sstream << "B" << block->get_gate_num() << "(";
+    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
+        Gate* gate = block->get_gate(idx);
+        if (gate == NULL) {
+            sstream << "NULL;";
+            continue;
+        }
+
+        sstream << static_cast<int>(gate->get_type()) << ":T";
+        std::vector<int> targets = gate->get_target_qbits();
+        if (targets.empty() && gate->get_target_qbit() >= 0) {
+            targets.push_back(gate->get_target_qbit());
+        }
+        append_int_vector_signature(sstream, targets);
+
+        sstream << ":C";
+        std::vector<int> controls = gate->get_control_qbits();
+        if (controls.empty() && gate->get_control_qbit() >= 0) {
+            controls.push_back(gate->get_control_qbit());
+        }
+        append_int_vector_signature(sstream, controls);
+
+        sstream << ":P" << gate->get_parameter_num();
+        if (gate->get_type() == BLOCK_OPERATION) {
+            sstream << "{";
+            append_gate_structure_signature(static_cast<Gates_block*>(gate), sstream);
+            sstream << "}";
+        }
+        sstream << ";";
+    }
+    sstream << ")";
+}
+
+static std::string gate_structure_signature(Gates_block* block) {
+    std::stringstream sstream;
+    append_gate_structure_signature(block, sstream);
+    return sstream.str();
+}
+
+static Gate* gate_at_path(Gates_block* block, const OSRGatePath& path) {
+    Gates_block* current_block = block;
+    for (size_t depth = 0; depth < path.indices.size(); ++depth) {
+        if (current_block == NULL) {
+            return NULL;
+        }
+
+        int gate_idx = path.indices[depth];
+        if (gate_idx < 0 || gate_idx >= current_block->get_gate_num()) {
+            return NULL;
+        }
+
+        Gate* gate = current_block->get_gate(gate_idx);
+        if (depth == path.indices.size() - 1) {
+            return gate;
+        }
+        if (gate == NULL || gate->get_type() != BLOCK_OPERATION) {
+            return NULL;
+        }
+        current_block = static_cast<Gates_block*>(gate);
+    }
+    return NULL;
+}
+
+static bool get_two_qubit_endpoint_pair(Gate* gate, int& q0, int& q1) {
+    if (gate == NULL) {
+        return false;
+    }
+
+    std::vector<int> involved = gate->get_involved_qubits();
+    std::sort(involved.begin(), involved.end());
+    involved.erase(std::unique(involved.begin(), involved.end()), involved.end());
+    if (involved.size() != 2) {
+        return false;
+    }
+
+    q0 = involved[0];
+    q1 = involved[1];
+    return true;
+}
+
+static bool gate_endpoint_sets_are_disjoint(Gate* lhs, Gate* rhs) {
+    int lhs_q0 = 0;
+    int lhs_q1 = 0;
+    int rhs_q0 = 0;
+    int rhs_q1 = 0;
+    if (!get_two_qubit_endpoint_pair(lhs, lhs_q0, lhs_q1) ||
+        !get_two_qubit_endpoint_pair(rhs, rhs_q0, rhs_q1)) {
+        return false;
+    }
+
+    return lhs_q0 != rhs_q0 && lhs_q0 != rhs_q1 &&
+           lhs_q1 != rhs_q0 && lhs_q1 != rhs_q1;
+}
+
+static bool gate_type_is_directional(gate_type type) {
+    switch (type) {
+    case CZ_OPERATION:
+    case SWAP_OPERATION:
+    case RXX_OPERATION:
+    case RYY_OPERATION:
+    case RZZ_OPERATION:
+        return false;
+    default:
+        return true;
+    }
+}
+
+static bool rewire_two_qubit_gate(Gate* gate, int new_target, int new_control) {
+    if (gate == NULL || new_target == new_control) {
+        return false;
+    }
+
+    int old_q0 = 0;
+    int old_q1 = 0;
+    if (!get_two_qubit_endpoint_pair(gate, old_q0, old_q1)) {
+        return false;
+    }
+
+    std::vector<int> controls = gate->get_control_qbits();
+    if (!controls.empty() || gate->get_control_qbit() >= 0) {
+        gate->set_target_qbit(new_target);
+        gate->set_control_qbit(new_control);
+        return true;
+    }
+
+    std::vector<int> targets = gate->get_target_qbits();
+    if (targets.size() >= 2) {
+        std::vector<int> new_targets;
+        new_targets.push_back(new_target);
+        new_targets.push_back(new_control);
+        gate->set_target_qbits(new_targets);
+        return true;
+    }
+
+    return false;
+}
+
+static bool path_has_prefix(const OSRGatePath& path, const std::vector<int>& prefix) {
+    if (path.indices.size() < prefix.size()) {
+        return false;
+    }
+    return std::equal(prefix.begin(), prefix.end(), path.indices.begin());
+}
+
+static bool path_equals_prefix(const OSRGatePath& path, const std::vector<int>& prefix) {
+    return path.indices.size() == prefix.size() && path_has_prefix(path, prefix);
+}
+
+static bool subtree_contains_removed_path(const std::set<OSRGatePath>& removed_paths,
+                                          const std::vector<int>& prefix) {
+    for (std::set<OSRGatePath>::const_iterator it = removed_paths.begin(); it != removed_paths.end(); ++it) {
+        if (path_has_prefix(*it, prefix)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+static Gates_block* clone_without_removed_paths(Gates_block* block,
+                                                const std::set<OSRGatePath>& removed_paths,
+                                                std::vector<int>& prefix) {
+    Gates_block* ret = new Gates_block(block->get_qbit_num());
+
+    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
+        Gate* gate = block->get_gate(idx);
+        prefix.push_back(idx);
+
+        bool remove_gate = false;
+        for (std::set<OSRGatePath>::const_iterator it = removed_paths.begin(); it != removed_paths.end(); ++it) {
+            if (path_equals_prefix(*it, prefix)) {
+                remove_gate = true;
+                break;
+            }
+        }
+
+        if (!remove_gate) {
+            if (gate->get_type() == BLOCK_OPERATION && subtree_contains_removed_path(removed_paths, prefix)) {
+                Gates_block* cloned_block = clone_without_removed_paths(
+                    static_cast<Gates_block*>(gate), removed_paths, prefix);
+                ret->add_gate(cloned_block);
+            } else {
+                ret->add_gate(gate->clone());
+            }
+        }
+
+        prefix.pop_back();
+    }
+
+    return ret;
+}
+
+static Gates_block* clone_without_removed_paths(Gates_block* block,
+                                                const std::vector<OSRGatePath>& all_paths,
+                                                const std::vector<int>& removed_ids) {
+    std::set<OSRGatePath> removed_paths;
+    for (size_t idx = 0; idx < removed_ids.size(); ++idx) {
+        removed_paths.insert(all_paths[removed_ids[idx]]);
+    }
+
+    std::vector<int> prefix;
+    return clone_without_removed_paths(block, removed_paths, prefix);
+}
+
+static Gates_block* clone_with_rewired_gate_path(Gates_block* block,
+                                                 const OSRGatePath& path,
+                                                 int depth,
+                                                 int new_target,
+                                                 int new_control) {
+    Gates_block* ret = new Gates_block(block->get_qbit_num());
+
+    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
+        Gate* gate = block->get_gate(idx);
+        if (gate == NULL) {
+            continue;
+        }
+
+        if (depth < static_cast<int>(path.indices.size()) &&
+            idx == path.indices[depth]) {
+            if (depth == static_cast<int>(path.indices.size()) - 1) {
+                Gate* cloned_gate = gate->clone();
+                if (!rewire_two_qubit_gate(cloned_gate, new_target, new_control)) {
+                    delete cloned_gate;
+                    delete ret;
+                    return NULL;
+                }
+                ret->add_gate(cloned_gate);
+            } else {
+                if (gate->get_type() != BLOCK_OPERATION) {
+                    delete ret;
+                    return NULL;
+                }
+
+                Gates_block* rewired_block = clone_with_rewired_gate_path(
+                    static_cast<Gates_block*>(gate), path, depth + 1,
+                    new_target, new_control);
+                if (rewired_block == NULL) {
+                    delete ret;
+                    return NULL;
+                }
+                ret->add_gate(rewired_block);
+            }
+        } else {
+            ret->add_gate(gate->clone());
+        }
+    }
+
+    return ret;
+}
+
+static Gates_block* clone_with_rewired_gate_path(Gates_block* block,
+                                                 const OSRGatePath& path,
+                                                 int new_target,
+                                                 int new_control) {
+    return clone_with_rewired_gate_path(block, path, 0, new_target, new_control);
+}
+
+static Gates_block* clone_with_swapped_sibling_gates(Gates_block* block,
+                                                     const std::vector<int>& parent_path,
+                                                     int depth,
+                                                     int first_idx,
+                                                     int second_idx) {
+    Gates_block* ret = new Gates_block(block->get_qbit_num());
+
+    if (depth == static_cast<int>(parent_path.size())) {
+        for (int idx = 0; idx < block->get_gate_num(); ++idx) {
+            int source_idx = idx;
+            if (idx == first_idx) {
+                source_idx = second_idx;
+            } else if (idx == second_idx) {
+                source_idx = first_idx;
+            }
+            Gate* source_gate = block->get_gate(source_idx);
+            if (source_gate != NULL) {
+                ret->add_gate(source_gate->clone());
+            }
+        }
+        return ret;
+    }
+
+    int selected_idx = parent_path[depth];
+    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
+        Gate* gate = block->get_gate(idx);
+        if (gate == NULL) {
+            continue;
+        }
+
+        if (idx == selected_idx) {
+            if (gate->get_type() != BLOCK_OPERATION) {
+                delete ret;
+                return NULL;
+            }
+            Gates_block* swapped_block = clone_with_swapped_sibling_gates(
+                static_cast<Gates_block*>(gate), parent_path, depth + 1,
+                first_idx, second_idx);
+            if (swapped_block == NULL) {
+                delete ret;
+                return NULL;
+            }
+            ret->add_gate(swapped_block);
+        } else {
+            ret->add_gate(gate->clone());
+        }
+    }
+
+    return ret;
+}
+
+static Gates_block* clone_with_swapped_sibling_gates(Gates_block* block,
+                                                     const OSRGatePath& first_path,
+                                                     const OSRGatePath& second_path) {
+    if (first_path.indices.size() != second_path.indices.size() ||
+        first_path.indices.empty()) {
+        return NULL;
+    }
+
+    std::vector<int> first_parent(
+        first_path.indices.begin(), first_path.indices.end() - 1);
+    std::vector<int> second_parent(
+        second_path.indices.begin(), second_path.indices.end() - 1);
+    if (first_parent != second_parent) {
+        return NULL;
+    }
+
+    int first_idx = first_path.indices.back();
+    int second_idx = second_path.indices.back();
+    if (first_idx == second_idx) {
+        return NULL;
+    }
+    if (second_idx < first_idx) {
+        std::swap(first_idx, second_idx);
+    }
+
+    return clone_with_swapped_sibling_gates(
+        block, first_parent, 0, first_idx, second_idx);
+}
+
+static bool parameter_interval_for_path(Gates_block* block,
+                                        const OSRGatePath& path,
+                                        int depth,
+                                        int offset,
+                                        int& start,
+                                        int& length) {
+    if (block == NULL || depth >= static_cast<int>(path.indices.size())) {
+        return false;
+    }
+
+    int gate_idx = path.indices[depth];
+    Gate* gate = block->get_gate(gate_idx);
+    if (gate == NULL) {
+        return false;
+    }
+
+    int gate_offset = offset + gate->get_parameter_start_idx();
+    if (depth == static_cast<int>(path.indices.size()) - 1) {
+        start = gate_offset;
+        length = gate->get_parameter_num();
+        return true;
+    }
+
+    if (gate->get_type() != BLOCK_OPERATION) {
+        return false;
+    }
+
+    return parameter_interval_for_path(
+        static_cast<Gates_block*>(gate), path, depth + 1, gate_offset, start, length);
+}
+
+static Matrix_real reduced_parameters_without_paths(
+    Gates_block* original_gate_structure,
+    const std::vector<OSRGatePath>& removed_paths,
+    const Matrix_real& original_parameters) {
+    if (original_parameters.size() == 0 ||
+        original_parameters.size() != original_gate_structure->get_parameter_num()) {
+        return Matrix_real(0, 0);
+    }
+
+    std::vector<std::pair<int, int>> intervals;
+    intervals.reserve(removed_paths.size());
+    for (size_t idx = 0; idx < removed_paths.size(); ++idx) {
+        int start = 0;
+        int length = 0;
+        if (parameter_interval_for_path(
+                original_gate_structure, removed_paths[idx], 0, 0, start, length) &&
+            length > 0) {
+            intervals.push_back(std::make_pair(start, start + length));
+        }
+    }
+
+    if (intervals.empty()) {
+        return original_parameters.copy();
+    }
+
+    std::sort(intervals.begin(), intervals.end());
+    std::vector<std::pair<int, int>> merged;
+    for (size_t idx = 0; idx < intervals.size(); ++idx) {
+        if (merged.empty() || intervals[idx].first > merged.back().second) {
+            merged.push_back(intervals[idx]);
+        } else {
+            merged.back().second = std::max(merged.back().second, intervals[idx].second);
+        }
+    }
+
+    int removed_parameter_num = 0;
+    for (size_t idx = 0; idx < merged.size(); ++idx) {
+        removed_parameter_num += merged[idx].second - merged[idx].first;
+    }
+
+    Matrix_real reduced_parameters(1, original_parameters.size() - removed_parameter_num);
+    int src = 0;
+    int dst = 0;
+    for (size_t idx = 0; idx < merged.size(); ++idx) {
+        int keep_num = merged[idx].first - src;
+        if (keep_num > 0) {
+            std::memcpy(reduced_parameters.get_data() + dst,
+                        original_parameters.get_data() + src,
+                        keep_num * sizeof(double));
+            dst += keep_num;
+        }
+        src = merged[idx].second;
+    }
+
+    if (src < original_parameters.size()) {
+        int keep_num = original_parameters.size() - src;
+        std::memcpy(reduced_parameters.get_data() + dst,
+                    original_parameters.get_data() + src,
+                    keep_num * sizeof(double));
+    }
+
+    return reduced_parameters;
+}
+
+static Matrix_real reduced_parameters_without_removed_paths(
+    Gates_block* original_gate_structure,
+    const std::vector<OSRGatePath>& all_paths,
+    const std::vector<int>& removed_ids,
+    const Matrix_real& original_parameters) {
+    std::vector<OSRGatePath> removed_paths;
+    removed_paths.reserve(removed_ids.size());
+    for (size_t idx = 0; idx < removed_ids.size(); ++idx) {
+        removed_paths.push_back(all_paths[removed_ids[idx]]);
+    }
+    return reduced_parameters_without_paths(
+        original_gate_structure, removed_paths, original_parameters);
+}
+
+static void add_topology_edge(std::set<std::pair<int, int>>& edges, int q0, int q1) {
+    if (q0 == q1) {
+        return;
+    }
+    if (q1 < q0) {
+        std::swap(q0, q1);
+    }
+    edges.insert(std::make_pair(q0, q1));
+}
+
+static void collect_topology_edges(Gates_block* block, std::set<std::pair<int, int>>& edges) {
+    if (block == NULL) {
+        return;
+    }
+
+    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
+        Gate* gate = block->get_gate(idx);
+        if (gate == NULL) {
+            continue;
+        }
+
+        if (gate->get_type() == BLOCK_OPERATION) {
+            collect_topology_edges(static_cast<Gates_block*>(gate), edges);
+            continue;
+        }
+
+        if (!is_entangling_gate(gate)) {
+            continue;
+        }
+
+        std::vector<int> involved = gate->get_involved_qubits();
+        for (size_t q0_idx = 0; q0_idx < involved.size(); ++q0_idx) {
+            for (size_t q1_idx = q0_idx + 1; q1_idx < involved.size(); ++q1_idx) {
+                add_topology_edge(edges, involved[q0_idx], involved[q1_idx]);
+            }
+        }
+    }
+}
+
+static std::vector<matrix_base<int>> topology_from_gate_structure(Gates_block* gate_structure, int qbit_num) {
+    std::set<std::pair<int, int>> edges;
+    collect_topology_edges(gate_structure, edges);
+
+    if (edges.empty() && qbit_num > 1) {
+        for (int q0 = 0; q0 < qbit_num; ++q0) {
+            for (int q1 = q0 + 1; q1 < qbit_num; ++q1) {
+                edges.insert(std::make_pair(q0, q1));
+            }
+        }
+    }
+
+    std::vector<matrix_base<int>> topology;
+    topology.reserve(edges.size());
+    for (std::set<std::pair<int, int>>::const_iterator it = edges.begin(); it != edges.end(); ++it) {
+        matrix_base<int> edge(2, 1);
+        edge[0] = it->first;
+        edge[1] = it->second;
+        topology.push_back(edge);
+    }
+    return topology;
+}
+
+static std::vector<std::pair<int, int>> topology_pairs_from_matrices(
+    const std::vector<matrix_base<int>>& topology) {
+    std::vector<std::pair<int, int>> pairs;
+    pairs.reserve(topology.size());
+    for (size_t idx = 0; idx < topology.size(); ++idx) {
+        int q0 = topology[idx][0];
+        int q1 = topology[idx][1];
+        if (q0 == q1) {
+            continue;
+        }
+        if (q1 < q0) {
+            std::swap(q0, q1);
+        }
+        std::pair<int, int> edge(q0, q1);
+        if (std::find(pairs.begin(), pairs.end(), edge) == pairs.end()) {
+            pairs.push_back(edge);
+        }
+    }
+    return pairs;
+}
+
+static std::vector<std::pair<int, int>> complete_topology_pairs(int qbit_num) {
+    std::vector<std::pair<int, int>> pairs;
+    for (int q0 = 0; q0 < qbit_num; ++q0) {
+        for (int q1 = q0 + 1; q1 < qbit_num; ++q1) {
+            pairs.push_back(std::make_pair(q0, q1));
+        }
+    }
+    return pairs;
+}
+
+static Gates_block* construct_cnot_skeleton_gate_structure(
+    int qbit_num,
+    const std::vector<std::pair<int, int>>& edges,
+    const std::vector<int>& sequence) {
+    Gates_block* gate_structure = new Gates_block(qbit_num);
+
+    for (size_t idx = 0; idx < sequence.size(); ++idx) {
+        int edge_idx = sequence[idx];
+        if (edge_idx < 0 || edge_idx >= static_cast<int>(edges.size())) {
+            delete gate_structure;
+            return NULL;
+        }
+
+        Gates_block* layer = new Gates_block(qbit_num);
+        int target = edges[edge_idx].first;
+        int control = edges[edge_idx].second;
+        layer->add_u3(target);
+        layer->add_u3(control);
+        layer->add_cnot(target, control);
+        gate_structure->add_gate(layer);
+    }
+
+    Gates_block* final_layer = new Gates_block(qbit_num);
+    for (int qbit = 0; qbit < qbit_num; ++qbit) {
+        final_layer->add_u3(qbit);
+    }
+    gate_structure->add_gate(final_layer);
+
+    return gate_structure;
+}
+
+static int64_t limited_integer_power(int base, int exponent, int64_t limit) {
+    int64_t value = 1;
+    for (int idx = 0; idx < exponent; ++idx) {
+        if (base <= 0 || value > limit / base) {
+            return limit + 1;
+        }
+        value *= base;
+    }
+    return value;
+}
+
+static std::vector<CompressionCandidate> generate_cnot_skeleton_candidates(
+    int qbit_num,
+    int original_entangling_gate_num,
+    const std::vector<std::pair<int, int>>& edges,
+    const N_Qubit_Decomposition_OSR_Compression_Options& options) {
+    std::vector<CompressionCandidate> candidates;
+    if (!options.enable_skeleton_search || edges.empty() ||
+        options.skeleton_max_candidates <= 0 || original_entangling_gate_num <= 0) {
+        return candidates;
+    }
+
+    int target_depth = options.skeleton_target_cnots;
+    if (target_depth < 0) {
+        int removed = options.max_removed_gates >= 0 ? options.max_removed_gates : 2;
+        target_depth = original_entangling_gate_num - removed;
+    }
+    if (target_depth < 0 || target_depth >= original_entangling_gate_num) {
+        return candidates;
+    }
+
+    int edge_num = static_cast<int>(edges.size());
+    int64_t combination_num = limited_integer_power(
+        edge_num, target_depth, static_cast<int64_t>(options.skeleton_max_candidates));
+
+    if (combination_num > options.skeleton_max_candidates) {
+        return candidates;
+    }
+
+    std::set<std::string> seen;
+    for (int64_t state = 0; state < combination_num; ++state) {
+        int64_t value = state;
+        std::vector<int> sequence(target_depth, 0);
+        for (int depth = target_depth - 1; depth >= 0; --depth) {
+            sequence[depth] = static_cast<int>(value % edge_num);
+            value /= edge_num;
+        }
+
+        std::shared_ptr<Gates_block> gate_structure(
+            construct_cnot_skeleton_gate_structure(qbit_num, edges, sequence));
+        if (!gate_structure) {
+            continue;
+        }
+
+        std::string key = gate_structure_signature(gate_structure.get());
+        if (!seen.insert(key).second) {
+            continue;
+        }
+
+        CompressionCandidate candidate;
+        candidate.entangling_gate_num = target_depth;
+        candidate.gate_structure = gate_structure;
+        candidate.key = key;
+        candidate.initial_parameters = Matrix_real(0, 0);
+        candidates.push_back(candidate);
+    }
+
+    return candidates;
+}
+
+static double residual_sum(const std::vector<std::pair<int, double>>& cut_bounds) {
+    return std::accumulate(cut_bounds.begin(), cut_bounds.end(), 0.0,
+        [&cut_bounds](double acc, const std::pair<int, double>& item) {
+            return acc + item.first * cut_bounds.size() + item.second;
+        });
+}
+
+static bool score_less(const N_Qubit_Decomposition_OSR_Compression_Score& lhs,
+                       const N_Qubit_Decomposition_OSR_Compression_Score& rhs) {
+    if (lhs.min_remaining_cnots != rhs.min_remaining_cnots) {
+        return lhs.min_remaining_cnots < rhs.min_remaining_cnots;
+    }
+    if (lhs.kappa != rhs.kappa) {
+        return lhs.kappa < rhs.kappa;
+    }
+    return lhs.residual < rhs.residual;
+}
+
+static bool beam_candidate_less(const CompressionCandidate& lhs,
+                                const CompressionCandidate& rhs) {
+    if (score_less(lhs.score, rhs.score)) {
+        return true;
+    }
+    if (score_less(rhs.score, lhs.score)) {
+        return false;
+    }
+    return lhs.key < rhs.key;
+}
+
+static bool final_candidate_less(const CompressionCandidate& lhs,
+                                 const CompressionCandidate& rhs) {
+    if (lhs.entangling_gate_num != rhs.entangling_gate_num) {
+        return lhs.entangling_gate_num < rhs.entangling_gate_num;
+    }
+    return beam_candidate_less(lhs, rhs);
+}
+
+static std::string compression_candidate_key(const CompressionCandidate& candidate) {
+    if (!candidate.key.empty()) {
+        return candidate.key;
+    }
+
+    std::stringstream sstream;
+    sstream << "removed:";
+    for (size_t idx = 0; idx < candidate.removed_ids.size(); ++idx) {
+        if (idx > 0) {
+            sstream << ",";
+        }
+        sstream << candidate.removed_ids[idx];
+    }
+    return sstream.str();
+}
+
+static void sort_unique_candidates(std::vector<CompressionCandidate>& candidates, bool final_sort) {
+    std::sort(candidates.begin(), candidates.end(),
+              final_sort ? final_candidate_less : beam_candidate_less);
+
+    std::set<std::string> seen;
+    std::vector<CompressionCandidate> unique_candidates;
+    unique_candidates.reserve(candidates.size());
+    for (size_t idx = 0; idx < candidates.size(); ++idx) {
+        std::string key = compression_candidate_key(candidates[idx]);
+        if (seen.insert(key).second) {
+            unique_candidates.push_back(candidates[idx]);
+        }
+    }
+    candidates.swap(unique_candidates);
+}
+
+static Gates_block* clone_gate_structure_for_candidate(
+    Gates_block* original_gate_structure,
+    const std::vector<OSRGatePath>& original_paths,
+    const CompressionCandidate& candidate) {
+    if (candidate.gate_structure) {
+        return candidate.gate_structure->clone();
+    }
+    return clone_without_removed_paths(
+        original_gate_structure, original_paths, candidate.removed_ids);
+}
+
+static bool edge_shares_endpoint(const std::pair<int, int>& edge, int q0, int q1) {
+    return edge.first == q0 || edge.first == q1 ||
+           edge.second == q0 || edge.second == q1;
+}
+
+static bool same_undirected_edge(const std::pair<int, int>& edge, int q0, int q1) {
+    int a = q0;
+    int b = q1;
+    if (b < a) {
+        std::swap(a, b);
+    }
+    return edge.first == a && edge.second == b;
+}
+
+static bool same_parent_and_adjacent(const OSRGatePath& lhs,
+                                     const OSRGatePath& rhs) {
+    if (lhs.indices.size() != rhs.indices.size() || lhs.indices.empty()) {
+        return false;
+    }
+
+    for (size_t idx = 0; idx + 1 < lhs.indices.size(); ++idx) {
+        if (lhs.indices[idx] != rhs.indices[idx]) {
+            return false;
+        }
+    }
+
+    return std::abs(lhs.indices.back() - rhs.indices.back()) == 1;
+}
+
+static void append_candidate_if_new(std::vector<CompressionCandidate>& out,
+                                    std::set<std::string>& seen,
+                                    CompressionCandidate& candidate) {
+    if (!candidate.gate_structure) {
+        return;
+    }
+
+    candidate.key = gate_structure_signature(candidate.gate_structure.get());
+    if (seen.insert(candidate.key).second) {
+        candidate.entangling_gate_num =
+            static_cast<int>(collect_entangling_gate_paths(candidate.gate_structure.get()).size());
+        out.push_back(candidate);
+    }
+}
+
+static std::vector<CompressionCandidate> generate_local_mutation_candidates(
+    Gates_block* base_structure,
+    const Matrix_real& base_parameters,
+    const CompressionCandidate& parent,
+    const std::vector<std::pair<int, int>>& mutation_edges,
+    const N_Qubit_Decomposition_OSR_Compression_Options& options) {
+    std::vector<CompressionCandidate> candidates;
+    if (!options.enable_mutations || options.mutation_candidates <= 0 ||
+        base_structure == NULL) {
+        return candidates;
+    }
+
+    std::vector<OSRGatePath> paths = collect_entangling_gate_paths(base_structure);
+    if (paths.empty()) {
+        return candidates;
+    }
+
+    std::set<std::string> seen;
+    seen.insert(gate_structure_signature(base_structure));
+
+    for (size_t idx = 0; idx + 1 < paths.size() &&
+                         static_cast<int>(candidates.size()) < options.mutation_candidates; ++idx) {
+        const OSRGatePath& lhs_path = paths[idx];
+        const OSRGatePath& rhs_path = paths[idx + 1];
+        if (!same_parent_and_adjacent(lhs_path, rhs_path)) {
+            continue;
+        }
+
+        Gate* lhs_gate = gate_at_path(base_structure, lhs_path);
+        Gate* rhs_gate = gate_at_path(base_structure, rhs_path);
+        if (!gate_endpoint_sets_are_disjoint(lhs_gate, rhs_gate)) {
+            continue;
+        }
+
+        std::shared_ptr<Gates_block> swapped(
+            clone_with_swapped_sibling_gates(base_structure, lhs_path, rhs_path));
+        if (!swapped) {
+            continue;
+        }
+
+        CompressionCandidate child = parent;
+        child.gate_structure = swapped;
+        if (base_parameters.size() == base_structure->get_parameter_num() &&
+            swapped->get_parameter_num() == base_parameters.size()) {
+            child.initial_parameters = base_parameters.copy();
+        } else {
+            child.initial_parameters = Matrix_real(0, 0);
+        }
+        append_candidate_if_new(candidates, seen, child);
+    }
+
+    for (size_t path_idx = 0; path_idx < paths.size() &&
+                              static_cast<int>(candidates.size()) < options.mutation_candidates; ++path_idx) {
+        Gate* gate = gate_at_path(base_structure, paths[path_idx]);
+        if (gate == NULL) {
+            continue;
+        }
+
+        int old_q0 = 0;
+        int old_q1 = 0;
+        if (!get_two_qubit_endpoint_pair(gate, old_q0, old_q1)) {
+            continue;
+        }
+
+        bool directional = gate_type_is_directional(gate->get_type()) &&
+            (!gate->get_control_qbits().empty() || gate->get_control_qbit() >= 0);
+
+        for (int pass = 0; pass < 2 &&
+                           static_cast<int>(candidates.size()) < options.mutation_candidates; ++pass) {
+            for (size_t edge_idx = 0; edge_idx < mutation_edges.size() &&
+                                      static_cast<int>(candidates.size()) < options.mutation_candidates; ++edge_idx) {
+                const std::pair<int, int>& edge = mutation_edges[edge_idx];
+                if (same_undirected_edge(edge, old_q0, old_q1)) {
+                    continue;
+                }
+
+                bool shares_endpoint = edge_shares_endpoint(edge, old_q0, old_q1);
+                if ((pass == 0 && !shares_endpoint) || (pass == 1 && shares_endpoint)) {
+                    continue;
+                }
+
+                int orientation_count = directional ? 2 : 1;
+                for (int orientation = 0;
+                     orientation < orientation_count &&
+                     static_cast<int>(candidates.size()) < options.mutation_candidates;
+                     ++orientation) {
+                    int new_target = (orientation == 0) ? edge.first : edge.second;
+                    int new_control = (orientation == 0) ? edge.second : edge.first;
+
+                    std::shared_ptr<Gates_block> rewired(
+                        clone_with_rewired_gate_path(
+                            base_structure, paths[path_idx], new_target, new_control));
+                    if (!rewired) {
+                        continue;
+                    }
+
+                    CompressionCandidate child = parent;
+                    child.gate_structure = rewired;
+                    if (base_parameters.size() == base_structure->get_parameter_num() &&
+                        rewired->get_parameter_num() == base_parameters.size()) {
+                        child.initial_parameters = base_parameters.copy();
+                    } else {
+                        child.initial_parameters = Matrix_real(0, 0);
+                    }
+                    append_candidate_if_new(candidates, seen, child);
+                }
+            }
+        }
+    }
+
+    return candidates;
+}
+
+static bool candidate_is_osr_admissible(const CompressionCandidate& candidate,
+                                        const N_Qubit_Decomposition_OSR_Compression_Options& options) {
+    return candidate.score.min_remaining_cnots <= options.osr_bound_limit;
+}
+
+} // namespace
+
+N_Qubit_Decomposition_OSR_Compression_Result::N_Qubit_Decomposition_OSR_Compression_Result()
+    : current_minimum(std::numeric_limits<double>::infinity()),
+      original_entangling_gate_num(0),
+      compressed_entangling_gate_num(0),
+      validated(false),
+      reached_tolerance(false),
+      decomposition_error(std::numeric_limits<double>::infinity()) {}
+
+N_Qubit_Decomposition_OSR_Compression::N_Qubit_Decomposition_OSR_Compression()
+    : N_Qubit_Decomposition_custom() {
+    name = "OSR_Compression";
+}
+
+N_Qubit_Decomposition_OSR_Compression::N_Qubit_Decomposition_OSR_Compression(
+    Matrix Umtx_in,
+    int qbit_num_in,
+    std::map<std::string, Config_Element>& config,
+    int accelerator_num)
+    : N_Qubit_Decomposition_custom(Umtx_in, qbit_num_in, false, config, RANDOM, accelerator_num) {
+    name = "OSR_Compression";
+}
+
+N_Qubit_Decomposition_OSR_Compression::N_Qubit_Decomposition_OSR_Compression(
+    Matrix Umtx_in,
+    int qbit_num_in,
+    std::vector<matrix_base<int>> topology_in,
+    std::map<std::string, Config_Element>& config,
+    int accelerator_num)
+    : N_Qubit_Decomposition_custom(Umtx_in, qbit_num_in, false, config, RANDOM, accelerator_num),
+      topology(std::move(topology_in)) {
+    name = "OSR_Compression";
+}
+
+N_Qubit_Decomposition_OSR_Compression::~N_Qubit_Decomposition_OSR_Compression() {}
+
+void N_Qubit_Decomposition_OSR_Compression::start_decomposition() {
+    std::unique_ptr<Gates_block> original_gate_structure(clone());
+    Matrix_real original_parameters = optimized_parameters_mtx.size() > 0
+        ? optimized_parameters_mtx.copy()
+        : Matrix_real(0, 0);
+
+    double optimization_tolerance_loc;
+    if (config.count("optimization_tolerance") > 0) {
+        config["optimization_tolerance"].get_property(optimization_tolerance_loc);
+    } else {
+        optimization_tolerance_loc = optimization_tolerance;
+    }
+
+    N_Qubit_Decomposition_OSR_Compression_Result result =
+        compress_gate_structure(original_gate_structure.get());
+
+    if (!result.reached_tolerance &&
+        original_parameters.size() == original_gate_structure->get_parameter_num()) {
+        double original_cost = optimization_problem(original_parameters);
+        if (original_cost < optimization_tolerance_loc ||
+            original_cost < result.current_minimum) {
+            result.gate_structure.reset(original_gate_structure->clone());
+            result.optimized_parameters = original_parameters.copy();
+            result.current_minimum = original_cost;
+            result.decomposition_error = original_cost;
+            result.compressed_entangling_gate_num =
+                result.original_entangling_gate_num;
+            result.removed_gate_paths.clear();
+            result.validated = true;
+            result.reached_tolerance = original_cost < optimization_tolerance_loc;
+        }
+    }
+
+    release_gates();
+    combine(result.gate_structure.get());
+
+    if (result.validated && result.optimized_parameters.size() == get_parameter_num()) {
+        optimized_parameters_mtx = result.optimized_parameters.copy();
+        current_minimum = result.current_minimum;
+        decomposition_error = result.decomposition_error;
+    } else {
+        N_Qubit_Decomposition_custom::start_decomposition();
+    }
+}
+
+N_Qubit_Decomposition_OSR_Compression_Options
+N_Qubit_Decomposition_OSR_Compression::get_osr_compression_options() {
+    N_Qubit_Decomposition_OSR_Compression_Options options;
+
+    long long int_value;
+    double double_value;
+
+    if (config.count("osr_compression_beam") > 0) {
+        config["osr_compression_beam"].get_property(int_value);
+        options.beam_width = static_cast<int>(int_value);
+    }
+    if (config.count("osr_compression_max_removed") > 0) {
+        config["osr_compression_max_removed"].get_property(int_value);
+        options.max_removed_gates = static_cast<int>(int_value);
+    }
+    if (config.count("osr_compression_bound_limit") > 0) {
+        config["osr_compression_bound_limit"].get_property(int_value);
+        options.osr_bound_limit = static_cast<int>(int_value);
+    }
+    if (config.count("osr_compression_validation_trials") > 0) {
+        config["osr_compression_validation_trials"].get_property(int_value);
+        options.validation_trials = static_cast<int>(int_value);
+    }
+    if (config.count("osr_compression_validate") > 0) {
+        config["osr_compression_validate"].get_property(int_value);
+        options.validate_final = int_value != 0;
+    }
+    if (config.count("osr_compression_osr_tolerance") > 0) {
+        config["osr_compression_osr_tolerance"].get_property(double_value);
+        options.osr_tolerance = double_value;
+    }
+    if (config.count("osr_compression_enable_mutations") > 0) {
+        config["osr_compression_enable_mutations"].get_property(int_value);
+        options.enable_mutations = int_value != 0;
+    }
+    if (config.count("osr_compression_mutation_rounds") > 0) {
+        config["osr_compression_mutation_rounds"].get_property(int_value);
+        options.mutation_rounds = static_cast<int>(int_value);
+    }
+    if (config.count("osr_compression_mutation_candidates") > 0) {
+        config["osr_compression_mutation_candidates"].get_property(int_value);
+        options.mutation_candidates = static_cast<int>(int_value);
+    }
+    if (config.count("osr_compression_mutate_full_topology") > 0) {
+        config["osr_compression_mutate_full_topology"].get_property(int_value);
+        options.mutate_full_topology = int_value != 0;
+    }
+    if (config.count("osr_compression_enable_skeleton_search") > 0) {
+        config["osr_compression_enable_skeleton_search"].get_property(int_value);
+        options.enable_skeleton_search = int_value != 0;
+    }
+    if (config.count("osr_compression_skeleton_target_cnots") > 0) {
+        config["osr_compression_skeleton_target_cnots"].get_property(int_value);
+        options.skeleton_target_cnots = static_cast<int>(int_value);
+    }
+    if (config.count("osr_compression_skeleton_max_candidates") > 0) {
+        config["osr_compression_skeleton_max_candidates"].get_property(int_value);
+        options.skeleton_max_candidates = static_cast<int>(int_value);
+    }
+
+    options.beam_width = std::max(options.beam_width, 1);
+    options.validation_trials = std::max(options.validation_trials, 1);
+    options.osr_bound_limit = std::max(options.osr_bound_limit, 0);
+    options.mutation_rounds = std::max(options.mutation_rounds, 0);
+    options.mutation_candidates = std::max(options.mutation_candidates, 0);
+    options.skeleton_max_candidates = std::max(options.skeleton_max_candidates, 0);
+
+    return options;
+}
+
+N_Qubit_Decomposition_custom
+N_Qubit_Decomposition_OSR_Compression::prepare_custom_optimizer(
+    Gates_block* gate_structure_in,
+    cost_function_type cost_function_variant) {
+    double optimization_tolerance_loc;
+    if (config.count("optimization_tolerance") > 0) {
+        config["optimization_tolerance"].get_property(optimization_tolerance_loc);
+    } else {
+        optimization_tolerance_loc = optimization_tolerance;
+    }
+
+    N_Qubit_Decomposition_custom cDecomp_custom_random =
+        N_Qubit_Decomposition_custom(Umtx.copy(), qbit_num, false, config, RANDOM, accelerator_num);
+    cDecomp_custom_random.set_custom_gate_structure(gate_structure_in);
+    cDecomp_custom_random.set_optimization_blocks(gate_structure_in->get_gate_num());
+    cDecomp_custom_random.set_max_iteration(max_outer_iterations);
+#ifndef __DFE__
+    cDecomp_custom_random.set_verbose(verbose);
+#else
+    cDecomp_custom_random.set_verbose(0);
+#endif
+    cDecomp_custom_random.set_cost_function_variant(cost_function_variant);
+    cDecomp_custom_random.set_debugfile("");
+    cDecomp_custom_random.set_optimization_tolerance(optimization_tolerance_loc);
+    cDecomp_custom_random.set_trace_offset(trace_offset);
+    cDecomp_custom_random.set_optimizer(alg);
+
+    if (alg == ADAM || alg == BFGS2) {
+        int max_inner_iterations_loc = 10000;
+        int param_num_loc = gate_structure_in->get_parameter_num();
+        max_inner_iterations_loc = static_cast<int>((double)param_num_loc / 852 * 10000000.0);
+        cDecomp_custom_random.set_max_inner_iterations(max_inner_iterations_loc);
+        cDecomp_custom_random.set_random_shift_count_max(5);
+    } else if (alg == ADAM_BATCHED) {
+        int max_inner_iterations_loc = 2000;
+        cDecomp_custom_random.set_max_inner_iterations(max_inner_iterations_loc);
+        cDecomp_custom_random.set_random_shift_count_max(5);
+    } else if (alg == BFGS) {
+        int max_inner_iterations_loc = 10000;
+        cDecomp_custom_random.set_max_inner_iterations(max_inner_iterations_loc);
+    }
+
+    return cDecomp_custom_random;
+}
+
+N_Qubit_Decomposition_OSR_Compression_Score
+N_Qubit_Decomposition_OSR_Compression::evaluate_gate_structure_osr(
+    Gates_block* gate_structure_in,
+    const Matrix_real& initial_parameters,
+    MinCnotBoundSolver& osr_bound_solver,
+    std::vector<std::vector<int>>& all_cuts) {
+    N_Qubit_Decomposition_OSR_Compression_Options options = get_osr_compression_options();
+    N_Qubit_Decomposition_OSR_Compression_Score best_score;
+    best_score.min_remaining_cnots = std::numeric_limits<int>::max();
+    best_score.kappa = std::numeric_limits<double>::infinity();
+    best_score.residual = std::numeric_limits<double>::infinity();
+
+    if (qbit_num <= 1 || all_cuts.empty()) {
+        best_score.min_remaining_cnots = 0;
+        best_score.kappa = 0.0;
+        best_score.residual = 0.0;
+        return best_score;
+    }
+
+    double Fnorm = std::sqrt(static_cast<double>(1 << qbit_num));
+    std::uniform_real_distribution<> distrib_real(0.0, 2 * M_PI);
+
+    N_Qubit_Decomposition_custom cDecomp_custom_random =
+        prepare_custom_optimizer(gate_structure_in, OSR_ENTANGLEMENT);
+    std::vector<double> optimized_parameters(cDecomp_custom_random.get_parameter_num());
+    if (initial_parameters.size() == cDecomp_custom_random.get_parameter_num()) {
+        std::copy(initial_parameters.get_data(),
+                  initial_parameters.get_data() + initial_parameters.size(),
+                  optimized_parameters.begin());
+    } else if (optimized_parameters_mtx.size() == cDecomp_custom_random.get_parameter_num()) {
+        std::copy(optimized_parameters_mtx.get_data(),
+                  optimized_parameters_mtx.get_data() + optimized_parameters_mtx.size(),
+                  optimized_parameters.begin());
+    } else {
+        for (size_t idx = 0; idx < optimized_parameters.size(); ++idx) {
+            optimized_parameters[idx] = distrib_real(gen);
+        }
+    }
+    if (!optimized_parameters.empty()) {
+        cDecomp_custom_random.set_optimized_parameters(
+            optimized_parameters.data(), static_cast<int>(optimized_parameters.size()));
+    }
+
+    for (const std::vector<int>& cut : all_cuts) {
+        if (cut.size() != 1) {
+            continue;
+        }
+
+        int cut_size = static_cast<int>(cut.size());
+        int max_rank = 2 * std::min(cut_size, qbit_num - cut_size);
+        max_rank = std::max(max_rank, 1);
+
+        for (int rank = max_rank - 1; rank >= 0; --rank) {
+            cDecomp_custom_random.set_osr_params({cut}, rank, false);
+            cDecomp_custom_random.start_decomposition();
+
+            Matrix U = Umtx.copy();
+            Matrix_real params = cDecomp_custom_random.get_optimized_parameters();
+            cDecomp_custom_random.apply_to(params, U);
+
+            std::vector<std::pair<int, double>> osr_result;
+            osr_result.reserve(all_cuts.size());
+            int newrank = rank;
+            for (const std::vector<int>& eval_cut : all_cuts) {
+                osr_result.emplace_back(
+                    operator_schmidt_rank(U, qbit_num, eval_cut, Fnorm, options.osr_tolerance));
+                if (cut == eval_cut) {
+                    newrank = osr_result.back().first;
+                }
+            }
+
+            double kappa = std::numeric_limits<double>::infinity();
+            std::vector<int> edge_counts;
+            int min_cnots = osr_bound_solver.solve_min_cnots(osr_result, kappa, edge_counts);
+
+            N_Qubit_Decomposition_OSR_Compression_Score score;
+            score.min_remaining_cnots = min_cnots;
+            score.kappa = kappa;
+            score.residual = residual_sum(osr_result);
+            score.edge_counts = edge_counts;
+            score.cut_bounds = osr_result;
+
+            if (score_less(score, best_score)) {
+                best_score = score;
+            }
+
+            if (newrank > rank) {
+                break;
+            }
+            rank = std::min(rank, newrank);
+        }
+    }
+
+    if (best_score.min_remaining_cnots == std::numeric_limits<int>::max()) {
+        std::vector<std::pair<int, double>> osr_result(all_cuts.size(), std::make_pair(0, 0.0));
+        double kappa = std::numeric_limits<double>::infinity();
+        std::vector<int> edge_counts;
+        best_score.min_remaining_cnots = osr_bound_solver.solve_min_cnots(osr_result, kappa, edge_counts);
+        best_score.kappa = kappa;
+        best_score.residual = 0.0;
+        best_score.edge_counts = edge_counts;
+        best_score.cut_bounds = osr_result;
+    }
+
+    return best_score;
+}
+
+void N_Qubit_Decomposition_OSR_Compression::validate_compressed_gate_structure(
+    Gates_block* gate_structure_in,
+    const Matrix_real& initial_parameters,
+    N_Qubit_Decomposition_OSR_Compression_Result& result) {
+    N_Qubit_Decomposition_OSR_Compression_Options options = get_osr_compression_options();
+
+    double optimization_tolerance_loc;
+    if (config.count("optimization_tolerance") > 0) {
+        config["optimization_tolerance"].get_property(optimization_tolerance_loc);
+    } else {
+        optimization_tolerance_loc = optimization_tolerance;
+    }
+
+    result.validated = true;
+    result.current_minimum = std::numeric_limits<double>::infinity();
+    result.decomposition_error = std::numeric_limits<double>::infinity();
+
+    std::uniform_real_distribution<> distrib_real(0.0, 2 * M_PI);
+    for (int iter = 0; iter < options.validation_trials; ++iter) {
+        N_Qubit_Decomposition_custom cDecomp_custom_random =
+            prepare_custom_optimizer(gate_structure_in, cost_fnc);
+
+        std::vector<double> optimized_parameters(cDecomp_custom_random.get_parameter_num());
+        if (iter == 0 && initial_parameters.size() == cDecomp_custom_random.get_parameter_num()) {
+            std::copy(initial_parameters.get_data(),
+                      initial_parameters.get_data() + initial_parameters.size(),
+                      optimized_parameters.begin());
+        } else if (iter == 0 && optimized_parameters_mtx.size() == cDecomp_custom_random.get_parameter_num()) {
+            std::copy(optimized_parameters_mtx.get_data(),
+                      optimized_parameters_mtx.get_data() + optimized_parameters_mtx.size(),
+                      optimized_parameters.begin());
+        } else {
+            for (size_t idx = 0; idx < optimized_parameters.size(); ++idx) {
+                optimized_parameters[idx] = distrib_real(gen);
+            }
+        }
+        if (!optimized_parameters.empty()) {
+            cDecomp_custom_random.set_optimized_parameters(
+                optimized_parameters.data(), static_cast<int>(optimized_parameters.size()));
+        }
+
+        cDecomp_custom_random.start_decomposition();
+        Matrix_real optimized_parameters_tmp = cDecomp_custom_random.get_optimized_parameters();
+        double current_minimum_tmp = cDecomp_custom_random.optimization_problem(optimized_parameters_tmp);
+        if (current_minimum_tmp < result.current_minimum) {
+            result.current_minimum = current_minimum_tmp;
+            result.optimized_parameters = optimized_parameters_tmp.copy();
+            result.decomposition_error = cDecomp_custom_random.get_decomposition_error();
+        }
+        if (current_minimum_tmp < optimization_tolerance_loc &&
+            cDecomp_custom_random.get_decomposition_error() < optimization_tolerance_loc) {
+            result.reached_tolerance = true;
+            break;
+        }
+    }
+}
+
+N_Qubit_Decomposition_OSR_Compression_Result
+N_Qubit_Decomposition_OSR_Compression::compress_gate_structure(
+    Gates_block* gate_structure_in) {
+    if (gate_structure_in == NULL) {
+        std::string err("N_Qubit_Decomposition_OSR_Compression::compress_gate_structure: gate_structure is null");
+        throw err;
+    }
+
+    N_Qubit_Decomposition_OSR_Compression_Options options = get_osr_compression_options();
+    std::vector<OSRGatePath> removable_paths = collect_entangling_gate_paths(gate_structure_in);
+
+    std::vector<std::vector<int>> all_cuts = unique_cuts(qbit_num);
+    std::sort(all_cuts.begin(), all_cuts.end(), [](const std::vector<int>& a, const std::vector<int>& b) {
+        if (a.size() != b.size()) {
+            return a.size() < b.size();
+        }
+        return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
+    });
+
+    std::vector<matrix_base<int>> active_topology = !this->topology.empty()
+        ? this->topology
+        : topology_from_gate_structure(gate_structure_in, qbit_num);
+    std::vector<std::pair<int, int>> mutation_edges = options.mutate_full_topology
+        ? complete_topology_pairs(qbit_num)
+        : topology_pairs_from_matrices(active_topology);
+    MinCnotBoundSolver osr_bound_solver(qbit_num, all_cuts, active_topology);
+
+    CompressionCandidate root;
+    root.entangling_gate_num = static_cast<int>(removable_paths.size());
+    root.key = gate_structure_signature(gate_structure_in);
+    if (optimized_parameters_mtx.size() == gate_structure_in->get_parameter_num()) {
+        root.initial_parameters = optimized_parameters_mtx.copy();
+    }
+    root.score = evaluate_gate_structure_osr(
+        gate_structure_in, root.initial_parameters, osr_bound_solver, all_cuts);
+
+    CompressionCandidate best = root;
+    std::vector<CompressionCandidate> beam(1, root);
+
+    int max_removed = options.max_removed_gates < 0
+        ? static_cast<int>(removable_paths.size())
+        : std::min(options.max_removed_gates, static_cast<int>(removable_paths.size()));
+
+    for (int depth = 1; depth <= max_removed; ++depth) {
+        std::vector<CompressionCandidate> next_candidates;
+
+        for (size_t beam_idx = 0; beam_idx < beam.size(); ++beam_idx) {
+            const CompressionCandidate& parent = beam[beam_idx];
+            int start_id = parent.removed_ids.empty() ? 0 : parent.removed_ids.back() + 1;
+
+            for (int remove_id = start_id; remove_id < static_cast<int>(removable_paths.size()); ++remove_id) {
+                CompressionCandidate child;
+                child.removed_ids = parent.removed_ids;
+                child.removed_ids.push_back(remove_id);
+                child.entangling_gate_num =
+                    static_cast<int>(removable_paths.size()) - static_cast<int>(child.removed_ids.size());
+
+                std::unique_ptr<Gates_block> candidate_gate_structure(
+                    clone_without_removed_paths(gate_structure_in, removable_paths, child.removed_ids));
+                child.initial_parameters = reduced_parameters_without_removed_paths(
+                    gate_structure_in, removable_paths, child.removed_ids, optimized_parameters_mtx);
+                child.key = gate_structure_signature(candidate_gate_structure.get());
+                child.score = evaluate_gate_structure_osr(
+                    candidate_gate_structure.get(), child.initial_parameters, osr_bound_solver, all_cuts);
+
+                if (candidate_is_osr_admissible(child, options)) {
+                    next_candidates.push_back(child);
+                }
+            }
+        }
+
+        if (next_candidates.empty()) {
+            break;
+        }
+
+        sort_unique_candidates(next_candidates, false);
+        if (static_cast<int>(next_candidates.size()) > options.beam_width) {
+            next_candidates.resize(options.beam_width);
+        }
+
+        for (size_t idx = 0; idx < next_candidates.size(); ++idx) {
+            if (candidate_is_osr_admissible(next_candidates[idx], options) &&
+                final_candidate_less(next_candidates[idx], best)) {
+                best = next_candidates[idx];
+            }
+        }
+
+        beam.swap(next_candidates);
+    }
+
+    std::vector<CompressionCandidate> validation_pool = beam;
+    validation_pool.push_back(root);
+    validation_pool.push_back(best);
+
+    if (options.enable_mutations && options.mutation_rounds > 0 &&
+        options.mutation_candidates > 0 && !mutation_edges.empty()) {
+        std::vector<CompressionCandidate> mutation_seeds = validation_pool;
+        sort_unique_candidates(mutation_seeds, true);
+        if (static_cast<int>(mutation_seeds.size()) > options.beam_width) {
+            mutation_seeds.resize(options.beam_width);
+        }
+
+        for (int round = 0; round < options.mutation_rounds; ++round) {
+            std::vector<CompressionCandidate> round_mutations;
+
+            for (size_t seed_idx = 0; seed_idx < mutation_seeds.size(); ++seed_idx) {
+                const CompressionCandidate& seed = mutation_seeds[seed_idx];
+                std::unique_ptr<Gates_block> seed_gate_structure(
+                    clone_gate_structure_for_candidate(
+                        gate_structure_in, removable_paths, seed));
+
+                std::vector<CompressionCandidate> local_mutations =
+                    generate_local_mutation_candidates(
+                        seed_gate_structure.get(), seed.initial_parameters, seed,
+                        mutation_edges, options);
+
+                for (size_t mut_idx = 0; mut_idx < local_mutations.size(); ++mut_idx) {
+                    CompressionCandidate& mutation = local_mutations[mut_idx];
+                    mutation.score = evaluate_gate_structure_osr(
+                        mutation.gate_structure.get(), mutation.initial_parameters,
+                        osr_bound_solver, all_cuts);
+                    if (candidate_is_osr_admissible(mutation, options)) {
+                        round_mutations.push_back(mutation);
+                    }
+                }
+            }
+
+            if (round_mutations.empty()) {
+                break;
+            }
+
+            sort_unique_candidates(round_mutations, false);
+            if (static_cast<int>(round_mutations.size()) > options.mutation_candidates) {
+                round_mutations.resize(options.mutation_candidates);
+            }
+
+            validation_pool.insert(
+                validation_pool.end(), round_mutations.begin(), round_mutations.end());
+            mutation_seeds.swap(round_mutations);
+        }
+    }
+
+    if (options.enable_skeleton_search) {
+        std::vector<std::pair<int, int>> skeleton_edges =
+            (options.mutate_full_topology || qbit_num <= 3)
+                ? complete_topology_pairs(qbit_num)
+                : mutation_edges;
+        std::vector<CompressionCandidate> skeleton_candidates =
+            generate_cnot_skeleton_candidates(
+                qbit_num, static_cast<int>(removable_paths.size()),
+                skeleton_edges, options);
+        validation_pool.insert(
+            validation_pool.end(), skeleton_candidates.begin(), skeleton_candidates.end());
+    }
+
+    sort_unique_candidates(validation_pool, true);
+    int validation_pool_limit = options.beam_width;
+    if (options.enable_mutations) {
+        validation_pool_limit += options.mutation_candidates;
+    }
+    if (options.enable_skeleton_search) {
+        validation_pool_limit += options.skeleton_max_candidates;
+    }
+    validation_pool_limit = std::max(validation_pool_limit, options.beam_width);
+    if (static_cast<int>(validation_pool.size()) > validation_pool_limit) {
+        validation_pool.resize(validation_pool_limit);
+    }
+    bool has_root_candidate = false;
+    for (size_t idx = 0; idx < validation_pool.size(); ++idx) {
+        if (compression_candidate_key(validation_pool[idx]) == root.key) {
+            has_root_candidate = true;
+            break;
+        }
+    }
+    if (!has_root_candidate) {
+        validation_pool.push_back(root);
+    }
+
+    N_Qubit_Decomposition_OSR_Compression_Result result;
+    result.original_entangling_gate_num = static_cast<int>(removable_paths.size());
+
+    if (!options.validate_final) {
+        result.gate_structure.reset(
+            clone_gate_structure_for_candidate(gate_structure_in, removable_paths, best));
+        result.osr_score = best.score;
+        for (size_t idx = 0; idx < best.removed_ids.size(); ++idx) {
+            result.removed_gate_paths.push_back(removable_paths[best.removed_ids[idx]]);
+        }
+        result.compressed_entangling_gate_num = best.entangling_gate_num;
+        return result;
+    }
+
+    bool selected_validated_candidate = false;
+    N_Qubit_Decomposition_OSR_Compression_Result best_validated_result;
+
+    best_validated_result.gate_structure.reset(
+        clone_gate_structure_for_candidate(gate_structure_in, removable_paths, root));
+    best_validated_result.osr_score = root.score;
+    best_validated_result.original_entangling_gate_num = static_cast<int>(removable_paths.size());
+    best_validated_result.compressed_entangling_gate_num = root.entangling_gate_num;
+    validate_compressed_gate_structure(
+        best_validated_result.gate_structure.get(), root.initial_parameters, best_validated_result);
+    selected_validated_candidate = true;
+
+    for (size_t idx = 0; idx < validation_pool.size(); ++idx) {
+        const CompressionCandidate& candidate = validation_pool[idx];
+        if (compression_candidate_key(candidate) == root.key) {
+            continue;
+        }
+        N_Qubit_Decomposition_OSR_Compression_Result candidate_result;
+        candidate_result.gate_structure.reset(
+            clone_gate_structure_for_candidate(gate_structure_in, removable_paths, candidate));
+        candidate_result.osr_score = candidate.score;
+        candidate_result.original_entangling_gate_num = static_cast<int>(removable_paths.size());
+        candidate_result.compressed_entangling_gate_num = candidate.entangling_gate_num;
+        for (size_t removed_idx = 0; removed_idx < candidate.removed_ids.size(); ++removed_idx) {
+            candidate_result.removed_gate_paths.push_back(removable_paths[candidate.removed_ids[removed_idx]]);
+        }
+
+        validate_compressed_gate_structure(
+            candidate_result.gate_structure.get(), candidate.initial_parameters, candidate_result);
+
+        if ((candidate_result.reached_tolerance && !best_validated_result.reached_tolerance) ||
+            (candidate_result.reached_tolerance && best_validated_result.reached_tolerance &&
+             candidate_result.compressed_entangling_gate_num < best_validated_result.compressed_entangling_gate_num) ||
+            (candidate_result.reached_tolerance && best_validated_result.reached_tolerance &&
+             candidate_result.compressed_entangling_gate_num == best_validated_result.compressed_entangling_gate_num &&
+             candidate_result.current_minimum < best_validated_result.current_minimum) ||
+            (!candidate_result.reached_tolerance && !best_validated_result.reached_tolerance &&
+             candidate_result.current_minimum < best_validated_result.current_minimum)) {
+            best_validated_result = std::move(candidate_result);
+        }
+
+        if (best_validated_result.reached_tolerance &&
+            best_validated_result.compressed_entangling_gate_num == validation_pool.front().entangling_gate_num) {
+            break;
+        }
+    }
+
+    if (selected_validated_candidate) {
+        return best_validated_result;
+    }
+
+    result.gate_structure.reset(
+        clone_gate_structure_for_candidate(gate_structure_in, removable_paths, best));
+    result.osr_score = best.score;
+    for (size_t idx = 0; idx < best.removed_ids.size(); ++idx) {
+        result.removed_gate_paths.push_back(removable_paths[best.removed_ids[idx]]);
+    }
+    result.compressed_entangling_gate_num = best.entangling_gate_num;
+    return result;
+}
diff --git a/squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h b/squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h
new file mode 100644
index 000000000..80936330a
--- /dev/null
+++ b/squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h
@@ -0,0 +1,182 @@
+/*
+Created on Sat May 02 2026
+Copyright 2026
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+/*! \file N_Qubit_Decomposition_OSR_Compression.h
+    \brief OSR-guided top-down compression for an existing gate structure.
+*/
+
+#ifndef N_Qubit_Decomposition_OSR_Compression_H
+#define N_Qubit_Decomposition_OSR_Compression_H
+
+#include "Gates_block.h"
+#include "N_Qubit_Decomposition_custom.h"
+#include "N_Qubit_Decomposition_Tree_Search.h"
+#include "config_element.h"
+#include "matrix.h"
+#include "matrix_real.h"
+
+#include <map>
+#include <memory>
+#include <vector>
+
+/**
+@brief Path to a gate inside a possibly nested Gates_block.
+
+Each entry is an index inside the corresponding block. For example, path
+{3, 2} means the third top-level gate is a block and its second child gate is
+the selected gate.
+*/
+struct OSRGatePath {
+    std::vector<int> indices;
+
+    bool operator<(const OSRGatePath& other) const {
+        return indices < other.indices;
+    }
+};
+
+/**
+@brief Tunable controls for OSR-guided compression.
+*/
+struct N_Qubit_Decomposition_OSR_Compression_Options {
+    /// Number of candidates kept after each deletion depth.
+    int beam_width = 8;
+    /// Maximal number of entangling gates to remove. Negative means no limit.
+    int max_removed_gates = -1;
+    /// Keep OSR candidates whose estimated remaining CNOT count is at most this value.
+    int osr_bound_limit = 0;
+    /// Number of full Hilbert-Schmidt validation trials for final candidates.
+    int validation_trials = 3;
+    /// If true, run full optimization on final candidates before returning.
+    bool validate_final = true;
+    /// OSR numerical rank tolerance.
+    double osr_tolerance = 1e-3;
+    /// If true, augment deletion candidates with local circuit mutations before validation.
+    bool enable_mutations = true;
+    /// Number of local mutation rounds applied after the deletion beam.
+    int mutation_rounds = 1;
+    /// Maximal number of mutation candidates generated from each round.
+    int mutation_candidates = 32;
+    /// If true, rewiring mutations may use all qubit pairs instead of only observed edges.
+    bool mutate_full_topology = false;
+    /// If true, validate freshly synthesized U3+CNOT skeletons at compressed depths.
+    bool enable_skeleton_search = true;
+    /// Exact CNOT skeleton depth to test. Negative derives the target from max_removed_gates.
+    int skeleton_target_cnots = -1;
+    /// Maximal number of synthesized skeletons admitted to final validation.
+    int skeleton_max_candidates = 4096;
+};
+
+/**
+@brief OSR score of one compressed candidate.
+*/
+struct N_Qubit_Decomposition_OSR_Compression_Score {
+    /// Lower-bound estimate of remaining CNOTs required by the residual OSR.
+    int min_remaining_cnots = 0;
+    /// Secondary bound-solver objective used as tie-breaker.
+    double kappa = 0.0;
+    /// Aggregate OSR residual tie-breaker.
+    double residual = 0.0;
+    /// Best per-topology-edge CNOT-count composition found by the bound solver.
+    std::vector<int> edge_counts;
+    /// Per-cut OSR rank/loss pairs used to derive the bound.
+    std::vector<std::pair<int, double>> cut_bounds;
+};
+
+/**
+@brief Result of OSR-guided compression.
+*/
+struct N_Qubit_Decomposition_OSR_Compression_Result {
+    /// Newly allocated compressed gate structure. The input gate structure is not modified.
+    std::unique_ptr<Gates_block> gate_structure;
+    /// Parameters from the best validation run, if validation was enabled and run.
+    Matrix_real optimized_parameters;
+    /// Best final cost from validation, or infinity when validation was not run.
+    double current_minimum;
+    /// OSR score of the returned structure.
+    N_Qubit_Decomposition_OSR_Compression_Score osr_score;
+    /// Removed entangling-gate paths from the original gate structure.
+    std::vector<OSRGatePath> removed_gate_paths;
+    /// Number of entangling gates found in the input structure.
+    int original_entangling_gate_num;
+    /// Number of entangling gates left in the returned structure.
+    int compressed_entangling_gate_num;
+    /// Whether final Hilbert-Schmidt validation was run.
+    bool validated;
+    /// Whether the final validation reached optimization_tolerance.
+    bool reached_tolerance;
+
+    double decomposition_error;
+
+    N_Qubit_Decomposition_OSR_Compression_Result();
+    N_Qubit_Decomposition_OSR_Compression_Result(const N_Qubit_Decomposition_OSR_Compression_Result&) = delete;
+    N_Qubit_Decomposition_OSR_Compression_Result& operator=(const N_Qubit_Decomposition_OSR_Compression_Result&) = delete;
+    N_Qubit_Decomposition_OSR_Compression_Result(N_Qubit_Decomposition_OSR_Compression_Result&&) = default;
+    N_Qubit_Decomposition_OSR_Compression_Result& operator=(N_Qubit_Decomposition_OSR_Compression_Result&&) = default;
+};
+
+/**
+@brief Decomposition class that compresses an already supplied gate structure with OSR guidance.
+
+The class assumes the starting circuit is already present in the inherited
+Gates_block, typically through set_custom_gate_structure(...). It then searches
+top-down by deleting entangling gates, uses OSR to keep promising compressed
+candidates, and finally validates the chosen structure with the standard
+optimization cost.
+*/
+class N_Qubit_Decomposition_OSR_Compression : public N_Qubit_Decomposition_custom {
+
+public:
+    N_Qubit_Decomposition_OSR_Compression();
+    N_Qubit_Decomposition_OSR_Compression(Matrix Umtx_in, int qbit_num_in,
+                                          std::map<std::string, Config_Element>& config,
+                                          int accelerator_num = 0);
+    N_Qubit_Decomposition_OSR_Compression(Matrix Umtx_in, int qbit_num_in,
+                                          std::vector<matrix_base<int>> topology_in,
+                                          std::map<std::string, Config_Element>& config,
+                                          int accelerator_num = 0);
+    virtual ~N_Qubit_Decomposition_OSR_Compression();
+
+    /// Externally supplied hardware topology. Empty means infer from the gate structure.
+    std::vector<matrix_base<int>> topology;
+
+    virtual void start_decomposition();
+
+    /**
+    @brief Compress the supplied gate structure without modifying it.
+    */
+    N_Qubit_Decomposition_OSR_Compression_Result compress_gate_structure(
+        Gates_block* gate_structure_in);
+
+    N_Qubit_Decomposition_OSR_Compression_Options get_osr_compression_options();
+
+protected:
+    N_Qubit_Decomposition_OSR_Compression_Score evaluate_gate_structure_osr(
+        Gates_block* gate_structure_in,
+        const Matrix_real& initial_parameters,
+        MinCnotBoundSolver& osr_bound_solver,
+        std::vector<std::vector<int>>& all_cuts);
+
+    N_Qubit_Decomposition_custom prepare_custom_optimizer(
+        Gates_block* gate_structure_in,
+        cost_function_type cost_function_variant);
+
+    void validate_compressed_gate_structure(
+        Gates_block* gate_structure_in,
+        const Matrix_real& initial_parameters,
+        N_Qubit_Decomposition_OSR_Compression_Result& result);
+};
+
+#endif

From 66c98636e14ef0a0abfe010d0cd3f691a530b55f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 21:54:57 +0200
Subject: [PATCH 187/232] missing init

---
 squander/__init__.py         |  2 +-
 squander/synthesis/PartAM.py | 28 ++++++++++------------------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/squander/__init__.py b/squander/__init__.py
index 87755ccc5..7e5002c33 100644
--- a/squander/__init__.py
+++ b/squander/__init__.py
@@ -7,6 +7,7 @@
     qgd_N_Qubit_Decomposition_custom as N_Qubit_Decomposition_custom,
     qgd_N_Qubit_Decomposition_Tree_Search as N_Qubit_Decomposition_Tree_Search,
     qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
+    qgd_N_Qubit_Decomposition_OSR_Compression as N_Qubit_Decomposition_OSR_Compression,
 )
 
 # State preparation (depends on adaptive decomposition)
@@ -73,4 +74,3 @@
 from squander.nn.qgd_nn import qgd_nn as NN
 
 
-
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ca7008d62..6705979e4 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1726,22 +1726,18 @@ def Partition_Aware_Mapping(
             routing_elapsed_before_cleanup = time.time() - routing_start
 
             if do_cleanup:
-                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
-                    qgd_Wide_Circuit_Optimization,
+                from squander.decomposition.qgd_WideCircuitCompression import (
+                    qgd_WideCircuitCompression,
                 )
 
                 cleanup_config = dict(self.config)
                 cleanup_config['topology'] = self.topology
-                cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
-                cleanup_config['global_min'] = True
-                cleanup_config['use_osr'] = 0
-                cleanup_config['use_graph_search'] = 0
                 cleanup_config['part_size_end'] = 3
                 cleanup_config['max_partition_size'] = 3
 
-                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+                wco = qgd_WideCircuitCompression(cleanup_config)
 
                 saved_sq_circuits = self._snapshot_single_qubit_circuits(
                     optimized_partitions
@@ -1795,7 +1791,7 @@ def Partition_Aware_Mapping(
                     )
 
                     cleanup_t0 = time.time()
-                    cleaned_circuit, cleaned_params = wco.OptimizeWideCircuit(
+                    cleaned_circuit, cleaned_params = wco.CompressWideCircuit(
                         trial_circuit.get_Flat_Circuit(),
                         trial_params,
                     )
@@ -1815,14 +1811,12 @@ def Partition_Aware_Mapping(
                         best_partition_body_cnot = trial_partition_cnot
 
                 final_cleanup_config = dict(cleanup_config)
-                final_cleanup_config['use_osr'] = 1
-                final_cleanup_config['use_graph_search'] = 1
                 final_cleanup_config['part_size_end'] = 4
 
-                wco = qgd_Wide_Circuit_Optimization(final_cleanup_config)
+                wco = qgd_WideCircuitCompression(final_cleanup_config)
 
                 cleanup_t0 = time.time()
-                final_circuit, final_parameters = wco.OptimizeWideCircuit(
+                final_circuit, final_parameters = wco.CompressWideCircuit(
                     best_circuit.get_Flat_Circuit(),
                     best_params,
                 )
@@ -1878,19 +1872,17 @@ def Partition_Aware_Mapping(
             )
 
             if self.config.get('cleanup', True):
-                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
-                    qgd_Wide_Circuit_Optimization,
+                from squander.decomposition.qgd_WideCircuitCompression import (
+                    qgd_WideCircuitCompression,
                 )
 
                 cleanup_config = dict(self.config)
                 cleanup_config['topology'] = self.topology
-                cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
-                cleanup_config['global_min'] = True
-                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
+                wco = qgd_WideCircuitCompression(cleanup_config)
 
-                final_circuit, final_parameters = wco.OptimizeWideCircuit(
+                final_circuit, final_parameters = wco.CompressWideCircuit(
                     final_circuit.get_Flat_Circuit(), final_parameters
                 )
 

From a0028b64e84827bc77975a15462909886a921d23 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 2 May 2026 23:01:14 +0200
Subject: [PATCH 188/232] fix topology

---
 .../N_Qubit_Decomposition_OSR_Compression.cpp | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp b/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
index ea2505447..3f9835e86 100644
--- a/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
+++ b/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
@@ -1393,12 +1393,14 @@ N_Qubit_Decomposition_OSR_Compression::compress_gate_structure(
         return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
     });
 
-    std::vector<matrix_base<int>> active_topology = !this->topology.empty()
+    bool topology_user_set = !this->topology.empty();
+    std::vector<matrix_base<int>> active_topology = topology_user_set
         ? this->topology
         : topology_from_gate_structure(gate_structure_in, qbit_num);
-    std::vector<std::pair<int, int>> mutation_edges = options.mutate_full_topology
-        ? complete_topology_pairs(qbit_num)
-        : topology_pairs_from_matrices(active_topology);
+    std::vector<std::pair<int, int>> mutation_edges =
+        (options.mutate_full_topology && !topology_user_set)
+            ? complete_topology_pairs(qbit_num)
+            : topology_pairs_from_matrices(active_topology);
     MinCnotBoundSolver osr_bound_solver(qbit_num, all_cuts, active_topology);
 
     CompressionCandidate root;
@@ -1517,10 +1519,11 @@ N_Qubit_Decomposition_OSR_Compression::compress_gate_structure(
     }
 
     if (options.enable_skeleton_search) {
-        std::vector<std::pair<int, int>> skeleton_edges =
-            (options.mutate_full_topology || qbit_num <= 3)
-                ? complete_topology_pairs(qbit_num)
-                : mutation_edges;
+        std::vector<std::pair<int, int>> skeleton_edges = topology_user_set
+            ? topology_pairs_from_matrices(active_topology)
+            : ((options.mutate_full_topology || qbit_num <= 3)
+                   ? complete_topology_pairs(qbit_num)
+                   : mutation_edges);
         std::vector<CompressionCandidate> skeleton_candidates =
             generate_cnot_skeleton_candidates(
                 qbit_num, static_cast<int>(removable_paths.size()),

From 762a283e69d88162a9f0a6cbcf35957c8a65a901 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 3 May 2026 00:43:53 +0200
Subject: [PATCH 189/232] Revert "fix topology"

This reverts commit a0028b64e84827bc77975a15462909886a921d23.
---
 .../N_Qubit_Decomposition_OSR_Compression.cpp | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp b/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
index 3f9835e86..ea2505447 100644
--- a/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
+++ b/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
@@ -1393,14 +1393,12 @@ N_Qubit_Decomposition_OSR_Compression::compress_gate_structure(
         return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
     });
 
-    bool topology_user_set = !this->topology.empty();
-    std::vector<matrix_base<int>> active_topology = topology_user_set
+    std::vector<matrix_base<int>> active_topology = !this->topology.empty()
         ? this->topology
         : topology_from_gate_structure(gate_structure_in, qbit_num);
-    std::vector<std::pair<int, int>> mutation_edges =
-        (options.mutate_full_topology && !topology_user_set)
-            ? complete_topology_pairs(qbit_num)
-            : topology_pairs_from_matrices(active_topology);
+    std::vector<std::pair<int, int>> mutation_edges = options.mutate_full_topology
+        ? complete_topology_pairs(qbit_num)
+        : topology_pairs_from_matrices(active_topology);
     MinCnotBoundSolver osr_bound_solver(qbit_num, all_cuts, active_topology);
 
     CompressionCandidate root;
@@ -1519,11 +1517,10 @@ N_Qubit_Decomposition_OSR_Compression::compress_gate_structure(
     }
 
     if (options.enable_skeleton_search) {
-        std::vector<std::pair<int, int>> skeleton_edges = topology_user_set
-            ? topology_pairs_from_matrices(active_topology)
-            : ((options.mutate_full_topology || qbit_num <= 3)
-                   ? complete_topology_pairs(qbit_num)
-                   : mutation_edges);
+        std::vector<std::pair<int, int>> skeleton_edges =
+            (options.mutate_full_topology || qbit_num <= 3)
+                ? complete_topology_pairs(qbit_num)
+                : mutation_edges;
         std::vector<CompressionCandidate> skeleton_candidates =
             generate_cnot_skeleton_candidates(
                 qbit_num, static_cast<int>(removable_paths.size()),

From 29f3d1ee3353162c3e9ad91d431aa7e42ff56d08 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 3 May 2026 00:43:59 +0200
Subject: [PATCH 190/232] Revert "missing init"

This reverts commit 66c98636e14ef0a0abfe010d0cd3f691a530b55f.
---
 squander/__init__.py         |  2 +-
 squander/synthesis/PartAM.py | 28 ++++++++++++++++++----------
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/squander/__init__.py b/squander/__init__.py
index 7e5002c33..87755ccc5 100644
--- a/squander/__init__.py
+++ b/squander/__init__.py
@@ -7,7 +7,6 @@
     qgd_N_Qubit_Decomposition_custom as N_Qubit_Decomposition_custom,
     qgd_N_Qubit_Decomposition_Tree_Search as N_Qubit_Decomposition_Tree_Search,
     qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search,
-    qgd_N_Qubit_Decomposition_OSR_Compression as N_Qubit_Decomposition_OSR_Compression,
 )
 
 # State preparation (depends on adaptive decomposition)
@@ -74,3 +73,4 @@
 from squander.nn.qgd_nn import qgd_nn as NN
 
 
+
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6705979e4..ca7008d62 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1726,18 +1726,22 @@ def Partition_Aware_Mapping(
             routing_elapsed_before_cleanup = time.time() - routing_start
 
             if do_cleanup:
-                from squander.decomposition.qgd_WideCircuitCompression import (
-                    qgd_WideCircuitCompression,
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+                    qgd_Wide_Circuit_Optimization,
                 )
 
                 cleanup_config = dict(self.config)
                 cleanup_config['topology'] = self.topology
+                cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
+                cleanup_config['global_min'] = True
+                cleanup_config['use_osr'] = 0
+                cleanup_config['use_graph_search'] = 0
                 cleanup_config['part_size_end'] = 3
                 cleanup_config['max_partition_size'] = 3
 
-                wco = qgd_WideCircuitCompression(cleanup_config)
+                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
                 saved_sq_circuits = self._snapshot_single_qubit_circuits(
                     optimized_partitions
@@ -1791,7 +1795,7 @@ def Partition_Aware_Mapping(
                     )
 
                     cleanup_t0 = time.time()
-                    cleaned_circuit, cleaned_params = wco.CompressWideCircuit(
+                    cleaned_circuit, cleaned_params = wco.OptimizeWideCircuit(
                         trial_circuit.get_Flat_Circuit(),
                         trial_params,
                     )
@@ -1811,12 +1815,14 @@ def Partition_Aware_Mapping(
                         best_partition_body_cnot = trial_partition_cnot
 
                 final_cleanup_config = dict(cleanup_config)
+                final_cleanup_config['use_osr'] = 1
+                final_cleanup_config['use_graph_search'] = 1
                 final_cleanup_config['part_size_end'] = 4
 
-                wco = qgd_WideCircuitCompression(final_cleanup_config)
+                wco = qgd_Wide_Circuit_Optimization(final_cleanup_config)
 
                 cleanup_t0 = time.time()
-                final_circuit, final_parameters = wco.CompressWideCircuit(
+                final_circuit, final_parameters = wco.OptimizeWideCircuit(
                     best_circuit.get_Flat_Circuit(),
                     best_params,
                 )
@@ -1872,17 +1878,19 @@ def Partition_Aware_Mapping(
             )
 
             if self.config.get('cleanup', True):
-                from squander.decomposition.qgd_WideCircuitCompression import (
-                    qgd_WideCircuitCompression,
+                from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+                    qgd_Wide_Circuit_Optimization,
                 )
 
                 cleanup_config = dict(self.config)
                 cleanup_config['topology'] = self.topology
+                cleanup_config['routed'] = True
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
-                wco = qgd_WideCircuitCompression(cleanup_config)
+                cleanup_config['global_min'] = True
+                wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
-                final_circuit, final_parameters = wco.CompressWideCircuit(
+                final_circuit, final_parameters = wco.OptimizeWideCircuit(
                     final_circuit.get_Flat_Circuit(), final_parameters
                 )
 

From d6b522febdd2d3d2942ec912ea03fdf7f67c070d Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 3 May 2026 00:44:04 +0200
Subject: [PATCH 191/232] Revert "Add in OSR compression"

This reverts commit 7fb938062d184f9102bfb7947ebd360fde0973cb.
---
 CMakeLists.txt                                |    1 -
 .../qgd_N_Qubit_Decompositions_Wrapper.cpp    |   51 +-
 .../qgd_WideCircuitCompression.py             |  267 ---
 .../N_Qubit_Decomposition_OSR_Compression.cpp | 1628 -----------------
 .../N_Qubit_Decomposition_OSR_Compression.h   |  182 --
 5 files changed, 1 insertion(+), 2128 deletions(-)
 delete mode 100644 squander/decomposition/qgd_WideCircuitCompression.py
 delete mode 100644 squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
 delete mode 100644 squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6df853985..839eeea81 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -833,7 +833,6 @@ list(APPEND qgd_files
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/Sub_Matrix_Decomposition.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tree_Search.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/N_Qubit_Decomposition_Tabu_Search.cpp
-    ${PROJECT_SOURCE_DIR}/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/variational_quantum_eigensolver/Variational_Quantum_Eigensolver_Base.cpp  
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/variational_quantum_eigensolver/Generative_Quantum_Machine_Learning_Base.cpp  
     ${PROJECT_SOURCE_DIR}/squander/src-cpp/density_matrix/density_matrix.cpp
diff --git a/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp b/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
index 950137356..2e36cc3f7 100644
--- a/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
+++ b/squander/decomposition/qgd_N_Qubit_Decompositions_Wrapper.cpp
@@ -26,7 +26,6 @@
 #include "N_Qubit_Decomposition_adaptive.h"
 #include "N_Qubit_Decomposition_Tree_Search.h"
 #include "N_Qubit_Decomposition_Tabu_Search.h"
-#include "N_Qubit_Decomposition_OSR_Compression.h"
 #include "Gates_block.h"
 
 /**
@@ -321,43 +320,6 @@ qgd_N_Qubit_Decomposition_Tabu_Search_Wrapper_init(qgd_N_Qubit_Decomposition_Wra
     return search_wrapper_init<N_Qubit_Decomposition_Tabu_Search>(self, args, kwds);
 }
 
-static int
-qgd_N_Qubit_Decomposition_OSR_Compression_Wrapper_init(qgd_N_Qubit_Decomposition_Wrapper* self, PyObject* args, PyObject* kwds)
-{
-    static char* kwlist[] = {
-        (char*)"Umtx", (char*)"qbit_num", (char*)"config", (char*)"accelerator_num",
-        (char*)"topology", NULL
-    };
-
-    PyObject *Umtx_arg = NULL, *config_arg = NULL, *topology_arg = NULL;
-    int qbit_num = -1, accelerator_num = 0;
-
-    if (!PyArg_ParseTupleAndKeywords(
-        args, kwds, "O|iOiO", kwlist,
-        &Umtx_arg, &qbit_num, &config_arg, &accelerator_num, &topology_arg)
-    ) {
-        return -1;
-    }
-
-    try {
-        Matrix Umtx_mtx = extract_matrix(Umtx_arg, &self->Umtx);
-        if (qbit_num == -1) {
-            qbit_num = (int)std::round(std::log2(Umtx_mtx.rows));
-        }
-
-        auto config = extract_config(config_arg);
-        auto topology_cpp = extract_topology(topology_arg);
-
-        self->decomp = new N_Qubit_Decomposition_OSR_Compression(
-            Umtx_mtx, qbit_num, topology_cpp, config, accelerator_num);
-
-        return 0;
-    } catch (const std::exception& e) {
-        PyErr_SetString(PyExc_Exception, e.what());
-        return -1;
-    }
-}
-
 /**
  * @brief Deallocate decomposition instance
  */
@@ -3101,14 +3063,6 @@ static PyMethodDef qgd_N_Qubit_Decomposition_Tree_Search_methods[] = {
     {NULL}
 };
 
-/**
-@brief Method table for N_Qubit_Decomposition_OSR_Compression
-*/
-static PyMethodDef qgd_N_Qubit_Decomposition_OSR_Compression_methods[] = {
-    DECOMPOSITION_WRAPPER_BASE_METHODS
-    {NULL}
-};
-
 #define decomposition_wrapper_type_template(decomp_class) \
 static PyTypeObject qgd_##decomp_class##_Wrapper_Type = { \
     PyVarObject_HEAD_INIT(NULL, 0) \
@@ -3167,7 +3121,6 @@ decomposition_wrapper_type_template(N_Qubit_Decomposition_adaptive)
 decomposition_wrapper_type_template(N_Qubit_Decomposition_custom)
 decomposition_wrapper_type_template(N_Qubit_Decomposition_Tree_Search)
 decomposition_wrapper_type_template(N_Qubit_Decomposition_Tabu_Search)
-decomposition_wrapper_type_template(N_Qubit_Decomposition_OSR_Compression)
 
 //////////////////////////////////////////////////////////////////
 
@@ -3209,8 +3162,7 @@ PyInit_qgd_N_Qubit_Decompositions_Wrapper(void)
         PyType_Ready(&qgd_N_Qubit_Decomposition_adaptive_Wrapper_Type) < 0 ||
         PyType_Ready(&qgd_N_Qubit_Decomposition_custom_Wrapper_Type) < 0 ||
         PyType_Ready(&qgd_N_Qubit_Decomposition_Tree_Search_Wrapper_Type) < 0 ||
-        PyType_Ready(&qgd_N_Qubit_Decomposition_Tabu_Search_Wrapper_Type) < 0 ||
-        PyType_Ready(&qgd_N_Qubit_Decomposition_OSR_Compression_Wrapper_Type) < 0) {
+        PyType_Ready(&qgd_N_Qubit_Decomposition_Tabu_Search_Wrapper_Type) < 0) {
         return NULL;
     }
 
@@ -3223,7 +3175,6 @@ PyInit_qgd_N_Qubit_Decompositions_Wrapper(void)
     Py_INCREF_template(N_Qubit_Decomposition_custom);
     Py_INCREF_template(N_Qubit_Decomposition_Tree_Search);
     Py_INCREF_template(N_Qubit_Decomposition_Tabu_Search);
-    Py_INCREF_template(N_Qubit_Decomposition_OSR_Compression);
 
     return m;
 }
diff --git a/squander/decomposition/qgd_WideCircuitCompression.py b/squander/decomposition/qgd_WideCircuitCompression.py
deleted file mode 100644
index ec7e32578..000000000
--- a/squander/decomposition/qgd_WideCircuitCompression.py
+++ /dev/null
@@ -1,267 +0,0 @@
-"""
-Wide-circuit compression: partition large circuits into subcircuits and run
-OSR-guided gate-structure compression on each partition.
-"""
-
-from squander import N_Qubit_Decomposition_OSR_Compression
-from squander.gates.qgd_Circuit import qgd_Circuit as Circuit
-from squander.utils import CompareCircuits
-
-from squander.partitioning.partition import PartitionCircuit
-from squander.decomposition.qgd_Wide_Circuit_Optimization import (
-    CNOT_COUNT_DICT,
-    CNOTGateCount,
-    extract_subtopology,
-    qgd_Wide_Circuit_Optimization,
-)
-
-import numpy as np
-import multiprocessing as mp
-from multiprocessing import Pool, parent_process
-import os, contextlib, time
-
-from typing import List, Tuple, Optional, cast
-
-
-class qgd_WideCircuitCompression:
-    """Optimize wide circuits by partitioning and per-partition OSR compression.
-
-    Each partition is treated as a fixed gate structure. The OSR compression
-    decomposer attempts to remove entangling gates while still reproducing the
-    partition's unitary within the configured tolerance. If compression fails,
-    the original partition is kept.
-    """
-
-    def __init__(self, config):
-        config.setdefault("parallel", 0)
-        config.setdefault("verbosity", 0)
-        config.setdefault("tolerance", 1e-8)
-        config.setdefault("test_subcircuits", False)
-        config.setdefault("test_final_circuit", True)
-        config.setdefault("max_partition_size", 3)
-        config.setdefault("partition_strategy", "ilp")
-        config.setdefault("topology", None)
-
-        if config["parallel"] not in (0, 1, 2):
-            raise Exception(
-                f"The parallel configuration should be either of [0, 1, 2], got {config['parallel']}."
-            )
-        if not isinstance(config["verbosity"], int):
-            raise Exception("The verbosity parameter should be an integer.")
-        if not isinstance(config["tolerance"], float):
-            raise Exception("The tolerance parameter should be a float.")
-        if not isinstance(config["test_subcircuits"], bool):
-            raise Exception("The test_subcircuits parameter should be a bool.")
-        if not isinstance(config["test_final_circuit"], bool):
-            raise Exception("The test_final_circuit parameter should be a bool.")
-        if not isinstance(config["max_partition_size"], int):
-            raise Exception("The max_partition_size parameter should be an integer.")
-
-        self.config = config
-        self.max_partition_size = config["max_partition_size"]
-
-    @staticmethod
-    def CompressPartition(
-        subcircuit: Circuit,
-        subcircuit_parameters: np.ndarray,
-        config: dict,
-    ) -> Tuple[Circuit, np.ndarray]:
-        """Run OSR compression on a single partition subcircuit.
-
-        Returns the compressed circuit (remapped to the original wide register)
-        and its parameters. Falls back to the original subcircuit on failure.
-        """
-        qbit_num_orig = subcircuit.get_Qbit_Num()
-        involved = subcircuit.get_Qbits()
-        qbit_num = len(involved)
-
-        qbit_map = {q: i for i, q in enumerate(involved)}
-        remapped = subcircuit.Remap_Qbits(qbit_map, qbit_num)
-
-        # restrict OSR mutations to topology edges that survive the partition
-        local_config = dict(config)
-        if config.get("topology") is not None:
-            mini_topology = extract_subtopology(involved, qbit_map, config)
-            local_config.setdefault("osr_compression_mutate_full_topology", 0)
-        else:
-            mini_topology = None
-
-        # the partition unitary is the OSR target
-        unitary = remapped.get_Matrix(subcircuit_parameters)
-
-        cDecompose = N_Qubit_Decomposition_OSR_Compression(
-            unitary.conj().T,
-            qbit_num=qbit_num,
-            config=local_config,
-            accelerator_num=0,
-            topology=mini_topology,
-        )
-        cDecompose.set_Verbose(config["verbosity"])
-        cDecompose.set_Cost_Function_Variant(3)
-        cDecompose.set_Optimization_Tolerance(config["tolerance"])
-        cDecompose.set_Optimizer("BFGS")
-
-        # supply the existing structure + warm-start parameters
-        cDecompose.set_Gate_Structure(remapped)
-        cDecompose.set_Optimized_Parameters(subcircuit_parameters)
-
-        try:
-            cDecompose.Start_Decomposition()
-        except Exception:
-            return subcircuit, subcircuit_parameters
-
-        new_circ = cDecompose.get_Circuit()
-        new_params = cDecompose.get_Optimized_Parameters()
-        err = cDecompose.get_Decomposition_Error()
-
-        if err > config["tolerance"]:
-            return subcircuit, subcircuit_parameters
-
-        inverse_map = {v: k for k, v in qbit_map.items()}
-        new_circ = new_circ.Remap_Qbits(inverse_map, qbit_num_orig).get_Flat_Circuit()
-
-        if config["test_subcircuits"]:
-            CompareCircuits(
-                subcircuit,
-                subcircuit_parameters,
-                new_circ,
-                new_params,
-                parallel=config["parallel"],
-            )
-
-        return new_circ, new_params
-
-    def InnerCompressWideCircuit(
-        self, circ: Circuit, parameters: np.ndarray
-    ) -> Tuple[Circuit, np.ndarray]:
-        """Single pass: partition ``circ``, OSR-compress each partition, stitch."""
-        from squander.utils import circuit_to_CNOT_basis
-
-        circ, parameters = circuit_to_CNOT_basis(circ, parameters)
-
-        partitioned_circuit, parameters, _ = PartitionCircuit(
-            circ,
-            parameters,
-            self.max_partition_size,
-            strategy=self.config["partition_strategy"],
-        )
-        subcircuits = partitioned_circuit.get_Gates()
-
-        in_parent = parent_process() is not None
-        if not in_parent and self.config["verbosity"] >= 1:
-            print(len(subcircuits), "partitions to compress")
-
-        optimized_subcircuits: List[Optional[Circuit]] = [None] * len(subcircuits)
-        optimized_parameter_list: List[Optional[np.ndarray]] = [None] * len(subcircuits)
-
-        max_gates = sum(
-            y for x, y in circ.get_Gate_Nums().items() if x not in CNOT_COUNT_DICT
-        )
-
-        slices = []
-        for sub in subcircuits:
-            start = sub.get_Parameter_Start_Index()
-            slices.append(parameters[start : start + sub.get_Parameter_Num()])
-
-        nproc = (
-            len(os.sched_getaffinity(0))
-            if hasattr(os, "sched_getaffinity")
-            else mp.cpu_count()
-        )
-        with (
-            contextlib.nullcontext() if in_parent else Pool(processes=nproc)
-        ) as pool:
-            async_results = []
-            for idx, sub in enumerate(subcircuits):
-                args = (sub, slices[idx], self.config)
-                if in_parent:
-                    async_results.append(args)
-                else:
-                    async_results.append(pool.apply_async(self.CompressPartition, args))
-
-            for idx, ar in enumerate(async_results):
-                if in_parent:
-                    new_sub, new_p = self.CompressPartition(*ar)
-                else:
-                    new_sub, new_p = ar.get(timeout=None)
-
-                orig_score = CNOTGateCount(subcircuits[idx], max_gates)
-                new_score = CNOTGateCount(new_sub, max_gates)
-                if new_score < orig_score:
-                    optimized_subcircuits[idx] = new_sub
-                    optimized_parameter_list[idx] = new_p
-                    if self.config["verbosity"] >= 2:
-                        print(
-                            f"partition {idx}: {subcircuits[idx].get_Gate_Nums()} -> {new_sub.get_Gate_Nums()}"
-                        )
-                else:
-                    optimized_subcircuits[idx] = subcircuits[idx]
-                    optimized_parameter_list[idx] = slices[idx]
-
-                if self.config["verbosity"] >= 1 and (idx + 1) % 100 == 0:
-                    print(idx + 1, "partitions compressed")
-
-        wide_parameters = np.concatenate(
-            cast(List[np.ndarray], optimized_parameter_list), axis=0
-        )
-        wide_circuit = Circuit(circ.get_Qbit_Num())
-        for c in cast(List[Circuit], optimized_subcircuits):
-            wide_circuit.add_Circuit(c)
-
-        assert wide_circuit.get_Parameter_Num() == wide_parameters.size, (
-            f"Mismatch in parameter counts: "
-            f"{wide_circuit.get_Parameter_Num()} vs {wide_parameters.size}"
-        )
-
-        if not in_parent and self.config["verbosity"] >= 1:
-            print("original circuit:   ", circ.get_Gate_Nums())
-            print("compressed circuit: ", wide_circuit.get_Gate_Nums())
-
-        qgd_Wide_Circuit_Optimization.check_valid_routing(
-            wide_circuit, self.config["topology"]
-        )
-        if self.config["verbosity"] >= 2:
-            print("InnerCompressWideCircuit: check_compare_circuits")
-        if self.config["test_final_circuit"]:
-            CompareCircuits(circ, parameters, wide_circuit, wide_parameters)
-
-        return wide_circuit, wide_parameters
-
-    def CompressWideCircuit(
-        self, circ: Circuit, parameters: np.ndarray
-    ) -> Tuple[Circuit, np.ndarray]:
-        """Top-level: sweep partition sizes, repeat each pass until no improvement.
-
-        Mirrors the outer loop of ``qgd_Wide_Circuit_Optimization.OptimizeWideCircuit``
-        for the Squander branch. Records ``self.config['compression_time']``.
-        Requires the input circuit to already respect ``config['topology']``;
-        no routing is performed.
-        """
-        if not qgd_Wide_Circuit_Optimization.is_valid_routing(
-            circ, self.config["topology"]
-        ):
-            raise Exception(
-                "Input circuit does not respect the configured topology; "
-                "qgd_WideCircuitCompression does not perform routing."
-            )
-
-        start_time = time.time()
-        part_size_start = self.max_partition_size
-        part_size_end = self.config.get("part_size_end", self.max_partition_size)
-
-        count = CNOTGateCount(circ, 0)
-        for max_part_size in range(part_size_start, part_size_end + 1):
-            inner = qgd_WideCircuitCompression(
-                {**self.config, "max_partition_size": max_part_size}
-            )
-            while True:
-                circ_flat, parameters = inner.InnerCompressWideCircuit(circ, parameters)
-                circ = circ_flat.get_Flat_Circuit()
-                newcount = CNOTGateCount(circ, 0)
-                no_improve = newcount >= count
-                count = newcount
-                if no_improve:
-                    break
-
-        self.config["compression_time"] = time.time() - start_time
-        return circ, parameters
diff --git a/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp b/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
deleted file mode 100644
index ea2505447..000000000
--- a/squander/src-cpp/decomposition/N_Qubit_Decomposition_OSR_Compression.cpp
+++ /dev/null
@@ -1,1628 +0,0 @@
-/*
-Created on Sat May 02 2026
-Copyright 2026
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-/*! \file N_Qubit_Decomposition_OSR_Compression.cpp
-    \brief OSR-guided top-down compression for an existing gate structure.
-*/
-
-#include "N_Qubit_Decomposition_OSR_Compression.h"
-#include "N_Qubit_Decomposition_Cost_Function.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <cstring>
-#include <limits>
-#include <memory>
-#include <numeric>
-#include <random>
-#include <set>
-#include <sstream>
-#include <utility>
-
-namespace {
-
-struct CompressionCandidate {
-    std::vector<int> removed_ids;
-    Matrix_real initial_parameters;
-    N_Qubit_Decomposition_OSR_Compression_Score score;
-    int entangling_gate_num;
-    std::shared_ptr<Gates_block> gate_structure;
-    std::string key;
-};
-
-static bool is_entangling_gate_type(gate_type type) {
-    switch (type) {
-    case CNOT_OPERATION:
-    case CZ_OPERATION:
-    case CH_OPERATION:
-    case SYC_OPERATION:
-    case CRY_OPERATION:
-    case CRX_OPERATION:
-    case CRZ_OPERATION:
-    case CP_OPERATION:
-    case CR_OPERATION:
-    case CROT_OPERATION:
-    case CZ_NU_OPERATION:
-    case CU_OPERATION:
-    case ADAPTIVE_OPERATION:
-    case RXX_OPERATION:
-    case RYY_OPERATION:
-    case RZZ_OPERATION:
-    case SWAP_OPERATION:
-    case CSWAP_OPERATION:
-    case CCX_OPERATION:
-        return true;
-    default:
-        return false;
-    }
-}
-
-static bool is_entangling_gate(Gate* gate) {
-    if (gate == NULL || gate->get_type() == BLOCK_OPERATION) {
-        return false;
-    }
-    if (is_entangling_gate_type(gate->get_type())) {
-        return true;
-    }
-    return gate->get_involved_qubits().size() > 1;
-}
-
-static void collect_entangling_gate_paths(Gates_block* block,
-                                          std::vector<int>& prefix,
-                                          std::vector<OSRGatePath>& out) {
-    if (block == NULL) {
-        return;
-    }
-
-    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
-        Gate* gate = block->get_gate(idx);
-        prefix.push_back(idx);
-
-        if (is_entangling_gate(gate)) {
-            OSRGatePath path;
-            path.indices = prefix;
-            out.push_back(path);
-        }
-
-        if (gate != NULL && gate->get_type() == BLOCK_OPERATION) {
-            collect_entangling_gate_paths(static_cast<Gates_block*>(gate), prefix, out);
-        }
-
-        prefix.pop_back();
-    }
-}
-
-static std::vector<OSRGatePath> collect_entangling_gate_paths(Gates_block* block) {
-    std::vector<OSRGatePath> ret;
-    std::vector<int> prefix;
-    collect_entangling_gate_paths(block, prefix, ret);
-    return ret;
-}
-
-static void append_int_vector_signature(std::stringstream& sstream,
-                                        const std::vector<int>& values) {
-    sstream << "[";
-    for (size_t idx = 0; idx < values.size(); ++idx) {
-        if (idx > 0) {
-            sstream << ",";
-        }
-        sstream << values[idx];
-    }
-    sstream << "]";
-}
-
-static void append_gate_structure_signature(Gates_block* block,
-                                            std::stringstream& sstream) {
-    if (block == NULL) {
-        sstream << "NULL";
-        return;
-    }
-
-    sstream << "B" << block->get_gate_num() << "(";
-    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
-        Gate* gate = block->get_gate(idx);
-        if (gate == NULL) {
-            sstream << "NULL;";
-            continue;
-        }
-
-        sstream << static_cast<int>(gate->get_type()) << ":T";
-        std::vector<int> targets = gate->get_target_qbits();
-        if (targets.empty() && gate->get_target_qbit() >= 0) {
-            targets.push_back(gate->get_target_qbit());
-        }
-        append_int_vector_signature(sstream, targets);
-
-        sstream << ":C";
-        std::vector<int> controls = gate->get_control_qbits();
-        if (controls.empty() && gate->get_control_qbit() >= 0) {
-            controls.push_back(gate->get_control_qbit());
-        }
-        append_int_vector_signature(sstream, controls);
-
-        sstream << ":P" << gate->get_parameter_num();
-        if (gate->get_type() == BLOCK_OPERATION) {
-            sstream << "{";
-            append_gate_structure_signature(static_cast<Gates_block*>(gate), sstream);
-            sstream << "}";
-        }
-        sstream << ";";
-    }
-    sstream << ")";
-}
-
-static std::string gate_structure_signature(Gates_block* block) {
-    std::stringstream sstream;
-    append_gate_structure_signature(block, sstream);
-    return sstream.str();
-}
-
-static Gate* gate_at_path(Gates_block* block, const OSRGatePath& path) {
-    Gates_block* current_block = block;
-    for (size_t depth = 0; depth < path.indices.size(); ++depth) {
-        if (current_block == NULL) {
-            return NULL;
-        }
-
-        int gate_idx = path.indices[depth];
-        if (gate_idx < 0 || gate_idx >= current_block->get_gate_num()) {
-            return NULL;
-        }
-
-        Gate* gate = current_block->get_gate(gate_idx);
-        if (depth == path.indices.size() - 1) {
-            return gate;
-        }
-        if (gate == NULL || gate->get_type() != BLOCK_OPERATION) {
-            return NULL;
-        }
-        current_block = static_cast<Gates_block*>(gate);
-    }
-    return NULL;
-}
-
-static bool get_two_qubit_endpoint_pair(Gate* gate, int& q0, int& q1) {
-    if (gate == NULL) {
-        return false;
-    }
-
-    std::vector<int> involved = gate->get_involved_qubits();
-    std::sort(involved.begin(), involved.end());
-    involved.erase(std::unique(involved.begin(), involved.end()), involved.end());
-    if (involved.size() != 2) {
-        return false;
-    }
-
-    q0 = involved[0];
-    q1 = involved[1];
-    return true;
-}
-
-static bool gate_endpoint_sets_are_disjoint(Gate* lhs, Gate* rhs) {
-    int lhs_q0 = 0;
-    int lhs_q1 = 0;
-    int rhs_q0 = 0;
-    int rhs_q1 = 0;
-    if (!get_two_qubit_endpoint_pair(lhs, lhs_q0, lhs_q1) ||
-        !get_two_qubit_endpoint_pair(rhs, rhs_q0, rhs_q1)) {
-        return false;
-    }
-
-    return lhs_q0 != rhs_q0 && lhs_q0 != rhs_q1 &&
-           lhs_q1 != rhs_q0 && lhs_q1 != rhs_q1;
-}
-
-static bool gate_type_is_directional(gate_type type) {
-    switch (type) {
-    case CZ_OPERATION:
-    case SWAP_OPERATION:
-    case RXX_OPERATION:
-    case RYY_OPERATION:
-    case RZZ_OPERATION:
-        return false;
-    default:
-        return true;
-    }
-}
-
-static bool rewire_two_qubit_gate(Gate* gate, int new_target, int new_control) {
-    if (gate == NULL || new_target == new_control) {
-        return false;
-    }
-
-    int old_q0 = 0;
-    int old_q1 = 0;
-    if (!get_two_qubit_endpoint_pair(gate, old_q0, old_q1)) {
-        return false;
-    }
-
-    std::vector<int> controls = gate->get_control_qbits();
-    if (!controls.empty() || gate->get_control_qbit() >= 0) {
-        gate->set_target_qbit(new_target);
-        gate->set_control_qbit(new_control);
-        return true;
-    }
-
-    std::vector<int> targets = gate->get_target_qbits();
-    if (targets.size() >= 2) {
-        std::vector<int> new_targets;
-        new_targets.push_back(new_target);
-        new_targets.push_back(new_control);
-        gate->set_target_qbits(new_targets);
-        return true;
-    }
-
-    return false;
-}
-
-static bool path_has_prefix(const OSRGatePath& path, const std::vector<int>& prefix) {
-    if (path.indices.size() < prefix.size()) {
-        return false;
-    }
-    return std::equal(prefix.begin(), prefix.end(), path.indices.begin());
-}
-
-static bool path_equals_prefix(const OSRGatePath& path, const std::vector<int>& prefix) {
-    return path.indices.size() == prefix.size() && path_has_prefix(path, prefix);
-}
-
-static bool subtree_contains_removed_path(const std::set<OSRGatePath>& removed_paths,
-                                          const std::vector<int>& prefix) {
-    for (std::set<OSRGatePath>::const_iterator it = removed_paths.begin(); it != removed_paths.end(); ++it) {
-        if (path_has_prefix(*it, prefix)) {
-            return true;
-        }
-    }
-    return false;
-}
-
-static Gates_block* clone_without_removed_paths(Gates_block* block,
-                                                const std::set<OSRGatePath>& removed_paths,
-                                                std::vector<int>& prefix) {
-    Gates_block* ret = new Gates_block(block->get_qbit_num());
-
-    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
-        Gate* gate = block->get_gate(idx);
-        prefix.push_back(idx);
-
-        bool remove_gate = false;
-        for (std::set<OSRGatePath>::const_iterator it = removed_paths.begin(); it != removed_paths.end(); ++it) {
-            if (path_equals_prefix(*it, prefix)) {
-                remove_gate = true;
-                break;
-            }
-        }
-
-        if (!remove_gate) {
-            if (gate->get_type() == BLOCK_OPERATION && subtree_contains_removed_path(removed_paths, prefix)) {
-                Gates_block* cloned_block = clone_without_removed_paths(
-                    static_cast<Gates_block*>(gate), removed_paths, prefix);
-                ret->add_gate(cloned_block);
-            } else {
-                ret->add_gate(gate->clone());
-            }
-        }
-
-        prefix.pop_back();
-    }
-
-    return ret;
-}
-
-static Gates_block* clone_without_removed_paths(Gates_block* block,
-                                                const std::vector<OSRGatePath>& all_paths,
-                                                const std::vector<int>& removed_ids) {
-    std::set<OSRGatePath> removed_paths;
-    for (size_t idx = 0; idx < removed_ids.size(); ++idx) {
-        removed_paths.insert(all_paths[removed_ids[idx]]);
-    }
-
-    std::vector<int> prefix;
-    return clone_without_removed_paths(block, removed_paths, prefix);
-}
-
-static Gates_block* clone_with_rewired_gate_path(Gates_block* block,
-                                                 const OSRGatePath& path,
-                                                 int depth,
-                                                 int new_target,
-                                                 int new_control) {
-    Gates_block* ret = new Gates_block(block->get_qbit_num());
-
-    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
-        Gate* gate = block->get_gate(idx);
-        if (gate == NULL) {
-            continue;
-        }
-
-        if (depth < static_cast<int>(path.indices.size()) &&
-            idx == path.indices[depth]) {
-            if (depth == static_cast<int>(path.indices.size()) - 1) {
-                Gate* cloned_gate = gate->clone();
-                if (!rewire_two_qubit_gate(cloned_gate, new_target, new_control)) {
-                    delete cloned_gate;
-                    delete ret;
-                    return NULL;
-                }
-                ret->add_gate(cloned_gate);
-            } else {
-                if (gate->get_type() != BLOCK_OPERATION) {
-                    delete ret;
-                    return NULL;
-                }
-
-                Gates_block* rewired_block = clone_with_rewired_gate_path(
-                    static_cast<Gates_block*>(gate), path, depth + 1,
-                    new_target, new_control);
-                if (rewired_block == NULL) {
-                    delete ret;
-                    return NULL;
-                }
-                ret->add_gate(rewired_block);
-            }
-        } else {
-            ret->add_gate(gate->clone());
-        }
-    }
-
-    return ret;
-}
-
-static Gates_block* clone_with_rewired_gate_path(Gates_block* block,
-                                                 const OSRGatePath& path,
-                                                 int new_target,
-                                                 int new_control) {
-    return clone_with_rewired_gate_path(block, path, 0, new_target, new_control);
-}
-
-static Gates_block* clone_with_swapped_sibling_gates(Gates_block* block,
-                                                     const std::vector<int>& parent_path,
-                                                     int depth,
-                                                     int first_idx,
-                                                     int second_idx) {
-    Gates_block* ret = new Gates_block(block->get_qbit_num());
-
-    if (depth == static_cast<int>(parent_path.size())) {
-        for (int idx = 0; idx < block->get_gate_num(); ++idx) {
-            int source_idx = idx;
-            if (idx == first_idx) {
-                source_idx = second_idx;
-            } else if (idx == second_idx) {
-                source_idx = first_idx;
-            }
-            Gate* source_gate = block->get_gate(source_idx);
-            if (source_gate != NULL) {
-                ret->add_gate(source_gate->clone());
-            }
-        }
-        return ret;
-    }
-
-    int selected_idx = parent_path[depth];
-    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
-        Gate* gate = block->get_gate(idx);
-        if (gate == NULL) {
-            continue;
-        }
-
-        if (idx == selected_idx) {
-            if (gate->get_type() != BLOCK_OPERATION) {
-                delete ret;
-                return NULL;
-            }
-            Gates_block* swapped_block = clone_with_swapped_sibling_gates(
-                static_cast<Gates_block*>(gate), parent_path, depth + 1,
-                first_idx, second_idx);
-            if (swapped_block == NULL) {
-                delete ret;
-                return NULL;
-            }
-            ret->add_gate(swapped_block);
-        } else {
-            ret->add_gate(gate->clone());
-        }
-    }
-
-    return ret;
-}
-
-static Gates_block* clone_with_swapped_sibling_gates(Gates_block* block,
-                                                     const OSRGatePath& first_path,
-                                                     const OSRGatePath& second_path) {
-    if (first_path.indices.size() != second_path.indices.size() ||
-        first_path.indices.empty()) {
-        return NULL;
-    }
-
-    std::vector<int> first_parent(
-        first_path.indices.begin(), first_path.indices.end() - 1);
-    std::vector<int> second_parent(
-        second_path.indices.begin(), second_path.indices.end() - 1);
-    if (first_parent != second_parent) {
-        return NULL;
-    }
-
-    int first_idx = first_path.indices.back();
-    int second_idx = second_path.indices.back();
-    if (first_idx == second_idx) {
-        return NULL;
-    }
-    if (second_idx < first_idx) {
-        std::swap(first_idx, second_idx);
-    }
-
-    return clone_with_swapped_sibling_gates(
-        block, first_parent, 0, first_idx, second_idx);
-}
-
-static bool parameter_interval_for_path(Gates_block* block,
-                                        const OSRGatePath& path,
-                                        int depth,
-                                        int offset,
-                                        int& start,
-                                        int& length) {
-    if (block == NULL || depth >= static_cast<int>(path.indices.size())) {
-        return false;
-    }
-
-    int gate_idx = path.indices[depth];
-    Gate* gate = block->get_gate(gate_idx);
-    if (gate == NULL) {
-        return false;
-    }
-
-    int gate_offset = offset + gate->get_parameter_start_idx();
-    if (depth == static_cast<int>(path.indices.size()) - 1) {
-        start = gate_offset;
-        length = gate->get_parameter_num();
-        return true;
-    }
-
-    if (gate->get_type() != BLOCK_OPERATION) {
-        return false;
-    }
-
-    return parameter_interval_for_path(
-        static_cast<Gates_block*>(gate), path, depth + 1, gate_offset, start, length);
-}
-
-static Matrix_real reduced_parameters_without_paths(
-    Gates_block* original_gate_structure,
-    const std::vector<OSRGatePath>& removed_paths,
-    const Matrix_real& original_parameters) {
-    if (original_parameters.size() == 0 ||
-        original_parameters.size() != original_gate_structure->get_parameter_num()) {
-        return Matrix_real(0, 0);
-    }
-
-    std::vector<std::pair<int, int>> intervals;
-    intervals.reserve(removed_paths.size());
-    for (size_t idx = 0; idx < removed_paths.size(); ++idx) {
-        int start = 0;
-        int length = 0;
-        if (parameter_interval_for_path(
-                original_gate_structure, removed_paths[idx], 0, 0, start, length) &&
-            length > 0) {
-            intervals.push_back(std::make_pair(start, start + length));
-        }
-    }
-
-    if (intervals.empty()) {
-        return original_parameters.copy();
-    }
-
-    std::sort(intervals.begin(), intervals.end());
-    std::vector<std::pair<int, int>> merged;
-    for (size_t idx = 0; idx < intervals.size(); ++idx) {
-        if (merged.empty() || intervals[idx].first > merged.back().second) {
-            merged.push_back(intervals[idx]);
-        } else {
-            merged.back().second = std::max(merged.back().second, intervals[idx].second);
-        }
-    }
-
-    int removed_parameter_num = 0;
-    for (size_t idx = 0; idx < merged.size(); ++idx) {
-        removed_parameter_num += merged[idx].second - merged[idx].first;
-    }
-
-    Matrix_real reduced_parameters(1, original_parameters.size() - removed_parameter_num);
-    int src = 0;
-    int dst = 0;
-    for (size_t idx = 0; idx < merged.size(); ++idx) {
-        int keep_num = merged[idx].first - src;
-        if (keep_num > 0) {
-            std::memcpy(reduced_parameters.get_data() + dst,
-                        original_parameters.get_data() + src,
-                        keep_num * sizeof(double));
-            dst += keep_num;
-        }
-        src = merged[idx].second;
-    }
-
-    if (src < original_parameters.size()) {
-        int keep_num = original_parameters.size() - src;
-        std::memcpy(reduced_parameters.get_data() + dst,
-                    original_parameters.get_data() + src,
-                    keep_num * sizeof(double));
-    }
-
-    return reduced_parameters;
-}
-
-static Matrix_real reduced_parameters_without_removed_paths(
-    Gates_block* original_gate_structure,
-    const std::vector<OSRGatePath>& all_paths,
-    const std::vector<int>& removed_ids,
-    const Matrix_real& original_parameters) {
-    std::vector<OSRGatePath> removed_paths;
-    removed_paths.reserve(removed_ids.size());
-    for (size_t idx = 0; idx < removed_ids.size(); ++idx) {
-        removed_paths.push_back(all_paths[removed_ids[idx]]);
-    }
-    return reduced_parameters_without_paths(
-        original_gate_structure, removed_paths, original_parameters);
-}
-
-static void add_topology_edge(std::set<std::pair<int, int>>& edges, int q0, int q1) {
-    if (q0 == q1) {
-        return;
-    }
-    if (q1 < q0) {
-        std::swap(q0, q1);
-    }
-    edges.insert(std::make_pair(q0, q1));
-}
-
-static void collect_topology_edges(Gates_block* block, std::set<std::pair<int, int>>& edges) {
-    if (block == NULL) {
-        return;
-    }
-
-    for (int idx = 0; idx < block->get_gate_num(); ++idx) {
-        Gate* gate = block->get_gate(idx);
-        if (gate == NULL) {
-            continue;
-        }
-
-        if (gate->get_type() == BLOCK_OPERATION) {
-            collect_topology_edges(static_cast<Gates_block*>(gate), edges);
-            continue;
-        }
-
-        if (!is_entangling_gate(gate)) {
-            continue;
-        }
-
-        std::vector<int> involved = gate->get_involved_qubits();
-        for (size_t q0_idx = 0; q0_idx < involved.size(); ++q0_idx) {
-            for (size_t q1_idx = q0_idx + 1; q1_idx < involved.size(); ++q1_idx) {
-                add_topology_edge(edges, involved[q0_idx], involved[q1_idx]);
-            }
-        }
-    }
-}
-
-static std::vector<matrix_base<int>> topology_from_gate_structure(Gates_block* gate_structure, int qbit_num) {
-    std::set<std::pair<int, int>> edges;
-    collect_topology_edges(gate_structure, edges);
-
-    if (edges.empty() && qbit_num > 1) {
-        for (int q0 = 0; q0 < qbit_num; ++q0) {
-            for (int q1 = q0 + 1; q1 < qbit_num; ++q1) {
-                edges.insert(std::make_pair(q0, q1));
-            }
-        }
-    }
-
-    std::vector<matrix_base<int>> topology;
-    topology.reserve(edges.size());
-    for (std::set<std::pair<int, int>>::const_iterator it = edges.begin(); it != edges.end(); ++it) {
-        matrix_base<int> edge(2, 1);
-        edge[0] = it->first;
-        edge[1] = it->second;
-        topology.push_back(edge);
-    }
-    return topology;
-}
-
-static std::vector<std::pair<int, int>> topology_pairs_from_matrices(
-    const std::vector<matrix_base<int>>& topology) {
-    std::vector<std::pair<int, int>> pairs;
-    pairs.reserve(topology.size());
-    for (size_t idx = 0; idx < topology.size(); ++idx) {
-        int q0 = topology[idx][0];
-        int q1 = topology[idx][1];
-        if (q0 == q1) {
-            continue;
-        }
-        if (q1 < q0) {
-            std::swap(q0, q1);
-        }
-        std::pair<int, int> edge(q0, q1);
-        if (std::find(pairs.begin(), pairs.end(), edge) == pairs.end()) {
-            pairs.push_back(edge);
-        }
-    }
-    return pairs;
-}
-
-static std::vector<std::pair<int, int>> complete_topology_pairs(int qbit_num) {
-    std::vector<std::pair<int, int>> pairs;
-    for (int q0 = 0; q0 < qbit_num; ++q0) {
-        for (int q1 = q0 + 1; q1 < qbit_num; ++q1) {
-            pairs.push_back(std::make_pair(q0, q1));
-        }
-    }
-    return pairs;
-}
-
-static Gates_block* construct_cnot_skeleton_gate_structure(
-    int qbit_num,
-    const std::vector<std::pair<int, int>>& edges,
-    const std::vector<int>& sequence) {
-    Gates_block* gate_structure = new Gates_block(qbit_num);
-
-    for (size_t idx = 0; idx < sequence.size(); ++idx) {
-        int edge_idx = sequence[idx];
-        if (edge_idx < 0 || edge_idx >= static_cast<int>(edges.size())) {
-            delete gate_structure;
-            return NULL;
-        }
-
-        Gates_block* layer = new Gates_block(qbit_num);
-        int target = edges[edge_idx].first;
-        int control = edges[edge_idx].second;
-        layer->add_u3(target);
-        layer->add_u3(control);
-        layer->add_cnot(target, control);
-        gate_structure->add_gate(layer);
-    }
-
-    Gates_block* final_layer = new Gates_block(qbit_num);
-    for (int qbit = 0; qbit < qbit_num; ++qbit) {
-        final_layer->add_u3(qbit);
-    }
-    gate_structure->add_gate(final_layer);
-
-    return gate_structure;
-}
-
-static int64_t limited_integer_power(int base, int exponent, int64_t limit) {
-    int64_t value = 1;
-    for (int idx = 0; idx < exponent; ++idx) {
-        if (base <= 0 || value > limit / base) {
-            return limit + 1;
-        }
-        value *= base;
-    }
-    return value;
-}
-
-static std::vector<CompressionCandidate> generate_cnot_skeleton_candidates(
-    int qbit_num,
-    int original_entangling_gate_num,
-    const std::vector<std::pair<int, int>>& edges,
-    const N_Qubit_Decomposition_OSR_Compression_Options& options) {
-    std::vector<CompressionCandidate> candidates;
-    if (!options.enable_skeleton_search || edges.empty() ||
-        options.skeleton_max_candidates <= 0 || original_entangling_gate_num <= 0) {
-        return candidates;
-    }
-
-    int target_depth = options.skeleton_target_cnots;
-    if (target_depth < 0) {
-        int removed = options.max_removed_gates >= 0 ? options.max_removed_gates : 2;
-        target_depth = original_entangling_gate_num - removed;
-    }
-    if (target_depth < 0 || target_depth >= original_entangling_gate_num) {
-        return candidates;
-    }
-
-    int edge_num = static_cast<int>(edges.size());
-    int64_t combination_num = limited_integer_power(
-        edge_num, target_depth, static_cast<int64_t>(options.skeleton_max_candidates));
-
-    if (combination_num > options.skeleton_max_candidates) {
-        return candidates;
-    }
-
-    std::set<std::string> seen;
-    for (int64_t state = 0; state < combination_num; ++state) {
-        int64_t value = state;
-        std::vector<int> sequence(target_depth, 0);
-        for (int depth = target_depth - 1; depth >= 0; --depth) {
-            sequence[depth] = static_cast<int>(value % edge_num);
-            value /= edge_num;
-        }
-
-        std::shared_ptr<Gates_block> gate_structure(
-            construct_cnot_skeleton_gate_structure(qbit_num, edges, sequence));
-        if (!gate_structure) {
-            continue;
-        }
-
-        std::string key = gate_structure_signature(gate_structure.get());
-        if (!seen.insert(key).second) {
-            continue;
-        }
-
-        CompressionCandidate candidate;
-        candidate.entangling_gate_num = target_depth;
-        candidate.gate_structure = gate_structure;
-        candidate.key = key;
-        candidate.initial_parameters = Matrix_real(0, 0);
-        candidates.push_back(candidate);
-    }
-
-    return candidates;
-}
-
-static double residual_sum(const std::vector<std::pair<int, double>>& cut_bounds) {
-    return std::accumulate(cut_bounds.begin(), cut_bounds.end(), 0.0,
-        [&cut_bounds](double acc, const std::pair<int, double>& item) {
-            return acc + item.first * cut_bounds.size() + item.second;
-        });
-}
-
-static bool score_less(const N_Qubit_Decomposition_OSR_Compression_Score& lhs,
-                       const N_Qubit_Decomposition_OSR_Compression_Score& rhs) {
-    if (lhs.min_remaining_cnots != rhs.min_remaining_cnots) {
-        return lhs.min_remaining_cnots < rhs.min_remaining_cnots;
-    }
-    if (lhs.kappa != rhs.kappa) {
-        return lhs.kappa < rhs.kappa;
-    }
-    return lhs.residual < rhs.residual;
-}
-
-static bool beam_candidate_less(const CompressionCandidate& lhs,
-                                const CompressionCandidate& rhs) {
-    if (score_less(lhs.score, rhs.score)) {
-        return true;
-    }
-    if (score_less(rhs.score, lhs.score)) {
-        return false;
-    }
-    return lhs.key < rhs.key;
-}
-
-static bool final_candidate_less(const CompressionCandidate& lhs,
-                                 const CompressionCandidate& rhs) {
-    if (lhs.entangling_gate_num != rhs.entangling_gate_num) {
-        return lhs.entangling_gate_num < rhs.entangling_gate_num;
-    }
-    return beam_candidate_less(lhs, rhs);
-}
-
-static std::string compression_candidate_key(const CompressionCandidate& candidate) {
-    if (!candidate.key.empty()) {
-        return candidate.key;
-    }
-
-    std::stringstream sstream;
-    sstream << "removed:";
-    for (size_t idx = 0; idx < candidate.removed_ids.size(); ++idx) {
-        if (idx > 0) {
-            sstream << ",";
-        }
-        sstream << candidate.removed_ids[idx];
-    }
-    return sstream.str();
-}
-
-static void sort_unique_candidates(std::vector<CompressionCandidate>& candidates, bool final_sort) {
-    std::sort(candidates.begin(), candidates.end(),
-              final_sort ? final_candidate_less : beam_candidate_less);
-
-    std::set<std::string> seen;
-    std::vector<CompressionCandidate> unique_candidates;
-    unique_candidates.reserve(candidates.size());
-    for (size_t idx = 0; idx < candidates.size(); ++idx) {
-        std::string key = compression_candidate_key(candidates[idx]);
-        if (seen.insert(key).second) {
-            unique_candidates.push_back(candidates[idx]);
-        }
-    }
-    candidates.swap(unique_candidates);
-}
-
-static Gates_block* clone_gate_structure_for_candidate(
-    Gates_block* original_gate_structure,
-    const std::vector<OSRGatePath>& original_paths,
-    const CompressionCandidate& candidate) {
-    if (candidate.gate_structure) {
-        return candidate.gate_structure->clone();
-    }
-    return clone_without_removed_paths(
-        original_gate_structure, original_paths, candidate.removed_ids);
-}
-
-static bool edge_shares_endpoint(const std::pair<int, int>& edge, int q0, int q1) {
-    return edge.first == q0 || edge.first == q1 ||
-           edge.second == q0 || edge.second == q1;
-}
-
-static bool same_undirected_edge(const std::pair<int, int>& edge, int q0, int q1) {
-    int a = q0;
-    int b = q1;
-    if (b < a) {
-        std::swap(a, b);
-    }
-    return edge.first == a && edge.second == b;
-}
-
-static bool same_parent_and_adjacent(const OSRGatePath& lhs,
-                                     const OSRGatePath& rhs) {
-    if (lhs.indices.size() != rhs.indices.size() || lhs.indices.empty()) {
-        return false;
-    }
-
-    for (size_t idx = 0; idx + 1 < lhs.indices.size(); ++idx) {
-        if (lhs.indices[idx] != rhs.indices[idx]) {
-            return false;
-        }
-    }
-
-    return std::abs(lhs.indices.back() - rhs.indices.back()) == 1;
-}
-
-static void append_candidate_if_new(std::vector<CompressionCandidate>& out,
-                                    std::set<std::string>& seen,
-                                    CompressionCandidate& candidate) {
-    if (!candidate.gate_structure) {
-        return;
-    }
-
-    candidate.key = gate_structure_signature(candidate.gate_structure.get());
-    if (seen.insert(candidate.key).second) {
-        candidate.entangling_gate_num =
-            static_cast<int>(collect_entangling_gate_paths(candidate.gate_structure.get()).size());
-        out.push_back(candidate);
-    }
-}
-
-static std::vector<CompressionCandidate> generate_local_mutation_candidates(
-    Gates_block* base_structure,
-    const Matrix_real& base_parameters,
-    const CompressionCandidate& parent,
-    const std::vector<std::pair<int, int>>& mutation_edges,
-    const N_Qubit_Decomposition_OSR_Compression_Options& options) {
-    std::vector<CompressionCandidate> candidates;
-    if (!options.enable_mutations || options.mutation_candidates <= 0 ||
-        base_structure == NULL) {
-        return candidates;
-    }
-
-    std::vector<OSRGatePath> paths = collect_entangling_gate_paths(base_structure);
-    if (paths.empty()) {
-        return candidates;
-    }
-
-    std::set<std::string> seen;
-    seen.insert(gate_structure_signature(base_structure));
-
-    for (size_t idx = 0; idx + 1 < paths.size() &&
-                         static_cast<int>(candidates.size()) < options.mutation_candidates; ++idx) {
-        const OSRGatePath& lhs_path = paths[idx];
-        const OSRGatePath& rhs_path = paths[idx + 1];
-        if (!same_parent_and_adjacent(lhs_path, rhs_path)) {
-            continue;
-        }
-
-        Gate* lhs_gate = gate_at_path(base_structure, lhs_path);
-        Gate* rhs_gate = gate_at_path(base_structure, rhs_path);
-        if (!gate_endpoint_sets_are_disjoint(lhs_gate, rhs_gate)) {
-            continue;
-        }
-
-        std::shared_ptr<Gates_block> swapped(
-            clone_with_swapped_sibling_gates(base_structure, lhs_path, rhs_path));
-        if (!swapped) {
-            continue;
-        }
-
-        CompressionCandidate child = parent;
-        child.gate_structure = swapped;
-        if (base_parameters.size() == base_structure->get_parameter_num() &&
-            swapped->get_parameter_num() == base_parameters.size()) {
-            child.initial_parameters = base_parameters.copy();
-        } else {
-            child.initial_parameters = Matrix_real(0, 0);
-        }
-        append_candidate_if_new(candidates, seen, child);
-    }
-
-    for (size_t path_idx = 0; path_idx < paths.size() &&
-                              static_cast<int>(candidates.size()) < options.mutation_candidates; ++path_idx) {
-        Gate* gate = gate_at_path(base_structure, paths[path_idx]);
-        if (gate == NULL) {
-            continue;
-        }
-
-        int old_q0 = 0;
-        int old_q1 = 0;
-        if (!get_two_qubit_endpoint_pair(gate, old_q0, old_q1)) {
-            continue;
-        }
-
-        bool directional = gate_type_is_directional(gate->get_type()) &&
-            (!gate->get_control_qbits().empty() || gate->get_control_qbit() >= 0);
-
-        for (int pass = 0; pass < 2 &&
-                           static_cast<int>(candidates.size()) < options.mutation_candidates; ++pass) {
-            for (size_t edge_idx = 0; edge_idx < mutation_edges.size() &&
-                                      static_cast<int>(candidates.size()) < options.mutation_candidates; ++edge_idx) {
-                const std::pair<int, int>& edge = mutation_edges[edge_idx];
-                if (same_undirected_edge(edge, old_q0, old_q1)) {
-                    continue;
-                }
-
-                bool shares_endpoint = edge_shares_endpoint(edge, old_q0, old_q1);
-                if ((pass == 0 && !shares_endpoint) || (pass == 1 && shares_endpoint)) {
-                    continue;
-                }
-
-                int orientation_count = directional ? 2 : 1;
-                for (int orientation = 0;
-                     orientation < orientation_count &&
-                     static_cast<int>(candidates.size()) < options.mutation_candidates;
-                     ++orientation) {
-                    int new_target = (orientation == 0) ? edge.first : edge.second;
-                    int new_control = (orientation == 0) ? edge.second : edge.first;
-
-                    std::shared_ptr<Gates_block> rewired(
-                        clone_with_rewired_gate_path(
-                            base_structure, paths[path_idx], new_target, new_control));
-                    if (!rewired) {
-                        continue;
-                    }
-
-                    CompressionCandidate child = parent;
-                    child.gate_structure = rewired;
-                    if (base_parameters.size() == base_structure->get_parameter_num() &&
-                        rewired->get_parameter_num() == base_parameters.size()) {
-                        child.initial_parameters = base_parameters.copy();
-                    } else {
-                        child.initial_parameters = Matrix_real(0, 0);
-                    }
-                    append_candidate_if_new(candidates, seen, child);
-                }
-            }
-        }
-    }
-
-    return candidates;
-}
-
-static bool candidate_is_osr_admissible(const CompressionCandidate& candidate,
-                                        const N_Qubit_Decomposition_OSR_Compression_Options& options) {
-    return candidate.score.min_remaining_cnots <= options.osr_bound_limit;
-}
-
-} // namespace
-
-N_Qubit_Decomposition_OSR_Compression_Result::N_Qubit_Decomposition_OSR_Compression_Result()
-    : current_minimum(std::numeric_limits<double>::infinity()),
-      original_entangling_gate_num(0),
-      compressed_entangling_gate_num(0),
-      validated(false),
-      reached_tolerance(false),
-      decomposition_error(std::numeric_limits<double>::infinity()) {}
-
-N_Qubit_Decomposition_OSR_Compression::N_Qubit_Decomposition_OSR_Compression()
-    : N_Qubit_Decomposition_custom() {
-    name = "OSR_Compression";
-}
-
-N_Qubit_Decomposition_OSR_Compression::N_Qubit_Decomposition_OSR_Compression(
-    Matrix Umtx_in,
-    int qbit_num_in,
-    std::map<std::string, Config_Element>& config,
-    int accelerator_num)
-    : N_Qubit_Decomposition_custom(Umtx_in, qbit_num_in, false, config, RANDOM, accelerator_num) {
-    name = "OSR_Compression";
-}
-
-N_Qubit_Decomposition_OSR_Compression::N_Qubit_Decomposition_OSR_Compression(
-    Matrix Umtx_in,
-    int qbit_num_in,
-    std::vector<matrix_base<int>> topology_in,
-    std::map<std::string, Config_Element>& config,
-    int accelerator_num)
-    : N_Qubit_Decomposition_custom(Umtx_in, qbit_num_in, false, config, RANDOM, accelerator_num),
-      topology(std::move(topology_in)) {
-    name = "OSR_Compression";
-}
-
-N_Qubit_Decomposition_OSR_Compression::~N_Qubit_Decomposition_OSR_Compression() {}
-
-void N_Qubit_Decomposition_OSR_Compression::start_decomposition() {
-    std::unique_ptr<Gates_block> original_gate_structure(clone());
-    Matrix_real original_parameters = optimized_parameters_mtx.size() > 0
-        ? optimized_parameters_mtx.copy()
-        : Matrix_real(0, 0);
-
-    double optimization_tolerance_loc;
-    if (config.count("optimization_tolerance") > 0) {
-        config["optimization_tolerance"].get_property(optimization_tolerance_loc);
-    } else {
-        optimization_tolerance_loc = optimization_tolerance;
-    }
-
-    N_Qubit_Decomposition_OSR_Compression_Result result =
-        compress_gate_structure(original_gate_structure.get());
-
-    if (!result.reached_tolerance &&
-        original_parameters.size() == original_gate_structure->get_parameter_num()) {
-        double original_cost = optimization_problem(original_parameters);
-        if (original_cost < optimization_tolerance_loc ||
-            original_cost < result.current_minimum) {
-            result.gate_structure.reset(original_gate_structure->clone());
-            result.optimized_parameters = original_parameters.copy();
-            result.current_minimum = original_cost;
-            result.decomposition_error = original_cost;
-            result.compressed_entangling_gate_num =
-                result.original_entangling_gate_num;
-            result.removed_gate_paths.clear();
-            result.validated = true;
-            result.reached_tolerance = original_cost < optimization_tolerance_loc;
-        }
-    }
-
-    release_gates();
-    combine(result.gate_structure.get());
-
-    if (result.validated && result.optimized_parameters.size() == get_parameter_num()) {
-        optimized_parameters_mtx = result.optimized_parameters.copy();
-        current_minimum = result.current_minimum;
-        decomposition_error = result.decomposition_error;
-    } else {
-        N_Qubit_Decomposition_custom::start_decomposition();
-    }
-}
-
-N_Qubit_Decomposition_OSR_Compression_Options
-N_Qubit_Decomposition_OSR_Compression::get_osr_compression_options() {
-    N_Qubit_Decomposition_OSR_Compression_Options options;
-
-    long long int_value;
-    double double_value;
-
-    if (config.count("osr_compression_beam") > 0) {
-        config["osr_compression_beam"].get_property(int_value);
-        options.beam_width = static_cast<int>(int_value);
-    }
-    if (config.count("osr_compression_max_removed") > 0) {
-        config["osr_compression_max_removed"].get_property(int_value);
-        options.max_removed_gates = static_cast<int>(int_value);
-    }
-    if (config.count("osr_compression_bound_limit") > 0) {
-        config["osr_compression_bound_limit"].get_property(int_value);
-        options.osr_bound_limit = static_cast<int>(int_value);
-    }
-    if (config.count("osr_compression_validation_trials") > 0) {
-        config["osr_compression_validation_trials"].get_property(int_value);
-        options.validation_trials = static_cast<int>(int_value);
-    }
-    if (config.count("osr_compression_validate") > 0) {
-        config["osr_compression_validate"].get_property(int_value);
-        options.validate_final = int_value != 0;
-    }
-    if (config.count("osr_compression_osr_tolerance") > 0) {
-        config["osr_compression_osr_tolerance"].get_property(double_value);
-        options.osr_tolerance = double_value;
-    }
-    if (config.count("osr_compression_enable_mutations") > 0) {
-        config["osr_compression_enable_mutations"].get_property(int_value);
-        options.enable_mutations = int_value != 0;
-    }
-    if (config.count("osr_compression_mutation_rounds") > 0) {
-        config["osr_compression_mutation_rounds"].get_property(int_value);
-        options.mutation_rounds = static_cast<int>(int_value);
-    }
-    if (config.count("osr_compression_mutation_candidates") > 0) {
-        config["osr_compression_mutation_candidates"].get_property(int_value);
-        options.mutation_candidates = static_cast<int>(int_value);
-    }
-    if (config.count("osr_compression_mutate_full_topology") > 0) {
-        config["osr_compression_mutate_full_topology"].get_property(int_value);
-        options.mutate_full_topology = int_value != 0;
-    }
-    if (config.count("osr_compression_enable_skeleton_search") > 0) {
-        config["osr_compression_enable_skeleton_search"].get_property(int_value);
-        options.enable_skeleton_search = int_value != 0;
-    }
-    if (config.count("osr_compression_skeleton_target_cnots") > 0) {
-        config["osr_compression_skeleton_target_cnots"].get_property(int_value);
-        options.skeleton_target_cnots = static_cast<int>(int_value);
-    }
-    if (config.count("osr_compression_skeleton_max_candidates") > 0) {
-        config["osr_compression_skeleton_max_candidates"].get_property(int_value);
-        options.skeleton_max_candidates = static_cast<int>(int_value);
-    }
-
-    options.beam_width = std::max(options.beam_width, 1);
-    options.validation_trials = std::max(options.validation_trials, 1);
-    options.osr_bound_limit = std::max(options.osr_bound_limit, 0);
-    options.mutation_rounds = std::max(options.mutation_rounds, 0);
-    options.mutation_candidates = std::max(options.mutation_candidates, 0);
-    options.skeleton_max_candidates = std::max(options.skeleton_max_candidates, 0);
-
-    return options;
-}
-
-N_Qubit_Decomposition_custom
-N_Qubit_Decomposition_OSR_Compression::prepare_custom_optimizer(
-    Gates_block* gate_structure_in,
-    cost_function_type cost_function_variant) {
-    double optimization_tolerance_loc;
-    if (config.count("optimization_tolerance") > 0) {
-        config["optimization_tolerance"].get_property(optimization_tolerance_loc);
-    } else {
-        optimization_tolerance_loc = optimization_tolerance;
-    }
-
-    N_Qubit_Decomposition_custom cDecomp_custom_random =
-        N_Qubit_Decomposition_custom(Umtx.copy(), qbit_num, false, config, RANDOM, accelerator_num);
-    cDecomp_custom_random.set_custom_gate_structure(gate_structure_in);
-    cDecomp_custom_random.set_optimization_blocks(gate_structure_in->get_gate_num());
-    cDecomp_custom_random.set_max_iteration(max_outer_iterations);
-#ifndef __DFE__
-    cDecomp_custom_random.set_verbose(verbose);
-#else
-    cDecomp_custom_random.set_verbose(0);
-#endif
-    cDecomp_custom_random.set_cost_function_variant(cost_function_variant);
-    cDecomp_custom_random.set_debugfile("");
-    cDecomp_custom_random.set_optimization_tolerance(optimization_tolerance_loc);
-    cDecomp_custom_random.set_trace_offset(trace_offset);
-    cDecomp_custom_random.set_optimizer(alg);
-
-    if (alg == ADAM || alg == BFGS2) {
-        int max_inner_iterations_loc = 10000;
-        int param_num_loc = gate_structure_in->get_parameter_num();
-        max_inner_iterations_loc = static_cast<int>((double)param_num_loc / 852 * 10000000.0);
-        cDecomp_custom_random.set_max_inner_iterations(max_inner_iterations_loc);
-        cDecomp_custom_random.set_random_shift_count_max(5);
-    } else if (alg == ADAM_BATCHED) {
-        int max_inner_iterations_loc = 2000;
-        cDecomp_custom_random.set_max_inner_iterations(max_inner_iterations_loc);
-        cDecomp_custom_random.set_random_shift_count_max(5);
-    } else if (alg == BFGS) {
-        int max_inner_iterations_loc = 10000;
-        cDecomp_custom_random.set_max_inner_iterations(max_inner_iterations_loc);
-    }
-
-    return cDecomp_custom_random;
-}
-
-N_Qubit_Decomposition_OSR_Compression_Score
-N_Qubit_Decomposition_OSR_Compression::evaluate_gate_structure_osr(
-    Gates_block* gate_structure_in,
-    const Matrix_real& initial_parameters,
-    MinCnotBoundSolver& osr_bound_solver,
-    std::vector<std::vector<int>>& all_cuts) {
-    N_Qubit_Decomposition_OSR_Compression_Options options = get_osr_compression_options();
-    N_Qubit_Decomposition_OSR_Compression_Score best_score;
-    best_score.min_remaining_cnots = std::numeric_limits<int>::max();
-    best_score.kappa = std::numeric_limits<double>::infinity();
-    best_score.residual = std::numeric_limits<double>::infinity();
-
-    if (qbit_num <= 1 || all_cuts.empty()) {
-        best_score.min_remaining_cnots = 0;
-        best_score.kappa = 0.0;
-        best_score.residual = 0.0;
-        return best_score;
-    }
-
-    double Fnorm = std::sqrt(static_cast<double>(1 << qbit_num));
-    std::uniform_real_distribution<> distrib_real(0.0, 2 * M_PI);
-
-    N_Qubit_Decomposition_custom cDecomp_custom_random =
-        prepare_custom_optimizer(gate_structure_in, OSR_ENTANGLEMENT);
-    std::vector<double> optimized_parameters(cDecomp_custom_random.get_parameter_num());
-    if (initial_parameters.size() == cDecomp_custom_random.get_parameter_num()) {
-        std::copy(initial_parameters.get_data(),
-                  initial_parameters.get_data() + initial_parameters.size(),
-                  optimized_parameters.begin());
-    } else if (optimized_parameters_mtx.size() == cDecomp_custom_random.get_parameter_num()) {
-        std::copy(optimized_parameters_mtx.get_data(),
-                  optimized_parameters_mtx.get_data() + optimized_parameters_mtx.size(),
-                  optimized_parameters.begin());
-    } else {
-        for (size_t idx = 0; idx < optimized_parameters.size(); ++idx) {
-            optimized_parameters[idx] = distrib_real(gen);
-        }
-    }
-    if (!optimized_parameters.empty()) {
-        cDecomp_custom_random.set_optimized_parameters(
-            optimized_parameters.data(), static_cast<int>(optimized_parameters.size()));
-    }
-
-    for (const std::vector<int>& cut : all_cuts) {
-        if (cut.size() != 1) {
-            continue;
-        }
-
-        int cut_size = static_cast<int>(cut.size());
-        int max_rank = 2 * std::min(cut_size, qbit_num - cut_size);
-        max_rank = std::max(max_rank, 1);
-
-        for (int rank = max_rank - 1; rank >= 0; --rank) {
-            cDecomp_custom_random.set_osr_params({cut}, rank, false);
-            cDecomp_custom_random.start_decomposition();
-
-            Matrix U = Umtx.copy();
-            Matrix_real params = cDecomp_custom_random.get_optimized_parameters();
-            cDecomp_custom_random.apply_to(params, U);
-
-            std::vector<std::pair<int, double>> osr_result;
-            osr_result.reserve(all_cuts.size());
-            int newrank = rank;
-            for (const std::vector<int>& eval_cut : all_cuts) {
-                osr_result.emplace_back(
-                    operator_schmidt_rank(U, qbit_num, eval_cut, Fnorm, options.osr_tolerance));
-                if (cut == eval_cut) {
-                    newrank = osr_result.back().first;
-                }
-            }
-
-            double kappa = std::numeric_limits<double>::infinity();
-            std::vector<int> edge_counts;
-            int min_cnots = osr_bound_solver.solve_min_cnots(osr_result, kappa, edge_counts);
-
-            N_Qubit_Decomposition_OSR_Compression_Score score;
-            score.min_remaining_cnots = min_cnots;
-            score.kappa = kappa;
-            score.residual = residual_sum(osr_result);
-            score.edge_counts = edge_counts;
-            score.cut_bounds = osr_result;
-
-            if (score_less(score, best_score)) {
-                best_score = score;
-            }
-
-            if (newrank > rank) {
-                break;
-            }
-            rank = std::min(rank, newrank);
-        }
-    }
-
-    if (best_score.min_remaining_cnots == std::numeric_limits<int>::max()) {
-        std::vector<std::pair<int, double>> osr_result(all_cuts.size(), std::make_pair(0, 0.0));
-        double kappa = std::numeric_limits<double>::infinity();
-        std::vector<int> edge_counts;
-        best_score.min_remaining_cnots = osr_bound_solver.solve_min_cnots(osr_result, kappa, edge_counts);
-        best_score.kappa = kappa;
-        best_score.residual = 0.0;
-        best_score.edge_counts = edge_counts;
-        best_score.cut_bounds = osr_result;
-    }
-
-    return best_score;
-}
-
-void N_Qubit_Decomposition_OSR_Compression::validate_compressed_gate_structure(
-    Gates_block* gate_structure_in,
-    const Matrix_real& initial_parameters,
-    N_Qubit_Decomposition_OSR_Compression_Result& result) {
-    N_Qubit_Decomposition_OSR_Compression_Options options = get_osr_compression_options();
-
-    double optimization_tolerance_loc;
-    if (config.count("optimization_tolerance") > 0) {
-        config["optimization_tolerance"].get_property(optimization_tolerance_loc);
-    } else {
-        optimization_tolerance_loc = optimization_tolerance;
-    }
-
-    result.validated = true;
-    result.current_minimum = std::numeric_limits<double>::infinity();
-    result.decomposition_error = std::numeric_limits<double>::infinity();
-
-    std::uniform_real_distribution<> distrib_real(0.0, 2 * M_PI);
-    for (int iter = 0; iter < options.validation_trials; ++iter) {
-        N_Qubit_Decomposition_custom cDecomp_custom_random =
-            prepare_custom_optimizer(gate_structure_in, cost_fnc);
-
-        std::vector<double> optimized_parameters(cDecomp_custom_random.get_parameter_num());
-        if (iter == 0 && initial_parameters.size() == cDecomp_custom_random.get_parameter_num()) {
-            std::copy(initial_parameters.get_data(),
-                      initial_parameters.get_data() + initial_parameters.size(),
-                      optimized_parameters.begin());
-        } else if (iter == 0 && optimized_parameters_mtx.size() == cDecomp_custom_random.get_parameter_num()) {
-            std::copy(optimized_parameters_mtx.get_data(),
-                      optimized_parameters_mtx.get_data() + optimized_parameters_mtx.size(),
-                      optimized_parameters.begin());
-        } else {
-            for (size_t idx = 0; idx < optimized_parameters.size(); ++idx) {
-                optimized_parameters[idx] = distrib_real(gen);
-            }
-        }
-        if (!optimized_parameters.empty()) {
-            cDecomp_custom_random.set_optimized_parameters(
-                optimized_parameters.data(), static_cast<int>(optimized_parameters.size()));
-        }
-
-        cDecomp_custom_random.start_decomposition();
-        Matrix_real optimized_parameters_tmp = cDecomp_custom_random.get_optimized_parameters();
-        double current_minimum_tmp = cDecomp_custom_random.optimization_problem(optimized_parameters_tmp);
-        if (current_minimum_tmp < result.current_minimum) {
-            result.current_minimum = current_minimum_tmp;
-            result.optimized_parameters = optimized_parameters_tmp.copy();
-            result.decomposition_error = cDecomp_custom_random.get_decomposition_error();
-        }
-        if (current_minimum_tmp < optimization_tolerance_loc &&
-            cDecomp_custom_random.get_decomposition_error() < optimization_tolerance_loc) {
-            result.reached_tolerance = true;
-            break;
-        }
-    }
-}
-
-N_Qubit_Decomposition_OSR_Compression_Result
-N_Qubit_Decomposition_OSR_Compression::compress_gate_structure(
-    Gates_block* gate_structure_in) {
-    if (gate_structure_in == NULL) {
-        std::string err("N_Qubit_Decomposition_OSR_Compression::compress_gate_structure: gate_structure is null");
-        throw err;
-    }
-
-    N_Qubit_Decomposition_OSR_Compression_Options options = get_osr_compression_options();
-    std::vector<OSRGatePath> removable_paths = collect_entangling_gate_paths(gate_structure_in);
-
-    std::vector<std::vector<int>> all_cuts = unique_cuts(qbit_num);
-    std::sort(all_cuts.begin(), all_cuts.end(), [](const std::vector<int>& a, const std::vector<int>& b) {
-        if (a.size() != b.size()) {
-            return a.size() < b.size();
-        }
-        return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
-    });
-
-    std::vector<matrix_base<int>> active_topology = !this->topology.empty()
-        ? this->topology
-        : topology_from_gate_structure(gate_structure_in, qbit_num);
-    std::vector<std::pair<int, int>> mutation_edges = options.mutate_full_topology
-        ? complete_topology_pairs(qbit_num)
-        : topology_pairs_from_matrices(active_topology);
-    MinCnotBoundSolver osr_bound_solver(qbit_num, all_cuts, active_topology);
-
-    CompressionCandidate root;
-    root.entangling_gate_num = static_cast<int>(removable_paths.size());
-    root.key = gate_structure_signature(gate_structure_in);
-    if (optimized_parameters_mtx.size() == gate_structure_in->get_parameter_num()) {
-        root.initial_parameters = optimized_parameters_mtx.copy();
-    }
-    root.score = evaluate_gate_structure_osr(
-        gate_structure_in, root.initial_parameters, osr_bound_solver, all_cuts);
-
-    CompressionCandidate best = root;
-    std::vector<CompressionCandidate> beam(1, root);
-
-    int max_removed = options.max_removed_gates < 0
-        ? static_cast<int>(removable_paths.size())
-        : std::min(options.max_removed_gates, static_cast<int>(removable_paths.size()));
-
-    for (int depth = 1; depth <= max_removed; ++depth) {
-        std::vector<CompressionCandidate> next_candidates;
-
-        for (size_t beam_idx = 0; beam_idx < beam.size(); ++beam_idx) {
-            const CompressionCandidate& parent = beam[beam_idx];
-            int start_id = parent.removed_ids.empty() ? 0 : parent.removed_ids.back() + 1;
-
-            for (int remove_id = start_id; remove_id < static_cast<int>(removable_paths.size()); ++remove_id) {
-                CompressionCandidate child;
-                child.removed_ids = parent.removed_ids;
-                child.removed_ids.push_back(remove_id);
-                child.entangling_gate_num =
-                    static_cast<int>(removable_paths.size()) - static_cast<int>(child.removed_ids.size());
-
-                std::unique_ptr<Gates_block> candidate_gate_structure(
-                    clone_without_removed_paths(gate_structure_in, removable_paths, child.removed_ids));
-                child.initial_parameters = reduced_parameters_without_removed_paths(
-                    gate_structure_in, removable_paths, child.removed_ids, optimized_parameters_mtx);
-                child.key = gate_structure_signature(candidate_gate_structure.get());
-                child.score = evaluate_gate_structure_osr(
-                    candidate_gate_structure.get(), child.initial_parameters, osr_bound_solver, all_cuts);
-
-                if (candidate_is_osr_admissible(child, options)) {
-                    next_candidates.push_back(child);
-                }
-            }
-        }
-
-        if (next_candidates.empty()) {
-            break;
-        }
-
-        sort_unique_candidates(next_candidates, false);
-        if (static_cast<int>(next_candidates.size()) > options.beam_width) {
-            next_candidates.resize(options.beam_width);
-        }
-
-        for (size_t idx = 0; idx < next_candidates.size(); ++idx) {
-            if (candidate_is_osr_admissible(next_candidates[idx], options) &&
-                final_candidate_less(next_candidates[idx], best)) {
-                best = next_candidates[idx];
-            }
-        }
-
-        beam.swap(next_candidates);
-    }
-
-    std::vector<CompressionCandidate> validation_pool = beam;
-    validation_pool.push_back(root);
-    validation_pool.push_back(best);
-
-    if (options.enable_mutations && options.mutation_rounds > 0 &&
-        options.mutation_candidates > 0 && !mutation_edges.empty()) {
-        std::vector<CompressionCandidate> mutation_seeds = validation_pool;
-        sort_unique_candidates(mutation_seeds, true);
-        if (static_cast<int>(mutation_seeds.size()) > options.beam_width) {
-            mutation_seeds.resize(options.beam_width);
-        }
-
-        for (int round = 0; round < options.mutation_rounds; ++round) {
-            std::vector<CompressionCandidate> round_mutations;
-
-            for (size_t seed_idx = 0; seed_idx < mutation_seeds.size(); ++seed_idx) {
-                const CompressionCandidate& seed = mutation_seeds[seed_idx];
-                std::unique_ptr<Gates_block> seed_gate_structure(
-                    clone_gate_structure_for_candidate(
-                        gate_structure_in, removable_paths, seed));
-
-                std::vector<CompressionCandidate> local_mutations =
-                    generate_local_mutation_candidates(
-                        seed_gate_structure.get(), seed.initial_parameters, seed,
-                        mutation_edges, options);
-
-                for (size_t mut_idx = 0; mut_idx < local_mutations.size(); ++mut_idx) {
-                    CompressionCandidate& mutation = local_mutations[mut_idx];
-                    mutation.score = evaluate_gate_structure_osr(
-                        mutation.gate_structure.get(), mutation.initial_parameters,
-                        osr_bound_solver, all_cuts);
-                    if (candidate_is_osr_admissible(mutation, options)) {
-                        round_mutations.push_back(mutation);
-                    }
-                }
-            }
-
-            if (round_mutations.empty()) {
-                break;
-            }
-
-            sort_unique_candidates(round_mutations, false);
-            if (static_cast<int>(round_mutations.size()) > options.mutation_candidates) {
-                round_mutations.resize(options.mutation_candidates);
-            }
-
-            validation_pool.insert(
-                validation_pool.end(), round_mutations.begin(), round_mutations.end());
-            mutation_seeds.swap(round_mutations);
-        }
-    }
-
-    if (options.enable_skeleton_search) {
-        std::vector<std::pair<int, int>> skeleton_edges =
-            (options.mutate_full_topology || qbit_num <= 3)
-                ? complete_topology_pairs(qbit_num)
-                : mutation_edges;
-        std::vector<CompressionCandidate> skeleton_candidates =
-            generate_cnot_skeleton_candidates(
-                qbit_num, static_cast<int>(removable_paths.size()),
-                skeleton_edges, options);
-        validation_pool.insert(
-            validation_pool.end(), skeleton_candidates.begin(), skeleton_candidates.end());
-    }
-
-    sort_unique_candidates(validation_pool, true);
-    int validation_pool_limit = options.beam_width;
-    if (options.enable_mutations) {
-        validation_pool_limit += options.mutation_candidates;
-    }
-    if (options.enable_skeleton_search) {
-        validation_pool_limit += options.skeleton_max_candidates;
-    }
-    validation_pool_limit = std::max(validation_pool_limit, options.beam_width);
-    if (static_cast<int>(validation_pool.size()) > validation_pool_limit) {
-        validation_pool.resize(validation_pool_limit);
-    }
-    bool has_root_candidate = false;
-    for (size_t idx = 0; idx < validation_pool.size(); ++idx) {
-        if (compression_candidate_key(validation_pool[idx]) == root.key) {
-            has_root_candidate = true;
-            break;
-        }
-    }
-    if (!has_root_candidate) {
-        validation_pool.push_back(root);
-    }
-
-    N_Qubit_Decomposition_OSR_Compression_Result result;
-    result.original_entangling_gate_num = static_cast<int>(removable_paths.size());
-
-    if (!options.validate_final) {
-        result.gate_structure.reset(
-            clone_gate_structure_for_candidate(gate_structure_in, removable_paths, best));
-        result.osr_score = best.score;
-        for (size_t idx = 0; idx < best.removed_ids.size(); ++idx) {
-            result.removed_gate_paths.push_back(removable_paths[best.removed_ids[idx]]);
-        }
-        result.compressed_entangling_gate_num = best.entangling_gate_num;
-        return result;
-    }
-
-    bool selected_validated_candidate = false;
-    N_Qubit_Decomposition_OSR_Compression_Result best_validated_result;
-
-    best_validated_result.gate_structure.reset(
-        clone_gate_structure_for_candidate(gate_structure_in, removable_paths, root));
-    best_validated_result.osr_score = root.score;
-    best_validated_result.original_entangling_gate_num = static_cast<int>(removable_paths.size());
-    best_validated_result.compressed_entangling_gate_num = root.entangling_gate_num;
-    validate_compressed_gate_structure(
-        best_validated_result.gate_structure.get(), root.initial_parameters, best_validated_result);
-    selected_validated_candidate = true;
-
-    for (size_t idx = 0; idx < validation_pool.size(); ++idx) {
-        const CompressionCandidate& candidate = validation_pool[idx];
-        if (compression_candidate_key(candidate) == root.key) {
-            continue;
-        }
-        N_Qubit_Decomposition_OSR_Compression_Result candidate_result;
-        candidate_result.gate_structure.reset(
-            clone_gate_structure_for_candidate(gate_structure_in, removable_paths, candidate));
-        candidate_result.osr_score = candidate.score;
-        candidate_result.original_entangling_gate_num = static_cast<int>(removable_paths.size());
-        candidate_result.compressed_entangling_gate_num = candidate.entangling_gate_num;
-        for (size_t removed_idx = 0; removed_idx < candidate.removed_ids.size(); ++removed_idx) {
-            candidate_result.removed_gate_paths.push_back(removable_paths[candidate.removed_ids[removed_idx]]);
-        }
-
-        validate_compressed_gate_structure(
-            candidate_result.gate_structure.get(), candidate.initial_parameters, candidate_result);
-
-        if ((candidate_result.reached_tolerance && !best_validated_result.reached_tolerance) ||
-            (candidate_result.reached_tolerance && best_validated_result.reached_tolerance &&
-             candidate_result.compressed_entangling_gate_num < best_validated_result.compressed_entangling_gate_num) ||
-            (candidate_result.reached_tolerance && best_validated_result.reached_tolerance &&
-             candidate_result.compressed_entangling_gate_num == best_validated_result.compressed_entangling_gate_num &&
-             candidate_result.current_minimum < best_validated_result.current_minimum) ||
-            (!candidate_result.reached_tolerance && !best_validated_result.reached_tolerance &&
-             candidate_result.current_minimum < best_validated_result.current_minimum)) {
-            best_validated_result = std::move(candidate_result);
-        }
-
-        if (best_validated_result.reached_tolerance &&
-            best_validated_result.compressed_entangling_gate_num == validation_pool.front().entangling_gate_num) {
-            break;
-        }
-    }
-
-    if (selected_validated_candidate) {
-        return best_validated_result;
-    }
-
-    result.gate_structure.reset(
-        clone_gate_structure_for_candidate(gate_structure_in, removable_paths, best));
-    result.osr_score = best.score;
-    for (size_t idx = 0; idx < best.removed_ids.size(); ++idx) {
-        result.removed_gate_paths.push_back(removable_paths[best.removed_ids[idx]]);
-    }
-    result.compressed_entangling_gate_num = best.entangling_gate_num;
-    return result;
-}
diff --git a/squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h b/squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h
deleted file mode 100644
index 80936330a..000000000
--- a/squander/src-cpp/decomposition/include/N_Qubit_Decomposition_OSR_Compression.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
-Created on Sat May 02 2026
-Copyright 2026
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-/*! \file N_Qubit_Decomposition_OSR_Compression.h
-    \brief OSR-guided top-down compression for an existing gate structure.
-*/
-
-#ifndef N_Qubit_Decomposition_OSR_Compression_H
-#define N_Qubit_Decomposition_OSR_Compression_H
-
-#include "Gates_block.h"
-#include "N_Qubit_Decomposition_custom.h"
-#include "N_Qubit_Decomposition_Tree_Search.h"
-#include "config_element.h"
-#include "matrix.h"
-#include "matrix_real.h"
-
-#include <map>
-#include <memory>
-#include <vector>
-
-/**
-@brief Path to a gate inside a possibly nested Gates_block.
-
-Each entry is an index inside the corresponding block. For example, path
-{3, 2} means the third top-level gate is a block and its second child gate is
-the selected gate.
-*/
-struct OSRGatePath {
-    std::vector<int> indices;
-
-    bool operator<(const OSRGatePath& other) const {
-        return indices < other.indices;
-    }
-};
-
-/**
-@brief Tunable controls for OSR-guided compression.
-*/
-struct N_Qubit_Decomposition_OSR_Compression_Options {
-    /// Number of candidates kept after each deletion depth.
-    int beam_width = 8;
-    /// Maximal number of entangling gates to remove. Negative means no limit.
-    int max_removed_gates = -1;
-    /// Keep OSR candidates whose estimated remaining CNOT count is at most this value.
-    int osr_bound_limit = 0;
-    /// Number of full Hilbert-Schmidt validation trials for final candidates.
-    int validation_trials = 3;
-    /// If true, run full optimization on final candidates before returning.
-    bool validate_final = true;
-    /// OSR numerical rank tolerance.
-    double osr_tolerance = 1e-3;
-    /// If true, augment deletion candidates with local circuit mutations before validation.
-    bool enable_mutations = true;
-    /// Number of local mutation rounds applied after the deletion beam.
-    int mutation_rounds = 1;
-    /// Maximal number of mutation candidates generated from each round.
-    int mutation_candidates = 32;
-    /// If true, rewiring mutations may use all qubit pairs instead of only observed edges.
-    bool mutate_full_topology = false;
-    /// If true, validate freshly synthesized U3+CNOT skeletons at compressed depths.
-    bool enable_skeleton_search = true;
-    /// Exact CNOT skeleton depth to test. Negative derives the target from max_removed_gates.
-    int skeleton_target_cnots = -1;
-    /// Maximal number of synthesized skeletons admitted to final validation.
-    int skeleton_max_candidates = 4096;
-};
-
-/**
-@brief OSR score of one compressed candidate.
-*/
-struct N_Qubit_Decomposition_OSR_Compression_Score {
-    /// Lower-bound estimate of remaining CNOTs required by the residual OSR.
-    int min_remaining_cnots = 0;
-    /// Secondary bound-solver objective used as tie-breaker.
-    double kappa = 0.0;
-    /// Aggregate OSR residual tie-breaker.
-    double residual = 0.0;
-    /// Best per-topology-edge CNOT-count composition found by the bound solver.
-    std::vector<int> edge_counts;
-    /// Per-cut OSR rank/loss pairs used to derive the bound.
-    std::vector<std::pair<int, double>> cut_bounds;
-};
-
-/**
-@brief Result of OSR-guided compression.
-*/
-struct N_Qubit_Decomposition_OSR_Compression_Result {
-    /// Newly allocated compressed gate structure. The input gate structure is not modified.
-    std::unique_ptr<Gates_block> gate_structure;
-    /// Parameters from the best validation run, if validation was enabled and run.
-    Matrix_real optimized_parameters;
-    /// Best final cost from validation, or infinity when validation was not run.
-    double current_minimum;
-    /// OSR score of the returned structure.
-    N_Qubit_Decomposition_OSR_Compression_Score osr_score;
-    /// Removed entangling-gate paths from the original gate structure.
-    std::vector<OSRGatePath> removed_gate_paths;
-    /// Number of entangling gates found in the input structure.
-    int original_entangling_gate_num;
-    /// Number of entangling gates left in the returned structure.
-    int compressed_entangling_gate_num;
-    /// Whether final Hilbert-Schmidt validation was run.
-    bool validated;
-    /// Whether the final validation reached optimization_tolerance.
-    bool reached_tolerance;
-
-    double decomposition_error;
-
-    N_Qubit_Decomposition_OSR_Compression_Result();
-    N_Qubit_Decomposition_OSR_Compression_Result(const N_Qubit_Decomposition_OSR_Compression_Result&) = delete;
-    N_Qubit_Decomposition_OSR_Compression_Result& operator=(const N_Qubit_Decomposition_OSR_Compression_Result&) = delete;
-    N_Qubit_Decomposition_OSR_Compression_Result(N_Qubit_Decomposition_OSR_Compression_Result&&) = default;
-    N_Qubit_Decomposition_OSR_Compression_Result& operator=(N_Qubit_Decomposition_OSR_Compression_Result&&) = default;
-};
-
-/**
-@brief Decomposition class that compresses an already supplied gate structure with OSR guidance.
-
-The class assumes the starting circuit is already present in the inherited
-Gates_block, typically through set_custom_gate_structure(...). It then searches
-top-down by deleting entangling gates, uses OSR to keep promising compressed
-candidates, and finally validates the chosen structure with the standard
-optimization cost.
-*/
-class N_Qubit_Decomposition_OSR_Compression : public N_Qubit_Decomposition_custom {
-
-public:
-    N_Qubit_Decomposition_OSR_Compression();
-    N_Qubit_Decomposition_OSR_Compression(Matrix Umtx_in, int qbit_num_in,
-                                          std::map<std::string, Config_Element>& config,
-                                          int accelerator_num = 0);
-    N_Qubit_Decomposition_OSR_Compression(Matrix Umtx_in, int qbit_num_in,
-                                          std::vector<matrix_base<int>> topology_in,
-                                          std::map<std::string, Config_Element>& config,
-                                          int accelerator_num = 0);
-    virtual ~N_Qubit_Decomposition_OSR_Compression();
-
-    /// Externally supplied hardware topology. Empty means infer from the gate structure.
-    std::vector<matrix_base<int>> topology;
-
-    virtual void start_decomposition();
-
-    /**
-    @brief Compress the supplied gate structure without modifying it.
-    */
-    N_Qubit_Decomposition_OSR_Compression_Result compress_gate_structure(
-        Gates_block* gate_structure_in);
-
-    N_Qubit_Decomposition_OSR_Compression_Options get_osr_compression_options();
-
-protected:
-    N_Qubit_Decomposition_OSR_Compression_Score evaluate_gate_structure_osr(
-        Gates_block* gate_structure_in,
-        const Matrix_real& initial_parameters,
-        MinCnotBoundSolver& osr_bound_solver,
-        std::vector<std::vector<int>>& all_cuts);
-
-    N_Qubit_Decomposition_custom prepare_custom_optimizer(
-        Gates_block* gate_structure_in,
-        cost_function_type cost_function_variant);
-
-    void validate_compressed_gate_structure(
-        Gates_block* gate_structure_in,
-        const Matrix_real& initial_parameters,
-        N_Qubit_Decomposition_OSR_Compression_Result& result);
-};
-
-#endif

From 58d53fbb64a2645381629010cc703bda1ef5063e Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Sun, 3 May 2026 22:29:07 +0200
Subject: [PATCH 192/232] fix utils

---
 squander/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/squander/utils.py b/squander/utils.py
index d33eec17b..a434702c6 100644
--- a/squander/utils.py
+++ b/squander/utils.py
@@ -130,7 +130,7 @@ def qasm_to_squander_circuit(filename: str, return_transpiled=False):
         for n in dir(gate)
         if not n.startswith("_")
         and issubclass(getattr(gate, n), gate.Gate)
-        and n not in ("Gate", "CROT", "CR", "SYC")
+        and n not in ("Gate", "CROT", "CR", "SYC","Permutation")
     }
     if any(gate.operation.name not in SUPPORTED_GATES_NAMES for gate in qc.data):
         qc_transpiled = qiskit.transpile(

From 25e181711afa9865efebfbd8bcf9784532c33dfb Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 4 May 2026 14:15:43 +0200
Subject: [PATCH 193/232] Remove other partitioning strategies

---
 squander/synthesis/PartAM.py | 357 +----------------------------------
 1 file changed, 7 insertions(+), 350 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index ca7008d62..c4d31f6c4 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -8,7 +8,7 @@
 from collections import deque, defaultdict
 from itertools import permutations
 from multiprocessing import Pool
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional
 
 import numpy as np
 from tqdm import tqdm
@@ -135,7 +135,6 @@ def __init__(self, config):
         self.config.setdefault('max_partition_size', 3 )
         self.config.setdefault('topology', None)
         self.config.setdefault('routed', False)
-        self.config.setdefault('partition_strategy','ilp')
         self.config.setdefault('optimizer', 'BFGS')
         self.config.setdefault('use_osr', 0)
         self.config.setdefault("use_graph_search", 0)
@@ -163,9 +162,7 @@ def __init__(self, config):
         self.config.setdefault('three_qubit_exit_weight', 1.0)
         self.config.setdefault('boundary_beam_width', 1)
         self.config.setdefault('boundary_beam_depth', 1)
-        self.config.setdefault('size_density_weight', False)
-        self.config.setdefault('sparse_penalty', 3.0)
-        self.config.setdefault('partition_weight_model', 'density')
+        self.config['partition_weight_model'] = 'window_turnover'
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -209,46 +206,6 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
     # Static Synthesis Helpers (extracted from SynthesizeWideCircuit)
     # ------------------------------------------------------------------------
 
-    @staticmethod
-    def _parts_to_density_weights(allparts, gate_dict, sparse_penalty=3.0):
-        """Per-part ILP weights that penalise sparse 3-qubit partitions.
-
-        Penalty by active-pair count for a 3q partition:
-          1 pair  -> sparse_penalty        (e.g. 3 -> total ILP cost 4)
-          2 pairs -> sparse_penalty / 3    (e.g. 1 -> total ILP cost 2)
-          3 pairs -> 0                     (no penalty)
-        For 2q (or 1q) partitions the weight is always 0.
-        """
-        N = max(len(allparts), 1)
-        weights = []
-        for part in allparts:
-            qubits_in_part = set()
-            for gate_idx in part:
-                gate = gate_dict.get(gate_idx)
-                if gate is not None:
-                    qubits_in_part.update(gate.get_Involved_Qbits())
-            if len(qubits_in_part) != 3:
-                weights.append(0.0)
-                continue
-            active_pairs = set()
-            for gate_idx in part:
-                gate = gate_dict.get(gate_idx)
-                if gate is None:
-                    continue
-                qbs = list(gate.get_Involved_Qbits())
-                for a in range(len(qbs)):
-                    for b in range(a + 1, len(qbs)):
-                        active_pairs.add((min(qbs[a], qbs[b]), max(qbs[a], qbs[b])))
-            n_pairs = len(active_pairs)
-            if n_pairs >= 3:
-                penalty = 0.0
-            elif n_pairs == 2:
-                penalty = sparse_penalty / 3.0
-            else:
-                penalty = sparse_penalty
-            weights.append(penalty / N)
-        return weights
-
     @staticmethod
     def _part_support_and_active_pairs(part, gate_dict):
         qubits_in_part = set()
@@ -371,276 +328,6 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g):
             weights.append((conceptual_cost - 1.0) / N)
         return weights
 
-    @staticmethod
-    def _side_window_turnover_cnot_cost(support, neighbor_support):
-        if len(support) < 3 or len(neighbor_support) < 2:
-            return None
-        entering_or_leaving = len(support - neighbor_support)
-        if entering_or_leaving == 0:
-            return 0.0
-
-        # A new qubit in a 3q window implies at least one SWAP on a line.
-        # If both sides are 3q candidates the boundary is seen from both
-        # candidate scores, so each side pays half of the 3-CNOT SWAP cost.
-        cnot_per_window_qubit = 1.5 if len(neighbor_support) >= 3 else 3.0
-        return cnot_per_window_qubit * entering_or_leaving
-
-    @staticmethod
-    def _average_window_cnot_cost(part_idx, part, neighbor_gate_sets,
-                                  gate_to_parts, allparts, supports):
-        costs = []
-        support = supports[part_idx]
-        turnover_cost = (
-            qgd_Partition_Aware_Mapping._side_window_turnover_cnot_cost
-        )
-        for gate_set in neighbor_gate_sets:
-            for gate_idx in gate_set - part:
-                for other_idx in gate_to_parts.get(gate_idx, ()):
-                    if other_idx == part_idx:
-                        continue
-                    other_part = allparts[other_idx]
-                    if part & other_part:
-                        continue
-                    cost = turnover_cost(support, supports[other_idx])
-                    if cost is not None:
-                        costs.append(cost)
-        if not costs:
-            return 0.0
-        return sum(costs) / len(costs)
-
-    @staticmethod
-    def _parts_to_window_turnover_cnot_costs(allparts, gate_dict, g):
-        supports = []
-        for part in allparts:
-            support, _ = (
-                qgd_Partition_Aware_Mapping._part_support_and_active_pairs(
-                    part,
-                    gate_dict,
-                )
-            )
-            supports.append(support)
-
-        gate_to_parts = defaultdict(list)
-        for part_idx, part in enumerate(allparts):
-            for gate_idx in part:
-                gate_to_parts[gate_idx].append(part_idx)
-
-        rg = defaultdict(set)
-        for src, dsts in g.items():
-            for dst in dsts:
-                rg[dst].add(src)
-
-        costs = []
-        for part_idx, part in enumerate(allparts):
-            support = supports[part_idx]
-            if len(support) < 3:
-                costs.append(0.0)
-                continue
-            succ_gate_sets = [g.get(gate_idx, set()) for gate_idx in part]
-            pred_gate_sets = [rg.get(gate_idx, set()) for gate_idx in part]
-            costs.append(
-                qgd_Partition_Aware_Mapping._average_window_cnot_cost(
-                    part_idx,
-                    part,
-                    pred_gate_sets,
-                    gate_to_parts,
-                    allparts,
-                    supports,
-                )
-                + qgd_Partition_Aware_Mapping._average_window_cnot_cost(
-                    part_idx,
-                    part,
-                    succ_gate_sets,
-                    gate_to_parts,
-                    allparts,
-                    supports,
-                )
-            )
-        return costs
-
-    @staticmethod
-    def _subcircuit_from_gate_set(gates, gate_dict, parameters, go, rgo,
-                                  gate_to_qubit, qbit_num):
-        subcircuit = Circuit(qbit_num)
-        subparams = []
-        ordered_gates = _get_topo_order(
-            {gate_idx: go[gate_idx] & gates for gate_idx in gates},
-            {gate_idx: rgo[gate_idx] & gates for gate_idx in gates},
-            gate_to_qubit,
-        )
-        for gate_idx in ordered_gates:
-            gate = gate_dict[gate_idx]
-            subcircuit.add_Gate(gate)
-            start = gate.get_Parameter_Start_Index()
-            stop = start + gate.get_Parameter_Num()
-            subparams.append(parameters[start:stop])
-        return subcircuit, np.concatenate(subparams, axis=0)
-
-    def _meta_from_gate_set(self, gates, gate_dict, parameters, go, rgo,
-                            gate_to_qubit, qbit_num):
-        subcircuit, subparams = self._subcircuit_from_gate_set(
-            gates,
-            gate_dict,
-            parameters,
-            go,
-            rgo,
-            gate_to_qubit,
-            qbit_num,
-        )
-        involved_qbits = subcircuit.get_Qbits()
-        qbit_num_sub = len(involved_qbits)
-        qbit_map = {
-            involved_qbits[idx]: idx for idx in range(len(involved_qbits))
-        }
-        remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num_sub)
-        return {
-            'N': qbit_num_sub,
-            'circuit': remapped_subcircuit,
-            'params': subparams,
-            'mini_topologies': get_unique_subtopologies(
-                self.topology,
-                qbit_num_sub,
-            ),
-            'involved_qbits': involved_qbits,
-            'qbit_map': qbit_map,
-            'original_cnot_count': subcircuit.get_Gate_Nums().get('CNOT', 0),
-        }
-
-    @staticmethod
-    def _synthesis_score_pairs(N):
-        identity = tuple(range(N))
-        pairs = []
-        seen = set()
-        for perm in permutations(range(N)):
-            for pair in ((identity, tuple(perm)), (tuple(perm), identity)):
-                if pair in seen:
-                    continue
-                seen.add(pair)
-                pairs.append(pair)
-        return pairs
-
-    def _synthesis_score_fallback(self, meta):
-        best_cost = float('inf')
-        for mini_topology in meta['mini_topologies']:
-            fb_circuit, _ = self._qiskit_routing_fallback(meta, mini_topology)
-            if fb_circuit is not None:
-                best_cost = min(
-                    best_cost,
-                    fb_circuit.get_Gate_Nums().get('CNOT', 0),
-                )
-        if best_cost < float('inf'):
-            return best_cost
-        return max(1, int(meta.get('original_cnot_count', 1)))
-
-    def _parts_to_synthesis_cnot_weights(self, allparts, gate_dict, parameters,
-                                         go, rgo, gate_to_qubit, qbit_num,
-                                         gate_dag=None,
-                                         include_window_route_cost=False):
-        """Linear ILP weights from measured SeqPAM CNOT cost.
-
-        Each candidate partition is synthesized over the input-identity and
-        output-identity boundary sweeps, i.e. up to 2*N! local decompositions
-        per local topology.  The selected partitions are later fully
-        enumerated by _run_parallel_synthesis, reusing the shared decomposition
-        cache populated here.
-
-        The ILP objective always keeps the original one-unit partition cost.
-        The measured CNOT score is an additional cost, not a replacement for
-        partition count; otherwise small local CNOT savings can fragment the
-        circuit into many routing boundaries.
-        """
-        N_parts = max(len(allparts), 1)
-        metas = []
-        scores = [None] * len(allparts)
-
-        for part_idx, part in enumerate(allparts):
-            meta = self._meta_from_gate_set(
-                part,
-                gate_dict,
-                parameters,
-                go,
-                rgo,
-                gate_to_qubit,
-                qbit_num,
-            )
-            metas.append(meta)
-            if meta['N'] < 2:
-                scores[part_idx] = 0
-
-        disable_pbar = self.config.get('progressbar', 0) == False
-        futures = []
-        cached = []
-        n_cpus = _available_cpus()
-
-        with Pool(processes=n_cpus, initializer=_init_decompose_worker,
-                  initargs=(self.config,)) as pool:
-            for part_idx, meta in enumerate(metas):
-                if scores[part_idx] is not None:
-                    continue
-                pairs = self._synthesis_score_pairs(meta['N'])
-                for topology_idx, mini_topology in enumerate(
-                    meta['mini_topologies']
-                ):
-                    for P_i, P_o in pairs:
-                        Umtx = self._build_permuted_unitary(meta, P_i, P_o)
-                        ck = self._cache_key(Umtx, mini_topology)
-                        if ck in self._decomp_cache:
-                            cached.append((part_idx, ck))
-                        else:
-                            future = pool.apply_async(
-                                _decompose_one,
-                                (Umtx, mini_topology),
-                            )
-                            futures.append((part_idx, ck, future))
-
-            for part_idx, ck in cached:
-                _, _, synth_err = self._decomp_cache[ck]
-                if synth_err <= self.config['tolerance']:
-                    synth_circuit, _, _ = self._decomp_cache[ck]
-                    cnot_count = synth_circuit.get_Gate_Nums().get('CNOT', 0)
-                    if scores[part_idx] is None:
-                        scores[part_idx] = cnot_count
-                    else:
-                        scores[part_idx] = min(scores[part_idx], cnot_count)
-
-            for part_idx, ck, future in tqdm(
-                futures,
-                desc="Partition Weight Synthesis",
-                disable=disable_pbar,
-            ):
-                synth_circuit, synth_params, synth_err = future.get()
-                self._decomp_cache[ck] = (
-                    synth_circuit,
-                    synth_params,
-                    synth_err,
-                )
-                if synth_err <= self.config['tolerance']:
-                    cnot_count = synth_circuit.get_Gate_Nums().get('CNOT', 0)
-                    if scores[part_idx] is None:
-                        scores[part_idx] = cnot_count
-                    else:
-                        scores[part_idx] = min(scores[part_idx], cnot_count)
-
-        for part_idx, score in enumerate(scores):
-            if score is None:
-                scores[part_idx] = self._synthesis_score_fallback(
-                    metas[part_idx],
-                )
-
-        if include_window_route_cost and gate_dag is not None:
-            window_route_costs = self._parts_to_window_turnover_cnot_costs(
-                allparts,
-                gate_dict,
-                gate_dag,
-            )
-            scores = [
-                float(score) + window_route_costs[part_idx]
-                for part_idx, score in enumerate(scores)
-            ]
-
-        self._partition_synthesis_cnot_scores = list(scores)
-        return [float(score) / N_parts for score in scores]
-
     @staticmethod
     def _topo_key(mini_topology):
         return tuple(sorted(tuple(sorted(e)) for e in mini_topology))
@@ -929,42 +616,12 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
-        # By default this minimizes partition count. Optional weight models can
-        # replace that unit cost with a routing-oriented conceptual cost while
-        # preserving a linear ILP objective.
-        ilp_weights = None
-        partition_weight_model = self.config.get(
-            'partition_weight_model',
-            'density',
+        # PartAM keeps one partitioning strategy: window_turnover.
+        ilp_weights = self._parts_to_window_turnover_weights(
+            allparts,
+            gate_dict,
+            g,
         )
-        if partition_weight_model == 'window_turnover':
-            ilp_weights = self._parts_to_window_turnover_weights(
-                allparts,
-                gate_dict,
-                g,
-            )
-        elif partition_weight_model in (
-            'synthesis_cnot',
-            'synthesis_route_cnot',
-        ):
-            ilp_weights = self._parts_to_synthesis_cnot_weights(
-                allparts,
-                gate_dict,
-                working_parameters,
-                go,
-                rgo,
-                gate_to_qubit,
-                qbit_num_orig_circuit,
-                gate_dag=g,
-                include_window_route_cost=(
-                    partition_weight_model == 'synthesis_route_cnot'
-                ),
-            )
-        elif self.config.get('size_density_weight', False):
-            sparse_penalty = float(self.config.get('sparse_penalty', 3.0))
-            ilp_weights = self._parts_to_density_weights(
-                allparts, gate_dict, sparse_penalty=sparse_penalty
-            )
         L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----

From d4ac03dcfbcfb00fb29c95c7d5f750fcd5d19b3f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 4 May 2026 21:21:07 +0200
Subject: [PATCH 194/232] improve partitioning

---
 squander/synthesis/PartAM.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index c4d31f6c4..6cabc96c3 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -133,6 +133,7 @@ def __init__(self, config):
         self.config.setdefault('test_subcircuits', False )
         self.config.setdefault('test_final_circuit', True )
         self.config.setdefault('max_partition_size', 3 )
+        self.config.setdefault('pack_credit_weight', 0.0)
         self.config.setdefault('topology', None)
         self.config.setdefault('routed', False)
         self.config.setdefault('optimizer', 'BFGS')
@@ -256,7 +257,7 @@ def _average_turnover(part_idx, part, neighbor_gate_sets,
         return sum(turnovers) / len(turnovers)
 
     @staticmethod
-    def _parts_to_window_turnover_weights(allparts, gate_dict, g):
+    def _parts_to_window_turnover_weights(allparts, gate_dict, g, pack_credit_weight=0.0):
         """Linear ILP weights for 3q window continuity.
 
         Dense 3q blocks are only routing-friendly when their local qubit window
@@ -325,7 +326,17 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g):
                 + succ_turnover
                 + pred_turnover
             )
-            weights.append((conceptual_cost - 1.0) / N)
+            if pack_credit_weight:
+                k = len(support)
+                full_clique_pairs = k * (k - 1) // 2
+                if len(active_pairs) == full_clique_pairs:
+                    multi_qubit_gate_count = sum(
+                        1 for gate_idx in part
+                        if gate_dict.get(gate_idx) is not None
+                        and len(gate_dict[gate_idx].get_Involved_Qbits()) >= 2
+                    )
+                    conceptual_cost -= pack_credit_weight * max(multi_qubit_gate_count - 1, 0)
+            weights.append(max((conceptual_cost - 1.0) / N, 0.0))
         return weights
 
     @staticmethod
@@ -621,6 +632,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             allparts,
             gate_dict,
             g,
+            pack_credit_weight=self.config['pack_credit_weight'],
         )
         L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 

From d9e1d1a742433d14126fa4374dc818b2e8b92258 Mon Sep 17 00:00:00 2001
From: jnadori <degututaj@gmail.com>
Date: Wed, 6 May 2026 05:49:15 -0400
Subject: [PATCH 195/232] update yaml to use pybind

---
 conda_env_example.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda_env_example.yaml b/conda_env_example.yaml
index 12bde7316..b09ebd8f5 100644
--- a/conda_env_example.yaml
+++ b/conda_env_example.yaml
@@ -16,6 +16,7 @@ dependencies:
   - numpy
   - scipy
   - tbb-devel
+  - pybind11
   - pip:
       - gurobipy
       - matplotlib

From 60835d9a30ca2e7762556961930b2e2918e58a3f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 7 May 2026 23:28:56 +0200
Subject: [PATCH 196/232] seqpam-ilp uses squander seqpam workflow with simple
 ILP partitioning

---
 .../qgd_Wide_Circuit_Optimization.py          | 1404 +----------------
 1 file changed, 65 insertions(+), 1339 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 2a2d9853e..6aad05e4f 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -96,1211 +96,18 @@ def CNOTGateCount(circ: Circuit, max_gates: int = 0) -> int:
         )
     return num_cnots
 
-
-class N_Qubit_Decomposition_Guided_Tree(N_Qubit_Decomposition_custom):
-    """Tree-guided multi-qubit decomposition using operator Schmidt rank (OSR) style costs."""
-
-    def __init__(
-        self, Umtx, config, accelerator_num, topology, paramspace=None, paramscale=None
-    ):
-        """Initialize guided tree search over a unitary (or list of unitaries) and hardware topology.
-
-        Args:
-            Umtx: Complex unitary matrix, or list of such matrices (already conjugate-transposed per caller).
-            config: Decomposition / search configuration dict.
-            accelerator_num: Number of accelerators for the base decomposer.
-            topology: List of undirected coupler pairs ``(i, j)``; default is all-to-all.
-            paramspace: Optional per-parameter affine scaling space for ``params_to_mat``.
-            paramscale: Optional scaling denominators paired with ``paramspace``.
-        """
-        super().__init__(
-            Umtx[0] if isinstance(Umtx, list) else Umtx,
-            config=config,
-            accelerator_num=accelerator_num,
-        )
-        self.Umtx = (
-            Umtx if isinstance(Umtx, list) else [Umtx]
-        )  # already conjugate transposed
-        self.qbit_num = self.Umtx[0].shape[0].bit_length() - 1
-        self.config = config
-        self.accelerator_num = accelerator_num
-        self.paramspace = paramspace
-        self.paramscale = () if paramscale is None else paramscale
-        # self.set_Cost_Function_Variant( 0 )	 #0 is Frobenius, 3 is HS, 10 is OSR
-        if topology is None:
-            topology = [
-                (i, j)
-                for i in range(self.qbit_num)
-                for j in range(i + 1, self.qbit_num)
-            ]
-        self.topology = topology
-
-    @staticmethod
-    def enumerate_unordered_cnot_BFS(n: int, topology=None, use_gl=True):
-        """Yield successive BFS levels of CNOT-reachable GL(n,2) states (see ``enumerate_unordered_cnot_BFS_level``).
-
-        Args:
-            n: Number of qubits.
-            topology: Allowed unordered CNOT pairs; default all pairs.
-            use_gl: If True, use GL-style column updates; else restricted enumeration.
-
-        Yields:
-            Each level's list of ``(state_key, seq_pairs, seq_directed)`` discoveries.
-        """
-        # Precompute unordered pairs
-        topology = (
-            [(i, j) for i in range(n) for j in range(i + 1, n)]
-            if topology is None
-            else topology
-        )
-        prior_level_info: Union[tuple[Any, Any, Any, Any], None] = None
-        while True:
-            visited, seq_pairs_of, seq_dir_of, res = (
-                N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS_level(
-                    n, topology, prior_level_info, use_gl=use_gl
-                )
-            )
-            if not res:
-                break
-            yield res
-            prior_level_info = (
-                visited,
-                seq_pairs_of,
-                seq_dir_of,
-                list(x[0] for x in reversed(res)),
-            )
-
-    @staticmethod
-    def canonical_prefix_ok(seq):
-        """Check whether a sequence of unordered pair steps has a canonical topological order.
-
-        Returns:
-            ``-1`` if the prefix is OK; otherwise the first index where canonical order fails.
-        """
-        m = len(seq)
-        if m <= 1:
-            return -1
-        succ = {}
-        indeg = {}
-        last_on = {}
-        for k in range(m):
-            for q in seq[k]:
-                if q in last_on:
-                    p = last_on[q]
-                    succ.setdefault(p, []).append(k)
-                    indeg[k] = indeg.get(k, 0) + 1
-                last_on[q] = k
-        import heapq
-
-        pq = [(seq[x], x) for x in range(m) if indeg.get(x, 0) == 0]
-        heapq.heapify(pq)
-        for pos in range(m):
-            # Kahn's algorithm
-            if len(pq) == 0:
-                return pos  # malformed (shouldn't happen)
-            u = heapq.heappop(pq)
-            if u[1] != pos:
-                return pos  # deviation: not canonical
-            for v in succ.get(u[1], ()):
-                indeg[v] -= 1
-                if indeg[v] == 0:
-                    heapq.heappush(pq, (seq[v], v))
-        return -1
-
-    @staticmethod
-    def enumerate_unordered_cnot_BFS_level(
-        n: int,
-        topology: Optional[List[Tuple[int, int]]] = None,
-        prior_level_info: Optional[
-            Tuple[
-                Set[Tuple[int, ...]],
-                Dict[Tuple[int, ...], List[Tuple[int, int]]],
-                Dict[Tuple[int, ...], List[Tuple[int, int]]],
-                List[
-                    Tuple[Tuple[int, ...], List[Tuple[int, int]], List[Tuple[int, int]]]
-                ],
-            ]
-        ] = None,
-        use_gl=True,
-    ):
-        """Enumerate GL(n,2) states at the next BFS depth from ``prior_level_info``.
-
-        Moves are *recorded* as unordered pairs (structure view); each expansion
-        may try both CNOT directions internally when ``use_gl`` is True.
-
-        Returns:
-            Tuple ``(visited, seq_pairs_of, seq_dir_of, res)`` where ``res`` is a
-            list of ``(A, seq_pairs, seq_directed)`` for newly discovered states
-            ``A``: ``seq_pairs`` is the unordered-pair history; ``seq_directed`` is
-            a consistent directed realization. On the first call, pass
-            ``prior_level_info=None`` to obtain the root state only.
-        """
-        if prior_level_info is None:
-            # Initial state
-            start_key = tuple(1 << i for i in range(n))
-
-            # Visited: we only need to mark states once (minimal depth)
-            visited = {start_key}
-
-            # We also keep *one* representative sequence per state (unordered + directed)
-            seq_pairs_of = {start_key: []}
-            seq_dir_of = {start_key: []}
-
-            # Yield the root
-            return visited, seq_pairs_of, seq_dir_of, [(start_key, [], [])]
-        else:
-            visited, seq_pairs_of, seq_dir_of, q = prior_level_info
-        res = []
-        new_seq_pairs_of = {}
-        new_seq_dir_of = {}
-
-        while q:
-            A = q.pop()
-            last_pairs = seq_pairs_of[A]
-            last_dirs = seq_dir_of[A]
-            assert topology is not None
-            for p in topology:
-                if not use_gl:
-                    if len(last_pairs) >= 3 and all(p == x for x in last_pairs[-3:]):
-                        continue  # avoid more than 3 repeated CNOTs
-                    if (
-                        N_Qubit_Decomposition_Guided_Tree.canonical_prefix_ok(
-                            last_pairs + [p]
-                        )
-                        >= 0
-                    ):
-                        continue  # not canonical prefix
-                # Try both directions, but record the *same* unordered step 'p'
-                for mv in (p, (p[1], p[0])) if use_gl else (p,):
-                    # CNOT left
-                    if use_gl:
-                        if mv[0] == mv[1]:
-                            B = A
-                        else:
-                            B = list(A)
-                            B[mv[1]] ^= B[mv[0]]
-                            B = tuple(B)
-
-                        if B in visited:
-                            continue  # already discovered at minimal depth
-                    else:
-                        B = tuple(last_dirs + [p])
-
-                    visited.add(B)
-                    new_seq_pairs_of[B] = last_pairs + [p]
-                    new_seq_dir_of[B] = last_dirs + [mv]
-
-                    # Emit as soon as we discover the state (BFS → minimal depth)
-                    res.append((B, new_seq_pairs_of[B], new_seq_dir_of[B]))
-        return visited, new_seq_pairs_of, new_seq_dir_of, res
-
-    @staticmethod
-    def build_sequence(stop: int = 5, ordered: bool = True, use_gl: bool = True):
-        """Debug helper: print distribution of minimal CNOT sequence lengths by qubit count (up to ``stop``).
-
-        See OEIS A002884 for related enumeration context. Not used in production optimization paths.
-        """
-        # https://oeis.org/A002884
-        # unordered sequence: 1, 1, 4, 88, 9556, 4526605
-        # unordered at 5 qubits: {0: 1, 1: 10, 2: 85, 3: 650, 4: 4475, 5: 27375, 6: 142499, 7: 580482, 8: 1501297, 9: 1738232, 10: 517884, 11: 13591, 12: 24}
-        for i in range(2, stop + 1):
-            d = {}
-            for z in N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS(
-                i, use_gl=use_gl
-            ):
-                for x in (list if ordered else set)(tuple(x[1]) for x in z):
-                    d[len(x)] = d.get(len(x), 0) + 1
-                if not use_gl and len(d) > 5:
-                    break
-            print({x: d[x] for x in sorted(d)}, sum(d.values()))
-
-    @staticmethod
-    def extract_bits(x, pos):
-        """Pack bits of integer ``x`` at positions ``pos`` into a smaller integer (LSB-first order)."""
-        return sum(((x >> p) & 1) << i for i, p in enumerate(pos))
-
-    @staticmethod
-    def build_osr_matrix(U, n, A):
-        """Reshape unitary ``U`` (size ``2^n``) into the OSR matrix for bipartition ``A`` vs complement.
-
-        Args:
-            U: Flattened ``2^n x 2^n`` unitary (row-major).
-            n: Qubit count.
-            A: Tuple of qubit indices on subsystem A.
-
-        Returns:
-            Matrix of shape ``(2^{|A|})^2 x (2^{|B|})^2`` for Schmidt analysis.
-        """
-        A = list(reversed(A))
-        B = list(sorted(set(range(n)) - set(A), reverse=True))
-        A, B = [n - 1 - q for q in A], [n - 1 - q for q in B]
-        dA = 1 << len(A)
-        dB = 1 << len(B)
-        return (
-            U.reshape([2] * (2 * n))
-            .transpose(
-                tuple(A) + tuple(t + n for t in A) + tuple(B) + tuple(t + n for t in B)
-            )
-            .reshape(dA * dA, dB * dB)
-        )
-
-    @staticmethod
-    def accumulate_grad_for_cut(U, G, Umat, VTmat, n, A):  # qubits on A
-        """Accumulate gradient ``G * Umat @ VTmat`` from an SVD triplet back into full ``U`` layout for cut ``A``."""
-        A = list(reversed(A))
-        B = list(sorted(set(range(n)) - set(A), reverse=True))
-        A, B = [n - 1 - q for q in A], [n - 1 - q for q in B]
-        mat = np.array(G) * Umat @ VTmat  # reconstruct U from its dyadic decomposition
-        revmap = [None] * (2 * n)
-        for i, x in enumerate(
-            tuple(A) + tuple(t + n for t in A) + tuple(B) + tuple(t + n for t in B)
-        ):
-            revmap[x] = i
-        U += mat.reshape([2] * (2 * n)).transpose(tuple(revmap)).reshape(*U.shape)
-        return U
-
-    @staticmethod
-    def trace_out_qubits(U, n, A):
-        """Trace out complement of subsystem ``A`` and return a unitary polar factor on ``A`` (2^{|A|} x 2^{|A|})."""
-        M = N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, n, A)
-        M = np.linalg.svd(M, compute_uv=True, full_matrices=False)[0][:, 0].reshape(
-            1 << len(A), 1 << len(A)
-        )
-        return N_Qubit_Decomposition_Guided_Tree._polar_unitary(M)
-
-    @staticmethod
-    def numerical_rank_osr(M, Fnorm, tol=1e-10):
-        """Count singular values of ``M/Fnorm`` above ``tol`` relative to the largest; returns ``(rank, s)``."""
-        s = np.linalg.svd(M, full_matrices=False, compute_uv=False) / Fnorm
-        # print(s)
-        return int(np.sum(s >= s[0] * tol)), s
-
-    @staticmethod
-    def operator_schmidt_rank(U, n, A, Fnorm, tol=1e-10):
-        """Operator Schmidt rank of ``U`` across cut ``A`` (via OSR matrix), using ``numerical_rank_osr``."""
-        return N_Qubit_Decomposition_Guided_Tree.numerical_rank_osr(
-            N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, n, A), Fnorm, tol
-        )
-
-    @staticmethod
-    def unique_cuts(n):
-        """Yield all nontrivial unordered bipartitions of ``n`` qubits (each complement pair once)."""
-        import itertools
-
-        qubits = tuple(range(n))
-        for r in range(1, n // 2 + 1):  # only up to half
-            for S in itertools.combinations(qubits, r):
-                if r < n - r:
-                    yield S
-                else:  # r == n-r (only possible when n even): tie-break
-                    comp = tuple(q for q in qubits if q not in S)
-                    if S < comp:  # lexicographically smaller tuple wins
-                        yield S
-
-    def get_circuit_from_pairs(self, pairs, finalizing=True):
-        """Build a layer of U3–U3–CNOT per pair, optionally followed by trailing U3 on every qubit."""
-        circ = Circuit(self.qbit_num)
-        for pair in pairs:
-            circ.add_U3(pair[0])
-            circ.add_U3(pair[1])
-            circ.add_CNOT(pair[0], pair[1])
-        if finalizing:
-            for qbit in range(self.qbit_num):
-                circ.add_U3(qbit)
-        return circ
-
-    @staticmethod
-    def ceil_log2(x):
-        """Ceiling of log2 for nonnegative integer ``x``; ``0`` maps to ``0``."""
-        return 0 if x == 0 else (x - 1).bit_length()
-
-    @staticmethod
-    def logsumexp_smoothmax(Lc, tau=1e-2):
-        """Smooth maximum of list ``Lc``: ``tau * log(sum exp(v/tau)) + max``, stable implementation."""
-        if not Lc:
-            return 0.0
-        if tau <= 0.0:
-            raise RuntimeError("tau must be > 0")
-        m = max(Lc)
-        acc = 0.0
-        for v in Lc:
-            acc += np.exp((v - m) / tau)
-        return tau * np.log(acc) + m
-
-    @staticmethod
-    def dyadic_loss(S, max_dyadic, rho=0.9, tol=1e-4):
-        """Weighted loss on dyadic singular-value indices (powers of two) of normalized spectrum ``S``."""
-        tot_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(len(S))
-        w = 1.0
-        acc = 0.0
-        for k in range(max_dyadic - 1, -1, -1):
-            if k < tot_dyadic:
-                val = S[1 << k] - S[0] * tol
-                acc += w * val * val
-            w *= rho
-        return acc
-
-    @staticmethod
-    def avg_loss(cuts_S, rho=0.9):
-        """Average ``dyadic_loss`` over a list of singular-value spectra ``cuts_S``."""
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        total_loss = 0.0
-        for S in cuts_S:
-            total_loss += N_Qubit_Decomposition_Guided_Tree.dyadic_loss(
-                S, max_dyadic, rho
-            )
-        return total_loss / len(cuts_S)
-
-    # Aggregated cost over cuts: softmax (log-sum-exp) of per-cut dyadic losses
-    @staticmethod
-    def cuts_softmax_dyadic_cost(cuts_S, rho=0.1, tau=1e-2):
-        """Log-sum-exp aggregate of per-cut dyadic losses (temperature ``tau``)."""
-        if tau <= 0.0:
-            raise RuntimeError("tau must be > 0")
-        Lc = []
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        for S in cuts_S:
-            Lc.append(N_Qubit_Decomposition_Guided_Tree.dyadic_loss(S, max_dyadic, rho))
-        return N_Qubit_Decomposition_Guided_Tree.logsumexp_smoothmax(Lc, tau)
-
-    # Gradient w.r.t. the singular values (diagonal of dL/dΣ):
-    @staticmethod
-    def dyadic_loss_grad_diag(S, max_dyadic, Fnorm, rho=0.1, tol=1e-4):
-        """Diagonal gradient of ``dyadic_loss`` w.r.t. singular values (dyadic indices only)."""
-        n = len(S)
-        # c_k = rho^k / Mk  for k=1..n-1, then prefix sum C_j = sum_{k=1}^j c_k
-        tot_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(n)
-        grad = [0.0] * tot_dyadic
-        w = 1.0
-        for k in range(max_dyadic - 1, -1, -1):
-            if k < tot_dyadic:
-                idx = 1 << k
-                grad[k] = (
-                    2.0 * w * S[idx] * (1.0 - tol) / Fnorm
-                )  # 1-tol not needed if using stop-grad
-            w *= rho  # w = rho^k
-        return grad
-
-    @staticmethod
-    def cuts_avg_dyadic_grad(cuts_S, Fnorm, rho=0.1):
-        """Per-cut gradients for the average dyadic loss (list parallel to ``cuts_S``)."""
-        C = len(cuts_S)
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        Lc = []
-        for c in range(C):
-            Lc.append(
-                N_Qubit_Decomposition_Guided_Tree.dyadic_loss_grad_diag(
-                    cuts_S[c], max_dyadic, Fnorm * C, rho
-                )
-            )
-        return Lc
-
-    # Gradient w.r.t. singular values (same length as S).
-    # Only dyadic positions (1,2,4,...) get nonzero entries; others are 0.
-    @staticmethod
-    def cuts_softmax_tail_grad(cuts_S, Fnorm, rho=0.1, tau=1e-2):
-        """Gradient of softmax-of-dyadic-losses w.r.t. each cut's singular values."""
-        C = len(cuts_S)
-        if C == 0:
-            return []
-        max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(
-            max(len(S) for S in cuts_S)
-        )
-        # 1) per-cut losses
-        Lc = [
-            N_Qubit_Decomposition_Guided_Tree.dyadic_loss(cuts_S[c], max_dyadic, rho)
-            for c in range(C)
-        ]
-
-        # 2) softmax weights w_c = exp((Lc - m)/tau) / Z
-        m = max(Lc)
-        w = [np.exp((Lc[c] - m) / tau) for c in range(C)]
-        Z = np.sum(w)
-        for c in range(C):
-            w[c] /= Z if Z > 0.0 else 1.0
-
-        # 3) dL/dS^{(c)} = w_c * dL_c/dS^{(c)}
-        return [
-            [
-                v * w[c]
-                for v in N_Qubit_Decomposition_Guided_Tree.dyadic_loss_grad_diag(
-                    cuts_S[c], max_dyadic, Fnorm, rho
-                )
-            ]
-            for c in range(C)
-        ]
-
-    @staticmethod
-    def loss_for_rank(S, rank):
-        """Sum of squares of singular values from index ``2**rank`` onward (tail beyond target rank)."""
-        start = 1 << rank
-        if start >= len(S):
-            return 0.0
-        return sum(x * x for x in S[start:])
-
-    @staticmethod
-    def avg_loss_for_rank(cuts_S, rank):
-        """Average ``loss_for_rank`` over cuts."""
-        if not cuts_S:
-            return 0.0
-        total_loss = 0.0
-        for S in cuts_S:
-            total_loss += N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank)
-        return total_loss / len(cuts_S)
-
-    # Aggregated cost over cuts: softmax (log-sum-exp) of per-cut dyadic losses
-    @staticmethod
-    def cuts_softmax_rank_cost(cuts_S, rank, tau=1e-2):
-        """Softmax aggregate of per-cut ``loss_for_rank`` (temperature ``tau``)."""
-        Lc = []
-        for S in cuts_S:
-            Lc.append(N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank))
-        return N_Qubit_Decomposition_Guided_Tree.logsumexp_smoothmax(Lc, tau)
-
-    # Gradient w.r.t. the singular values (diagonal of dL/dΣ):
-    @staticmethod
-    def loss_for_rank_grad_diag(S, rank, Fnorm):
-        """
-        Gradient of a single-cut tail loss with respect to the RAW singular values,
-        assuming S is already normalized and Fnorm is treated as constant.
-
-        If S = sigma / Fnorm, then d/dsigma_i sum_{j>=r} S_j^2 = 2*S_i/Fnorm on tail.
-        """
-        n = len(S)
-        start = 1 << rank
-        grad = [0.0] * n
-        if start >= n:
-            return grad
-        invF = 1.0 / Fnorm
-        for i in range(start, n):
-            grad[i] = 2.0 * S[i] * invF
-        return grad
-
-    @staticmethod
-    def cuts_avg_rank_grad(cuts_S, rank, Fnorm):
-        """
-        Gradient of average tail loss across cuts.
-        Returns one gradient vector per cut, same length as that cut's S.
-        """
-        C = len(cuts_S)
-        if C == 0:
-            return []
-        scale = 1.0 / C
-        out = []
-        for S in cuts_S:
-            g = N_Qubit_Decomposition_Guided_Tree.loss_for_rank_grad_diag(
-                S, rank, Fnorm
-            )
-            out.append([scale * v for v in g])
-        return out
-
-    # Gradient w.r.t. singular values (same length as S).
-    @staticmethod
-    def cuts_softmax_rank_grad(cuts_S, rank, Fnorm, tau=1e-2):
-        """
-        Gradient of smooth-max across cuts:
-            L = tau * log(sum_c exp(L_c / tau))
-        so
-            dL = sum_c softmax_c * dL_c
-        """
-        C = len(cuts_S)
-        if C == 0:
-            return []
-        if tau <= 0.0:
-            raise RuntimeError("tau must be > 0")
-
-        Lc = [N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank) for S in cuts_S]
-
-        m = max(Lc)
-        w = [np.exp((v - m) / tau) for v in Lc]
-        Z = np.sum(w)
-        if Z <= 0.0:
-            Z = 1.0
-        w = [x / Z for x in w]
-
-        out = []
-        for c, S in enumerate(cuts_S):
-            g = N_Qubit_Decomposition_Guided_Tree.loss_for_rank_grad_diag(
-                S, rank, Fnorm
-            )
-            out.append([w[c] * v for v in g])
-        return out
-
-    # Build M with build_osr_matrix, then SVD (econ) and grab top triplet.
-    @staticmethod
-    def top_k_triplet_for_cut(
-        U,  # (N x N), row-major, N = 1<<q
-        q,  # number of qubits
-        A,  # qubits on side A
-        Fnorm,  # e.g., sqrt(N)
-    ):
-        """SVD of OSR matrix for cut ``A``: returns normalized singular values and ``U``, ``Vh``."""
-        # 1) Build M for this cut
-        M = N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, q, A)
-        k = min(M.shape)
-
-        # 2) SVD: M = U * diag(S) * VT  (VT = V^H)
-        # Row-major API handles leading dims as col counts.
-        res = np.linalg.svd(M, full_matrices=False, compute_uv=True)
-        return res.S / Fnorm, res.U, res.Vh  # normalized singular value
-
-    @staticmethod
-    def get_deriv_osr_entanglement(matrix, use_cuts, rank, use_softmax):
-        """Gradient of rank / softmax-rank entanglement cost w.r.t. unitary ``matrix`` entries."""
-        qbit_num = len(matrix).bit_length() - 1
-        cuts = (
-            list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(qbit_num))
-            if len(use_cuts) == 0
-            else use_cuts
-        )
-        Fnorm = np.sqrt(len(matrix))
-        deriv = np.zeros(matrix.shape, dtype=complex)
-        # Compute the derivative of the OSR entanglement cost function
-        triplets = []
-        allS = []
-        for cut in cuts:
-            # 1) top k triplet on the normalized reshape M_c
-            S, Umat, VTmat = N_Qubit_Decomposition_Guided_Tree.top_k_triplet_for_cut(
-                matrix, qbit_num, cut, Fnorm
-            )
-            triplets.append(([], Umat, VTmat))
-            allS.append(S)
-        if use_softmax:
-            allS = N_Qubit_Decomposition_Guided_Tree.cuts_softmax_rank_grad(
-                allS, rank, Fnorm
-            )
-        else:
-            allS = N_Qubit_Decomposition_Guided_Tree.cuts_avg_rank_grad(
-                allS, rank, Fnorm
-            )
-        for i in range(len(cuts)):
-            triplets[i] = (allS[i], triplets[i][1], triplets[i][2])
-        for i in range(len(cuts)):
-            G, Umat, VTmat = triplets[i]
-            N_Qubit_Decomposition_Guided_Tree.accumulate_grad_for_cut(
-                deriv, G, Umat, VTmat, qbit_num, cuts[i]
-            )
-        return deriv
-
-    # Compute grad component = Re Tr( A^† B ) for A = dL/dU, B = dU/dθ
-    # A and B are (rows x cols) with row-major leading dimension.
-    @staticmethod
-    def real_trace_conj_dot(A, B):
-        """Return ``Re Tr(A† B)`` for complex matrices ``A``, ``B`` (row-major storage)."""
-        return np.sum(A.real * B.real + A.imag * B.imag)  # Re Tr(A^† B)
-
-    @staticmethod
-    def param_derivs(circ, Umtx, x):
-        """Finite-difference / shift-style partial derivatives ``∂U/∂θ_i`` for each gate parameter in ``x``."""
-        n = len(x)
-        derivs = [None] * n
-        for i in range(n):
-            kind = i % 3
-            if kind == 0:  # d/dt:  ∂U/∂t = U(t+π/2, φ, λ)
-                x_shift = x.copy()
-                x_shift[i] += np.pi / 2
-                Ui = Umtx.copy()
-                circ.apply_to(x_shift, Ui)
-                derivs[i] = Ui
-            else:  # d/dφ or d/dλ: ∂U/∂p = 0.5*(U(p+π/2) - U(p-π/2))
-                xp = x.copy()
-                xp[i] += np.pi / 2
-                xm = x.copy()
-                xm[i] -= np.pi / 2
-                Up = Umtx.copy()
-                Um = Umtx.copy()
-                circ.apply_to(xp, Up)
-                circ.apply_to(xm, Um)
-                derivs[i] = 0.5 * (Up - Um)
-        return derivs
-
-    @staticmethod
-    def _global_phase_fix(U):
-        """Remove global phase from square unitary ``U`` using determinant normalization."""
-        return U / (np.linalg.det(U) ** (1 / len(U)))
-
-    @staticmethod
-    def _polar_unitary(X):
-        """Nearest unitary to ``X`` via polar decomposition (SVD)."""
-        U, _, Vh = np.linalg.svd(X, full_matrices=False)
-        return U @ Vh
-
-    @staticmethod
-    def su2_to_u3_zyz(U):
-        """
-        Decompose a 2x2 unitary (det=1) into Qiskit U3: Rz(phi) @ Ry(theta) @ Rz(lam).
-        Returns (theta, phi, lam) in radians.
-        """
-        U = N_Qubit_Decomposition_Guided_Tree._global_phase_fix(U)
-        # Handle numeric edge cases robustly
-        a = U[0, 0]
-        b = U[0, 1]
-        c = U[1, 0]
-        d = U[1, 1]
-        # Prefer arccos for theta; it's stable when |a| is not tiny
-        ca = np.clip(np.abs(a), 0.0, 1.0)
-        theta = 2.0 * np.arccos(ca)
-        # If sin(theta/2) ~ 0, collapse to Z rotations
-        eps = 1e-12
-        if abs(np.sin(theta / 2)) < eps:
-            # Then c≈0, b≈0; only Z phases matter: U ≈ e^{iα} Rz(phi+lam)
-            # Choose phi=0, lam = arg(d) - arg(a)
-            phi = 0.0
-            lam = np.angle(d) - np.angle(a)
-            # Normalize to [-pi,pi)
-            lam = (lam + np.pi) % (2 * np.pi) - np.pi
-            return float(theta), float(phi), float(lam)
-
-        # Otherwise, phases from elements and normalize
-        phi = np.angle(c) - np.angle(a)
-        phi = (phi + np.pi) % (2 * np.pi) - np.pi
-        lam = np.angle(b) - np.angle(a) - np.pi
-        lam = (lam + np.pi) % (2 * np.pi) - np.pi
-        return float(theta), float(phi), float(lam)
-
-    @staticmethod
-    def _A_from_c(c1, c2, c3):
-        """Two-qubit canonical interaction ``exp(-i/2 * (c1 XX + c2 YY + c3 ZZ))`` as a unitary."""
-        X = np.array([[0, 1], [1, 0]], complex)
-        Y = np.array([[0, -1j], [1j, 0]], complex)
-        Z = np.array([[1, 0], [0, -1]], complex)
-        XX = np.kron(X, X)
-        YY = np.kron(Y, Y)
-        ZZ = np.kron(Z, Z)
-        H = c1 * XX + c2 * YY + c3 * ZZ
-        # use exp via eig (4x4) for robustness
-        ew, EV = np.linalg.eig(1j * H)
-        A = EV @ np.diag(np.exp(ew)) @ np.linalg.inv(EV)
-        # project back to unitary (remove numeric drift)
-        return N_Qubit_Decomposition_Guided_Tree._polar_unitary(A)
-
-    # Factor K1, K2 → (2x2 ⊗ 2x2)
-    @staticmethod
-    def factor_local(K):
-        """Factor 4x4 unitary ``K`` into Kronecker product of two 2x2 unitaries (SVD on reshaped tensor)."""
-        # reshape to (2,2,2,2), SVD the (a,c ; b,d) unfolding
-        M = K.reshape(2, 2, 2, 2).transpose(0, 2, 1, 3).reshape(4, 4)
-        U, _, Vh = np.linalg.svd(M, full_matrices=False)
-        A = U[:, 0].reshape(2, 2)
-        B = Vh.conj().T[:, 0].reshape(2, 2)
-        return N_Qubit_Decomposition_Guided_Tree._polar_unitary(
-            A
-        ), N_Qubit_Decomposition_Guided_Tree._polar_unitary(B)
-
-    @staticmethod
-    def _magic_basis_plusYY():
-        """Magic basis matrix for two-qubit canonical form (Bell-like columns)."""
-        # Complex magic basis (matches A(c)=exp(-i/2*(c1 XX + c2 YY + c3 ZZ)) below)
-        # Columns are (|Φ+>, i|Φ->, i|Ψ+>, |Ψ->) up to harmless phases
-        return (1 / np.sqrt(2)) * np.array(
-            [[1, 0, 0, 1j], [0, 1j, 1, 0], [0, 1j, -1, 0], [1j, 0, 0, -1]],
-            dtype=complex,
-        )
-
-    @staticmethod
-    def _project_to_SO4(O):
-        """Nearest proper SO(4) rotation to real matrix ``O`` (SVD with det fix)."""
-        # nearest real orthogonal with det=+1
-        O = np.real_if_close(O, tol=1e5)
-        U, _, Vt = np.linalg.svd(O)
-        O = U @ Vt
-        if np.linalg.det(O) < 0:
-            O[:, 0] *= -1
-        return O
-
-    @staticmethod
-    def _clean_col_phases(W):
-        """Remove column-wise global phases from matrix ``W`` (largest-magnitude entry per column)."""
-        Wc = W.copy()
-        for j in range(Wc.shape[1]):
-            col = Wc[:, j]
-            k = np.argmax(np.abs(col))
-            if np.abs(col[k]) > 1e-14:
-                Wc[:, j] *= np.exp(-1j * np.angle(col[k]))
-        return Wc
-
-    @staticmethod
-    def closest_local_product(W4):
-        """Best product of single-qubit unitaries approximating 4x4 ``W4`` (via ``factor_local``)."""
-        A, B = N_Qubit_Decomposition_Guided_Tree.factor_local(W4)
-        return N_Qubit_Decomposition_Guided_Tree._global_phase_fix(
-            A
-        ), N_Qubit_Decomposition_Guided_Tree._global_phase_fix(B)
-
-    @staticmethod
-    def kak_u3s_around_cx(U, n, c, t, iters=3):
-        """KAK-style two-qubit block on control ``c`` and target ``t``: Weyl angles and U3 params (debug helper)."""
-        U4 = N_Qubit_Decomposition_Guided_Tree.trace_out_qubits(U, n, (c, t))
-        U4 = N_Qubit_Decomposition_Guided_Tree._global_phase_fix(U4)
-        from qiskit.synthesis import TwoQubitWeylDecomposition
-
-        twd = TwoQubitWeylDecomposition(U4)
-        c1, c2, c3 = twd.a, twd.b, twd.c
-        K1A, K1B, K2A, K2B = twd.K1l, twd.K1r, twd.K2l, twd.K2r
-        A = N_Qubit_Decomposition_Guided_Tree._A_from_c(c1, c2, c3)
-        U_rec = np.kron(K1A, K1B) @ A @ np.kron(K2A, K2B)
-        z = np.trace(U_rec.conj().T @ U4)
-        U_rec *= np.exp(1j * np.angle(z))
-        print("Frob err:", np.linalg.norm(U_rec - U4), c1, c2, c3)
-        thA_pre, phA_pre, laA_pre = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K2A.conj().T
-        )
-        thB_pre, phB_pre, laB_pre = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K2B.conj().T
-        )
-        thA_post, phA_post, laA_post = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K1A.conj().T
-        )  # left-apply ⇒ take dagger on outputs
-        thB_post, phB_post, laB_post = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz(
-            K1B.conj().T
-        )
-        return {
-            "c": (c1, c2, c3),
-            "pre": {
-                "A": (thA_pre / 2, phA_pre, laA_pre),
-                "B": (thB_pre / 2, phB_pre, laB_pre),
-            },
-            "post": {
-                "A": (thA_post / 2, phA_post, laA_post),
-                "B": (thB_post / 2, phB_post, laB_post),
-            },
-        }
-
-    def params_to_mat(self, params):
-        """Apply current gate structure to each target unitary with (optional) affine parameter scaling."""
-        allU = []
-        for U, pspace in zip(
-            self.Umtx, [None] if self.paramspace is None else self.paramspace
-        ):
-            U = U.copy()
-            scaled_params = (
-                np.sum(
-                    params.reshape(-1, 1 + len(pspace)) * np.array((1.0,) + pspace),
-                    axis=1,
-                )
-                if pspace is not None
-                else params
-            )
-            self.get_Circuit().apply_to(
-                scaled_params if pspace is not None else params, U
-            )
-            allU.append(U)
-        return allU
-
-    def OSR_with_local_alignment(
-        self, pairs, cuts, Fnorm, tol, rank, use_softmax, method="dual_annealing"
-    ):
-        """Optimize gate parameters to reduce OSR-based entanglement across ``cuts`` (optionally softmax-aggregated).
-
-        Uses cost variant 10 during optimization, then restores variant 3. Returns list of
-        ``(ceil_log2(rank), singular_spectrum)``-style entries per unitary and cut.
-        """
-        if len(pairs) != 0:
-            self.set_Cost_Function_Variant(10)
-            # self.Run_Decomposition(pairs, False)
-            self.set_Gate_Structure(self.get_circuit_from_pairs(pairs, False))
-            import scipy
-
-            param_bound = np.array(
-                ([2 * np.pi] + [1 / x for x in self.paramscale])
-                * self.get_Parameter_Num()
-            )
-
-            def cost(x):
-                allU = self.params_to_mat(x)
-                S = [
-                    N_Qubit_Decomposition_Guided_Tree.operator_schmidt_rank(
-                        U, self.qbit_num, cut, Fnorm, tol
-                    )[1]
-                    for U in allU
-                    for cut in cuts
-                ]
-                if use_softmax:
-                    return N_Qubit_Decomposition_Guided_Tree.cuts_softmax_rank_cost(
-                        S, rank
-                    )
-                else:
-                    return N_Qubit_Decomposition_Guided_Tree.avg_loss_for_rank(S, rank)
-
-            def jacobian(x):
-                allU = self.params_to_mat(x)
-                grad = np.zeros(len(x), dtype=float)
-                for Ubase, U, pspace in zip(
-                    self.Umtx,
-                    allU,
-                    [None] if self.paramspace is None else self.paramspace,
-                ):
-                    dL = N_Qubit_Decomposition_Guided_Tree.get_deriv_osr_entanglement(
-                        U, cuts, rank, use_softmax
-                    )
-                    basevec = np.array((1.0,) if pspace is None else (1.0,) + pspace)
-                    scaled_params = (
-                        np.sum(x.reshape(-1, 1 + len(pspace)) * basevec, axis=1)
-                        if pspace is not None
-                        else x
-                    )
-                    derivs = N_Qubit_Decomposition_Guided_Tree.param_derivs(
-                        self.get_Circuit(), Ubase, scaled_params
-                    )
-                    newgrad = np.array(
-                        [
-                            N_Qubit_Decomposition_Guided_Tree.real_trace_conj_dot(
-                                dL, deriv
-                            )
-                            for deriv in derivs
-                        ]
-                    )
-                    if pspace is not None:
-                        newgrad = (np.array(newgrad)[:, np.newaxis] * basevec).reshape(
-                            -1
-                        )
-                    grad += newgrad
-                return grad / len(self.Umtx)
-
-            if method == "differential_evolution":
-                best = scipy.optimize.differential_evolution(
-                    cost, [(0, x) for x in param_bound], maxiter=100, polish=False
-                )
-                best = scipy.optimize.minimize(
-                    cost, best.x, method="BFGS", jac=jacobian, options={"maxiter": 200}
-                )
-            elif method == "dual_annealing":
-                best = None
-                for seed in range(20):
-                    res = scipy.optimize.dual_annealing(
-                        cost, [(0, x) for x in param_bound], maxiter=100
-                    )  # , minimizer_kwargs={'jac': jacobian})
-                    if best is None or res.fun < best.fun:
-                        best = res
-            elif method == "basinhopping":
-                best = scipy.optimize.basinhopping(
-                    cost,
-                    np.random.rand(len(param_bound)) * param_bound,
-                    niter=50,
-                    stepsize=np.pi / 2,
-                    minimizer_kwargs={"jac": jacobian},
-                )
-            else:
-                best = min(
-                    [
-                        scipy.optimize.minimize(
-                            cost,
-                            np.random.rand(len(param_bound)) * param_bound,
-                            method="BFGS",
-                            jac=jacobian,
-                            options={"maxiter": 200},
-                        )
-                        for _ in range(20)
-                    ],
-                    key=lambda r: r.fun,
-                )
-            # print(best)
-            self.set_Cost_Function_Variant(3)
-            assert best is not None
-            allU = self.params_to_mat(best.x)
-        else:
-            allU = self.Umtx
-        return [
-            (N_Qubit_Decomposition_Guided_Tree.ceil_log2(rank), s)
-            for U in allU
-            for cut in cuts
-            for rank, s in (
-                N_Qubit_Decomposition_Guided_Tree.operator_schmidt_rank(
-                    U, self.qbit_num, cut, Fnorm, tol
-                ),
-            )
-        ]
-
-    def Run_Decomposition(self, pairs, finalizing=True):
-        """Run BFGS decomposition for CNOT structure ``pairs``; set ``self.err`` and return success vs tolerance."""
-        circ = self.get_circuit_from_pairs(pairs, finalizing)
-        self.set_Gate_Structure(circ)
-        self.set_Optimized_Parameters(
-            np.random.rand(self.get_Parameter_Num()) * (2 * np.pi)
-        )
-        super().Start_Decomposition()
-        if finalizing:
-            params = self.get_Optimized_Parameters()
-            self.err = self.Optimization_Problem(params)
-            return self.err < self.config.get("tolerance", 1e-8)
-
-    @staticmethod
-    def generate_insertions(curpath, topology, num_cnot):
-        """Yield CNOT insertion patterns: insert ``num_cnot`` topology pairs into sequence ``curpath``."""
-        import itertools
-
-        n = len(curpath)
-        nslots = n + 1
-        for places in itertools.combinations_with_replacement(range(nslots), num_cnot):
-            for pairs in itertools.product(topology, repeat=num_cnot):
-                out = []
-                j = 0  # index into inserted pairs
-                for slot in range(nslots):
-                    while j < num_cnot and places[j] == slot:
-                        out.append(pairs[j])
-                        j += 1
-                    if slot < n:
-                        out.append(curpath[slot])
-                yield tuple(out)
-
-    def Start_Decomposition(self):
-        """Beam-style search over CNOT prefixes guided by OSR stats; collects solutions in ``self.all_solutions``."""
-        import heapq, itertools
-
-        self.all_solutions = []
-        self.err = 1.0
-        stop_first_solution = self.config.get("stop_first_solution", True)
-        cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(self.qbit_num))
-        # because we have U already conjugate transposed, must use prefix order
-        B = self.config.get("beam", None)  # 8*len(self.topology))
-        max_depth = self.config.get("tree_level_max", 14)
-        tol = 1e-3
-        Fnorm = np.sqrt(1 << self.qbit_num)
-        best = []
-        visited = set()
-        all_ranks = list(range(min(2, self.qbit_num - 1)))
-
-        def get_osr_stats(path, rank, use_softmax):
-            """Return ``(min_cnots, rank_kappa_metric, raw_osr_list)`` for prefix ``path``."""
-            h = self.OSR_with_local_alignment(
-                path,
-                cuts,
-                Fnorm,
-                tol=tol,
-                rank=rank,
-                use_softmax=use_softmax,
-                method="basin_hopping",
-            )
-            min_cnots = max((x[0] for x in h), default=0)
-            ranktot = sum(x[0] for x in h)
-            kappa = sum(sum(y * y for y in x[1][1:]) for x in h)
-            return min_cnots, ranktot + kappa, h
-
-        def add_to_heap(path, parent_stats):
-            """Push ``path`` onto search heap if within depth and OSR bounds improve on ``parent_stats``."""
-            if len(path) > max_depth:
-                return False
-            if path in visited:
-                return False
-            visited.add(path)
-            if self.qbit_num > 1:
-                min_cnots, rankkappa = min(
-                    get_osr_stats(path, rank, use_sm)[:2]
-                    for (rank, use_sm) in itertools.product(all_ranks, (False,))
-                )  # (False, True)
-            else:
-                min_cnots, rankkappa = 0, 0.0
-            if parent_stats is not None and (min_cnots, rankkappa) >= parent_stats:
-                return False
-            heapq.heappush(best, (min_cnots, rankkappa, path))
-            return True
-
-        add_to_heap((), None)
-        while best:
-            # print(best[0])
-            min_cnots, rankkappa, curpath = heapq.heappop(best)
-            if min_cnots == 0:
-                # print(path)
-                for i in range(10):
-                    if self.Run_Decomposition(curpath):
-                        self.all_solutions.append(
-                            (self.get_Circuit(), self.get_Optimized_Parameters())
-                        )
-                        if stop_first_solution:
-                            return
-                        break
-                    # print("Looping", h)
-            num_cnot = 1
-            while True:
-                any_added = False
-                for newpath in N_Qubit_Decomposition_Guided_Tree.generate_insertions(
-                    curpath, self.topology, num_cnot
-                ):
-                    if add_to_heap(newpath, (min_cnots, rankkappa)):
-                        any_added = True
-                if any_added:
-                    break
-                num_cnot += 1
-                if len(curpath) + num_cnot > max_depth:
-                    break
-        self.set_Gate_Structure(Circuit(self.qbit_num))
-        self.set_Optimized_Parameters(np.array([]))
-        # print("No decomposition found within the given CNOT limit.")
-
-    """
-    def Start_Decomposition(self):
-        self.all_solutions = []
-        self.err = 1.0
-        stop_first_solution = self.config.get("stop_first_solution", True)
-        cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(self.qbit_num))
-        if self.topology is None:
-            self.topology = [(i, j) for i in range(self.qbit_num) for j in range(i+1, self.qbit_num)]
-        pair_affects = {
-            pair: [i for i,A in enumerate(cuts) if (pair[0] in A) ^ (pair[1] in A)]
-            for pair in self.topology
-        }
-        #because we have U already conjugate transposed, must use prefix order
-        B = self.config.get('beam', None)#8*len(self.topology))
-        max_depth = self.config.get('tree_level_max', 14)
-        tol = 1e-3
-        Fnorm = np.sqrt(1<<self.qbit_num)
-        prior_level_info = None
-        for depth in range(max_depth+1):
-            remaining = max_depth - depth
-            visited, seq_pairs_of, seq_dir_of, res = N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS_level(self.qbit_num, self.topology, prior_level_info, use_gl=False)
-            nextprefixes = []
-            for path in set(tuple(x[1]) for x in res):
-                curh = None if len(path)==0 else prefixes[path[:-1]]
-                check_cuts = pair_affects[tuple(sorted(path[-1]))] if not curh is None else range(len(cuts))
-                #samples = [max(x[0] for x in self.OSR_with_local_alignment(path, cuts, Fnorm, tol=tol)) for _ in range(5)]
-                #if len(set(samples)) != 1: print(samples)
-                h = self.OSR_with_local_alignment(path, cuts, Fnorm, tol=tol, use_softmax=False, method="dual_annealing")
-                min_cnots = max((x[0] for x in h), default=0)
-                print(path, h, N_Qubit_Decomposition_Guided_Tree.avg_loss([x[1] for x in h]), remaining, min_cnots)
-                if min_cnots == 0:
-                    #print(path)
-                    for i in range(10):
-                        if self.Run_Decomposition(path):
-                            self.all_solutions.append((self.get_Circuit(), self.get_Optimized_Parameters()))
-                            if stop_first_solution: return
-                            break
-                        #print("Looping", h)
-                if min_cnots > remaining: continue
-                if not curh is None:
-                    #print(path, [(h[i], curh[i]) for i in check_cuts])
-                    #if any(h[i][0] > curh[i][0] for i in check_cuts): continue
-                    if max((x[0] for x in curh), default=0) < min_cnots: continue
-                nextprefixes.append((path, h))
-            nextprefixes.sort(key=lambda t: (max((x[0] for x in t[1]), default=0), sum(x[0] for x in t[1]), N_Qubit_Decomposition_Guided_Tree.avg_loss([x[1] for x in t[1]])))
-            prefixes = {x[0]: x[1] for x in nextprefixes[:B]}
-            prior_level_info = (visited, seq_pairs_of, seq_dir_of, list(x[0] for x in reversed(res) if tuple(x[1]) in prefixes))
-        self.set_Gate_Structure(Circuit(self.qbit_num))
-        self.set_Optimized_Parameters(np.array([]))
-        #print("No decomposition found within the given CNOT limit.")
-    """
-
-    def get_Decomposition_Error(self):
-        """Last decomposition error (Frobenius / cost) from guided search or ``Run_Decomposition``."""
-        return self.err
-
-    @staticmethod
-    def compositions(total, parts):
-        """
-        All nonnegative integer tuples of length `parts` summing to `total`.
-        """
-        if parts == 1:
-            yield (total,)
-            return
-        for x in range(total + 1):
-            for rest in N_Qubit_Decomposition_Guided_Tree.compositions(
-                total - x, parts - 1
-            ):
-                yield (x,) + rest
-
-    @staticmethod
-    def solve_best_min_cnots(num_qubits, cuts, rank_kappa, topology, use_surplus=True):
-        """Minimize total CNOT count subject to per-cut edge coverage vs ``rank_kappa`` bounds; return best kappa."""
-        m = len(topology)
-        cut_to_edges = [
-            [i for i, z in enumerate(topology) if (z[0] in cut) != (z[1] in cut)]
-            for cut in cuts
-        ]
-        total = 0
-        best_kappa = None
-        while True:
-            for edge_counts in N_Qubit_Decomposition_Guided_Tree.compositions(total, m):
-                if all(
-                    sum(edge_counts[j] for j in cut_to_edge) >= cut_bound[0]
-                    for cut_to_edge, cut_bound in zip(cut_to_edges, rank_kappa)
-                ):
-                    new_kappa = 0.0
-                    for cut_to_edge, cut_bound in zip(cut_to_edges, rank_kappa):
-                        coverage = sum(edge_counts[j] for j in cut_to_edge)
-                        if use_surplus:
-                            new_kappa += cut_bound[1] * (coverage - cut_bound[0])
-                        else:
-                            new_kappa += cut_bound[1] * coverage
-                    best_kappa = (
-                        new_kappa if best_kappa is None else max(best_kappa, new_kappa)
-                    )
-            if best_kappa is not None:
-                break
-            total += 1
-        return total, best_kappa
-
-    @staticmethod
-    def solve_min_cnots(num_qubits, cuts, cut_bounds, topology):
-        """Smallest total CNOT budget such that each cut's crossing edges meet ``cut_bounds``."""
-        m = len(topology)
-        cut_to_edges = [
-            [i for i, z in enumerate(topology) if (z[0] in cut) != (z[1] in cut)]
-            for cut in cuts
-        ]
-        total = 0
-        while True:
-            for edge_counts in N_Qubit_Decomposition_Guided_Tree.compositions(total, m):
-                if all(
-                    sum(edge_counts[j] for j in cut_to_edge) >= cut_bound
-                    for cut_to_edge, cut_bound in zip(cut_to_edges, cut_bounds)
-                ):
-                    return total
-            total += 1
-
-    @staticmethod
-    def gen_all_min_cnots(
-        num_qbits, topology=None
-    ):  # OSR tells min CNOTs at most for 3 qubits 3, 4 qubits 6, 5 qubits 7
-        """Debug: print min CNOT solutions for all combinations of per-cut bounds (see ``solve_min_cnots``)."""
-        import itertools
-
-        cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(num_qbits))
-        min_cnot_bounds = [
-            2 * min(cut_size, num_qbits - cut_size)
-            for cut_size in (len(cut) for cut in cuts)
-        ]
-        if topology is None:
-            topology = [
-                (i, j) for i in range(num_qbits) for j in range(i + 1, num_qbits)
-            ]
-        for cnot_bounds in itertools.product(
-            *(range(bound + 1) for bound in min_cnot_bounds)
-        ):
-            # if tuple(sorted(cnot_bounds)) != cnot_bounds: continue
-            print(
-                cnot_bounds,
-                N_Qubit_Decomposition_Guided_Tree.solve_min_cnots(
-                    num_qbits, cuts, cnot_bounds, topology
-                ),
-            )
-
-
-# N_Qubit_Decomposition_Guided_Tree.gen_all_min_cnots(3); assert False
-# N_Qubit_Decomposition_Guided_Tree.build_sequence(); assert False
-# print(len(list(N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS(3, [(0,1),(1,2),])))); assert False
 def _topology_le_to_be(n_qubits, topology):
     """Convert a topology from squander LE convention to bqskit BE convention."""
     return [(n_qubits - 1 - i, n_qubits - 1 - j) for i, j in topology]
 
 
 def generate_squander_seqpam(squander_config, block_size):
-    """Build a bqskit SeqPAM workflow using Squander as the inner synthesis engine.
-
-    Partitioning uses squander's ILP (same logic as PartAM.SynthesizeWideCircuit),
-    with optional density-penalty weights for sparse 3-qubit blocks when
-    ``squander_config['size_density_weight']`` is True.
+    """Build a bqskit SeqPAM workflow using Squander as the inner synthesis engine with ILP partitioning.
 
     Args:
         squander_config: Config dict passed to SquanderSynthesisPass (bqskit-squander keys:
             ``strategy`` ("Tree_search"/"Tabu_search"), ``verbosity``,
             ``optimization_tolerance``, ``optimizer_engine``, etc.).
-            Also read by SquanderILPPartitioner: ``size_density_weight`` (bool),
-            ``sparse_penalty`` (float).
         block_size: Maximum block size for ILP partitioning and SubtopologySelectionPass.
 
     Returns:
@@ -1325,11 +132,10 @@ def generate_squander_seqpam(squander_config, block_size):
     from bqskit.compiler import Workflow, BasePass
 
     class SquanderILPPartitioner(BasePass):
-        """Partition a bqskit circuit using squander's ILP with PartAM density weights."""
+        """Partition a bqskit circuit using squander's ILP."""
 
-        def __init__(self, block_size, squander_config):
+        def __init__(self, block_size):
             self.block_size = block_size
-            self.squander_config = squander_config
 
         async def run(self, circuit, data):
             from bqskit.ir import Circuit as BQCircuit
@@ -1339,7 +145,6 @@ async def run(self, circuit, data):
             from squander.partitioning.ilp import (
                 get_all_partitions, _get_topo_order, ilp_global_optimal,
             )
-            from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping
 
             # Unfold any CircuitGate blocks (e.g. from a prior SubtopologySelectionPass)
             # so that bqskit op indices align 1:1 with squander gate indices after the
@@ -1355,13 +160,7 @@ async def run(self, circuit, data):
                 get_all_partitions(sqdr_circ, self.block_size)
             gate_dict = {i: gate for i, gate in enumerate(sqdr_circ.get_Gates())}
 
-            ilp_weights = None
-            if self.squander_config.get('size_density_weight', False):
-                sparse_penalty = float(self.squander_config.get('sparse_penalty', 3.0))
-                ilp_weights = qgd_Partition_Aware_Mapping._parts_to_density_weights(
-                    allparts, gate_dict, sparse_penalty=sparse_penalty
-                )
-            L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
+            L_parts, _ = ilp_global_optimal(allparts, g)
 
             bqskit_ops = list(flat_circuit.operations_with_cycles())
 
@@ -1410,7 +209,6 @@ async def run(self, circuit, data):
                     seen_parts.add(part_key)
                     sorted_parts.append((part_key, gate_idxs))
             sorted_parts.sort(key=lambda x: x[0])
-            print(f"[ILP Partitioner] {len(sorted_parts)} partitions, expanded gate counts: {[len(gi) for _, gi in sorted_parts]}")
 
             # Map gate_idx -> sorted partition index
             gate_to_part = {}
@@ -1450,15 +248,6 @@ async def run(self, circuit, data):
                     sub_1q.append_gate(op.gate, [0], op.params)
                     partitioned.append_circuit(sub_1q, list(op.location), as_circuit_gate=True)
 
-            # Sanity check: all gates from flat_circuit must appear exactly once
-            flat_check = partitioned.copy()
-            flat_check.unfold_all()
-            n_expected = len(bqskit_ops)
-            n_actual = flat_check.num_operations
-            if n_actual != n_expected:
-                print(f'[ILP Partitioner] WARNING: gate count mismatch! '
-                      f'expected={n_expected}, actual={n_actual}')
-
             circuit.become(partitioned, False)
 
     class SetPAMInitialPlacementPass(BasePass):
@@ -1477,7 +266,7 @@ async def run(self, circuit, data):
             data.placement = list(self.placement)
 
     squander    = SquanderSynthesisPass(squander_config=squander_config)
-    partitioner = SquanderILPPartitioner(block_size, squander_config)
+    partitioner = SquanderILPPartitioner(block_size)
     post_pam_seq: BasePass = PAMVerificationSequence(8)
     num_layout_passes = int(squander_config.get("num_layout_passes", 100))
     pam_initial_placement = squander_config.get("pam_initial_placement", None)
@@ -1493,7 +282,7 @@ async def run(self, circuit, data):
                     EmbedAllPermutationsPass(
                         inner_synthesis=squander,
                         input_perm=True,
-                        output_perm=True,
+                        output_perm=False,
                         vary_topology=False,
                     ),
                 ),
@@ -1508,7 +297,7 @@ async def run(self, circuit, data):
                 ForEachBlockPass(
                     EmbedAllPermutationsPass(
                         inner_synthesis=squander,
-                        input_perm=True,
+                        input_perm=False,
                         output_perm=True,
                         vary_topology=True,
                     ),
@@ -1664,10 +453,6 @@ def DecomposePartition(
                 level_limit_min=1,
                 topology=mini_topology,
             )
-        elif strategy == "TreeGuided":
-            cDecompose = N_Qubit_Decomposition_Guided_Tree(
-                Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology
-            )
         elif strategy == "Custom":
             cDecompose = N_Qubit_Decomposition_custom(
                 Umtx.conj().T, config=config, accelerator_num=0
@@ -1803,116 +588,16 @@ def PartitionDecompositionProcess(
         # remap the subcircuit to a smaller qubit register
         remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num)
 
-        if (
-            qbit_num > 3
-            and structure is None
-            and config.get("strategy", "") == "TreeGuided"
-        ):
-            circo = Circuit(qbit_num)
-            for gate in remapped_subcircuit.get_Gates():
-                circo.add_Gate(gate)
-            remapped_subcircuit = circo
-            partitioned_circuit, params, recombine_info, _ = (
-                qgd_Wide_Circuit_Optimization.make_all_partition_circuit(
-                    remapped_subcircuit, subcircuit_parameters, 3
-                )
-            )
-            optimized_circuits = []
-            subcircs = partitioned_circuit.get_Gates()
-            # first find the optimal CNOT decomposition
-            for innercirc in subcircs:
-                start_idx = innercirc.get_Parameter_Start_Index()
-                innercirc_parameters = params[
-                    start_idx : start_idx + innercirc.get_Parameter_Num()
-                ]
-                callback_fnc = (
-                    lambda x: qgd_Wide_Circuit_Optimization.CompareAndPickCircuits(
-                        [innercirc, *(z[0] for z in x)],
-                        [innercirc_parameters, *(z[1] for z in x)],
-                    )
-                )
-                optimized_circuits.append(
-                    callback_fnc(
-                        qgd_Wide_Circuit_Optimization.PartitionDecompositionProcess(
-                            innercirc,
-                            innercirc_parameters,
-                            {
-                                **config,
-                                "stop_first_solution": True,
-                                "tree_level_max": max(
-                                    0, CNOTGateCount(subcircuit, 0) - 1
-                                ),
-                            },
-                            structure=None,
-                        )
-                    )
-                )
-            parts, struct_idxs = (
-                qgd_Wide_Circuit_Optimization.recombine_all_partition_circuit(
-                    remapped_subcircuit,
-                    [x[0] for x in optimized_circuits],
-                    params,
-                    recombine_info,
-                )
-            )
-            # enumerate all solutions for each subcircuit in the optimal
-            all_sol_for_idx = []
-            for idx in struct_idxs:
-                innercirc = subcircs[idx]
-                start_idx = innercirc.get_Parameter_Start_Index()
-                innercirc_parameters = params[
-                    start_idx : start_idx + innercirc.get_Parameter_Num()
-                ]
-                callback_fnc = lambda x: x + [(innercirc, innercirc_parameters)]
-                all_sol_for_idx.append(
-                    callback_fnc(
-                        qgd_Wide_Circuit_Optimization.PartitionDecompositionProcess(
-                            innercirc,
-                            innercirc_parameters,
-                            {
-                                **config,
-                                "stop_first_solution": False,
-                                "tree_level_max": max(0, CNOTGateCount(subcircuit, 0)),
-                            },
-                            structure=None,
-                        )
-                    )
-                )
-            all_decomposed = []
-            import itertools
-
-            opt = qgd_Wide_Circuit_Optimization({**config, "max_partition_size": 3})
-            if np.prod([len(x) for x in all_sol_for_idx]) > 32:
-                import random
-
-                trycombs = [
-                    [random.choice(x) for x in all_sol_for_idx] for _ in range(32)
-                ]
-            else:
-                trycombs = itertools.product(*all_sol_for_idx)
-            for combination in trycombs:
-                structures = [
-                    qgd_Wide_Circuit_Optimization.copy_circuit_structure(x[0])
-                    for x in combination
-                ]
-                optcirc, optparams = opt._OptimizeWideCircuit(
-                    remapped_subcircuit, subcircuit_parameters, False, parts, structures
-                )
-                reoptcirc, reoptparams = opt._OptimizeWideCircuit(
-                    optcirc.get_Flat_Circuit(), optparams
-                )
-                all_decomposed.append((reoptcirc.get_Flat_Circuit(), reoptparams))
-        else:
-            if not structure is None:
-                structure = structure.Remap_Qbits(qbit_map, qbit_num)
+        if not structure is None:
+            structure = structure.Remap_Qbits(qbit_map, qbit_num)
 
-            # get the unitary representing the circuit
-            unitary = remapped_subcircuit.get_Matrix(subcircuit_parameters)
+        # get the unitary representing the circuit
+        unitary = remapped_subcircuit.get_Matrix(subcircuit_parameters)
 
-            # decompose a small unitary into a new circuit
-            all_decomposed = qgd_Wide_Circuit_Optimization.DecomposePartition(
-                unitary, config, mini_topology, structure=structure
-            )
+        # decompose a small unitary into a new circuit
+        all_decomposed = qgd_Wide_Circuit_Optimization.DecomposePartition(
+            unitary, config, mini_topology, structure=structure
+        )
         # create inverse qbit map:
         inverse_qbit_map = {}
         for key, value in qbit_map.items():
@@ -2746,7 +1431,56 @@ def route_circuit(self, circ: Circuit, orig_parameters: np.ndarray):
         """Map ``circ`` onto ``self.config['topology']`` using BQSKit SeQPAM, Qiskit SABRE, or Squander SABRE."""
         strategy = self.config.get("routing-strategy", "seqpam-ilp")
 
-        if strategy in ("seqpam-ilp", "seqpam-quick", "bqskit-sabre"):
+        if strategy == "seqpam-ilp":
+            from squander import Qiskit_IO
+            from squander.decomposition.qgd_Wide_Circuit_Optimization import generate_squander_seqpam
+            from bqskit.compiler import Compiler
+            from bqskit.compiler.machine import MachineModel
+            from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from bqskit.passes import SetModelPass
+            from qiskit import qasm2, QuantumCircuit
+
+            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
+            circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
+            bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
+
+            strategy_map = {"TreeSearch": "Tree_search", "TabuSearch": "Tabu_search"}
+            squander_config = {
+                "strategy": strategy_map.get(self.config.get("strategy", "TreeSearch"), "Tree_search"),
+                "optimization_tolerance": self.config.get("tolerance", 1e-8),
+                "verbosity": self.config.get("verbosity", 0),
+                "optimizer_engine": self.config.get("optimizer_engine", "BFGS"),
+                "Cost_Function_Variant": self.config.get("Cost_Function_Variant", 3),
+                "size_density_weight": True,
+                "sparse_penalty": self.config.get("sparse_penalty", 3.0),
+                "max_partition_size": self.max_partition_size,
+            }
+            block_size = self.max_partition_size
+
+            workflow = generate_squander_seqpam(squander_config, block_size)
+
+            with Compiler() as compiler:
+                routed_bqskit_circ, pass_data = compiler.compile(
+                    bqskit_circ, [SetModelPass(model), workflow], True
+                )
+
+            circuit_qiskit_routed = QuantumCircuit.from_qasm_str(
+                OPENQASM2Language().encode(routed_bqskit_circ)
+            )
+            Squander_remapped_circuit, parameters_remapped_circuit = (
+                Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_routed)
+            )
+            Squander_remapped_circuit = Squander_remapped_circuit.Remap_Qbits(
+                {i: j for i, j in enumerate(pass_data.placement)}
+            )
+            self.config["initial_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.initial_mapping
+            )
+            self.config["final_mapping"] = list(
+                pass_data.placement[x] for x in pass_data.final_mapping
+            )
+
+        elif strategy in ("seqpam-quick", "bqskit-sabre"):
             from squander import Qiskit_IO
             from bqskit import Circuit as BQSKitCircuit, compile
             from bqskit.compiler import Compiler
@@ -2809,14 +1543,6 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             mainflow = build_seqpam_mapping_optimization_workflow(
                 block_size=self.config["max_partition_size"]
             )
-            if strategy == "seqpam-ilp":
-                for curpass in mainflow._passes:
-                    if isinstance(curpass, IfThenElsePass):
-                        for i in range(len(curpass.on_true._passes)):
-                            if isinstance(curpass.on_true._passes[i], QuickPartitioner):
-                                curpass.on_true._passes[i] = SquanderPartitioner(
-                                    self.config["max_partition_size"]
-                                )
 
             routing_workflow = [
                 SetModelPass(model),  # attach hardware model to circuit

From a1e2f7ea6b982a4573fb6c0a8cc360d84c879105 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 8 May 2026 20:27:40 +0200
Subject: [PATCH 197/232] Fix error

---
 .../qgd_Wide_Circuit_Optimization.py          | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 6aad05e4f..fad50a547 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -96,10 +96,6 @@ def CNOTGateCount(circ: Circuit, max_gates: int = 0) -> int:
         )
     return num_cnots
 
-def _topology_le_to_be(n_qubits, topology):
-    """Convert a topology from squander LE convention to bqskit BE convention."""
-    return [(n_qubits - 1 - i, n_qubits - 1 - j) for i, j in topology]
-
 
 def generate_squander_seqpam(squander_config, block_size):
     """Build a bqskit SeqPAM workflow using Squander as the inner synthesis engine with ILP partitioning.
@@ -265,10 +261,11 @@ async def run(self, circuit, data):
                 )
             data.placement = list(self.placement)
 
+    from bqskit.passes import QuickPartitioner
     squander    = SquanderSynthesisPass(squander_config=squander_config)
     partitioner = SquanderILPPartitioner(block_size)
-    post_pam_seq: BasePass = PAMVerificationSequence(8)
-    num_layout_passes = int(squander_config.get("num_layout_passes", 100))
+    post_pam_seq: BasePass = PAMVerificationSequence(block_size)
+    num_layout_passes = int(squander_config.get("num_layout_passes", 3))
     pam_initial_placement = squander_config.get("pam_initial_placement", None)
 
     return Workflow(
@@ -293,7 +290,7 @@ async def run(self, circuit, data):
                 RestoreModelConnectivityPass(),
                 LogPass("Recaching permutation-aware synthesis results."),
                 SubtopologySelectionPass(block_size),
-                partitioner,
+                QuickPartitioner(block_size),
                 ForEachBlockPass(
                     EmbedAllPermutationsPass(
                         inner_synthesis=squander,
@@ -874,7 +871,7 @@ def OptimizeWideCircuit(
             )
 
             # Build BQSKit machine model from your topology
-            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
 
             # Convert squander circuit → qiskit → BQSKit
             # (BQSKit has a from_qiskit helper if you go via Qiskit IR)
@@ -947,7 +944,7 @@ def OptimizeWideCircuit(
             }
             block_size = self.max_partition_size
 
-            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
             circo = Qiskit_IO.get_Qiskit_Circuit(circ, parameters)
             bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
 
@@ -1440,7 +1437,7 @@ def route_circuit(self, circ: Circuit, orig_parameters: np.ndarray):
             from bqskit.passes import SetModelPass
             from qiskit import qasm2, QuantumCircuit
 
-            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
             circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
             bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
 
@@ -1530,7 +1527,7 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             from qiskit import qasm2, QuantumCircuit
 
             # Build BQSKit machine model from your topology
-            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
 
             # Convert squander circuit → qiskit → BQSKit
             # (BQSKit has a from_qiskit helper if you go via Qiskit IR)
@@ -1587,7 +1584,7 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             from bqskit.passes import SetModelPass
             from qiskit import qasm2, QuantumCircuit
 
-            model = MachineModel(circ.get_Qbit_Num(), _topology_le_to_be(circ.get_Qbit_Num(), self.config["topology"]))
+            model = MachineModel(circ.get_Qbit_Num(), self.config["topology"])
             circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters)
             bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo))
 

From 3af90b95d9e6cedda064ae660c7f2952e501e59f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 9 May 2026 10:00:04 +0200
Subject: [PATCH 198/232] Fix core counts

---
 .../qgd_Wide_Circuit_Optimization.py          | 106 +++++++++++-------
 1 file changed, 63 insertions(+), 43 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index fad50a547..10c0ebf1d 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -27,6 +27,20 @@
 from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE
 
 
+def _affinity_num_workers():
+    """Return CPU count visible to this process via sched affinity, falling back to cpu_count.
+
+    Use this to size BQSKit ``Compiler(num_workers=...)`` so it does not oversubscribe
+    when the job is bound (taskset/cgroup) to a subset of the machine's CPUs.
+    """
+    if hasattr(os, "sched_getaffinity"):
+        try:
+            return max(1, len(os.sched_getaffinity(0)))
+        except OSError:
+            pass
+    return max(1, mp.cpu_count())
+
+
 def extract_subtopology(involved_qbits, qbit_map, config):
     """Return topology edges restricted to ``involved_qbits``, with indices remapped via ``qbit_map``.
 
@@ -264,50 +278,56 @@ async def run(self, circuit, data):
     from bqskit.passes import QuickPartitioner
     squander    = SquanderSynthesisPass(squander_config=squander_config)
     partitioner = SquanderILPPartitioner(block_size)
-    post_pam_seq: BasePass = PAMVerificationSequence(block_size)
+    enable_pam_verification = bool(squander_config.get("enable_pam_verification", False))
     num_layout_passes = int(squander_config.get("num_layout_passes", 3))
     pam_initial_placement = squander_config.get("pam_initial_placement", None)
 
+    pam_verify_passes = (
+        [PAMVerificationSequence(block_size)] if enable_pam_verification else []
+    )
+
+    inner_passes = [
+        LogPass("Caching permutation-aware synthesis results."),
+        ExtractModelConnectivityPass(),
+        partitioner,
+        ForEachBlockPass(
+            EmbedAllPermutationsPass(
+                inner_synthesis=squander,
+                input_perm=True,
+                output_perm=False,
+                vary_topology=False,
+            ),
+        ),
+        LogPass("Preoptimizing with permutation-aware mapping."),
+        PAMRoutingPass(),
+        *pam_verify_passes,
+        UnfoldPass(),
+        RestoreModelConnectivityPass(),
+        LogPass("Recaching permutation-aware synthesis results."),
+        SubtopologySelectionPass(block_size),
+        QuickPartitioner(block_size),
+        ForEachBlockPass(
+            EmbedAllPermutationsPass(
+                inner_synthesis=squander,
+                input_perm=False,
+                output_perm=True,
+                vary_topology=True,
+            ),
+        ),
+        LogPass("Performing permutation-aware mapping."),
+        ApplyPlacement(),
+        SetPAMInitialPlacementPass(pam_initial_placement),
+        PAMLayoutPass(num_layout_passes),
+        PAMRoutingPass(0.1),
+        *pam_verify_passes,
+        ApplyPlacement(),
+        UnfoldPass(),
+    ]
+
     return Workflow(
         IfThenElsePass(
             NotPredicate(WidthPredicate(2)),
-            [
-                LogPass("Caching permutation-aware synthesis results."),
-                ExtractModelConnectivityPass(),
-                partitioner,
-                ForEachBlockPass(
-                    EmbedAllPermutationsPass(
-                        inner_synthesis=squander,
-                        input_perm=True,
-                        output_perm=False,
-                        vary_topology=False,
-                    ),
-                ),
-                LogPass("Preoptimizing with permutation-aware mapping."),
-                PAMRoutingPass(),
-                post_pam_seq,
-                UnfoldPass(),
-                RestoreModelConnectivityPass(),
-                LogPass("Recaching permutation-aware synthesis results."),
-                SubtopologySelectionPass(block_size),
-                QuickPartitioner(block_size),
-                ForEachBlockPass(
-                    EmbedAllPermutationsPass(
-                        inner_synthesis=squander,
-                        input_perm=False,
-                        output_perm=True,
-                        vary_topology=True,
-                    ),
-                ),
-                LogPass("Performing permutation-aware mapping."),
-                ApplyPlacement(),
-                SetPAMInitialPlacementPass(pam_initial_placement),
-                PAMLayoutPass(num_layout_passes),
-                PAMRoutingPass(0.1),
-                post_pam_seq,
-                ApplyPlacement(),
-                UnfoldPass(),
-            ],
+            inner_passes,
         ),
         name="SeqPAM Mapping",
     )
@@ -896,7 +916,7 @@ def OptimizeWideCircuit(
                 LogErrorPass(),
             ]
 
-            with Compiler() as compiler:
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
                 routed_bqskit_circ, pass_data = compiler.compile(
                     bqskit_circ, compilation_workflow, True
                 )
@@ -950,7 +970,7 @@ def OptimizeWideCircuit(
 
             workflow = generate_squander_seqpam(squander_config, block_size)
 
-            with Compiler() as compiler:
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
                 routed_bqskit_circ = compiler.compile(
                     bqskit_circ, [SetModelPass(model), workflow]
                 )
@@ -1456,7 +1476,7 @@ def route_circuit(self, circ: Circuit, orig_parameters: np.ndarray):
 
             workflow = generate_squander_seqpam(squander_config, block_size)
 
-            with Compiler() as compiler:
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
                 routed_bqskit_circ, pass_data = compiler.compile(
                     bqskit_circ, [SetModelPass(model), workflow], True
                 )
@@ -1553,7 +1573,7 @@ async def run(self, circuit: BQSKitCircuit, data=None):
                 ),  # SABRE-style routing
             ]
 
-            with Compiler() as compiler:
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
                 routed_bqskit_circ, pass_data = compiler.compile(
                     bqskit_circ, routing_workflow, True
                 )
@@ -1601,7 +1621,7 @@ async def run(self, circuit: BQSKitCircuit, data=None):
             }
             workflow = generate_squander_seqpam(squander_config, self.max_partition_size)
 
-            with Compiler() as compiler:
+            with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler:
                 routed_bqskit_circ, pass_data = compiler.compile(
                     bqskit_circ, [SetModelPass(model), workflow], True
                 )

From bc1ca53cac62acc32553722d3d5ab0792ea55ed8 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 9 May 2026 16:47:47 +0200
Subject: [PATCH 199/232] Improve PartAM routing layout heuristics

---
 .../sabre_router/include/sabre_router.hpp     |   5 +-
 .../src-cpp/sabre_router/sabre_router.cpp     |  20 ++-
 squander/synthesis/PartAM.py                  | 164 ++++++++++++------
 squander/synthesis/bindings.cpp               |   4 +-
 4 files changed, 138 insertions(+), 55 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 7a0154e23..ab15c0d27 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -94,6 +94,8 @@ struct SabreConfig {
     double three_qubit_exit_weight = 1.0;
     int boundary_beam_width = 1;
     int boundary_beam_depth = 1;
+    int layout_trial_boundary_beam_width = 1;
+    int layout_trial_boundary_beam_depth = 1;
 };
 
 struct RouteStep {
@@ -383,7 +385,8 @@ class SabreRouter {
         const std::vector<std::vector<int>>& parents_graph,
         bool reverse,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
-        SwapCache* swap_cache
+        SwapCache* swap_cache,
+        bool final_route
     ) const;
 
     // Check if partition is single-qubit
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index d1444a26d..90aea9b3a 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1436,7 +1436,8 @@ size_t SabreRouter::boundary_beam_select_index(
     const std::vector<std::vector<int>>& parents_graph,
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
-    SwapCache* swap_cache
+    SwapCache* swap_cache,
+    bool final_route
 ) const {
     size_t fallback_idx = 0;
     for (size_t i = 1; i < scores.size(); i++) {
@@ -1445,8 +1446,18 @@ size_t SabreRouter::boundary_beam_select_index(
         }
     }
 
-    const int beam_width = std::max(1, config_.boundary_beam_width);
-    const int beam_depth = std::max(1, config_.boundary_beam_depth);
+    const int beam_width = std::max(
+        1,
+        final_route
+            ? config_.boundary_beam_width
+            : config_.layout_trial_boundary_beam_width
+    );
+    const int beam_depth = std::max(
+        1,
+        final_route
+            ? config_.boundary_beam_depth
+            : config_.layout_trial_boundary_beam_depth
+    );
     if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) {
         return fallback_idx;
     }
@@ -1784,7 +1795,8 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             pg,
             reverse,
             canonical_data,
-            &swap_cache
+            &swap_cache,
+            route_trace != nullptr
         );
         const auto& best = *candidates[best_ci];
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 6cabc96c3..082adbb99 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -163,6 +163,9 @@ def __init__(self, config):
         self.config.setdefault('three_qubit_exit_weight', 1.0)
         self.config.setdefault('boundary_beam_width', 1)
         self.config.setdefault('boundary_beam_depth', 1)
+        self.config.setdefault('layout_trial_boundary_beam_width', None)
+        self.config.setdefault('layout_trial_boundary_beam_depth', None)
+        self.config.setdefault('initial_layout_seed_pair_top_k', 8)
         self.config['partition_weight_model'] = 'window_turnover'
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
@@ -1113,6 +1116,20 @@ def _run_layout_trials_cpp(
             cfg.boundary_beam_depth = self.config.get(
                 'boundary_beam_depth', 1
             )
+        if hasattr(cfg, 'layout_trial_boundary_beam_width'):
+            layout_beam_width = self.config.get(
+                'layout_trial_boundary_beam_width', None
+            )
+            if layout_beam_width is None:
+                layout_beam_width = cfg.boundary_beam_width
+            cfg.layout_trial_boundary_beam_width = layout_beam_width
+        if hasattr(cfg, 'layout_trial_boundary_beam_depth'):
+            layout_beam_depth = self.config.get(
+                'layout_trial_boundary_beam_depth', None
+            )
+            if layout_beam_depth is None:
+                layout_beam_depth = cfg.boundary_beam_depth
+            cfg.layout_trial_boundary_beam_depth = layout_beam_depth
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -3071,14 +3088,17 @@ def _compute_seeded_layout(self, optimized_partitions, D, N, circ):
         if not self.topology:
             return np.arange(N)
 
-        # --- build gate-level interaction graph from circuit CNOT pairs ---
+        # --- build gate-level interaction graph from all multi-qubit gates ---
         gate_edges = set()
         for g in circ.get_Gates():
-            gname = str(type(g).__name__)
-            if 'CNOT' in gname or 'CX' in gname:
-                ctrl = g.get_Control_Qbit()
-                tgt = g.get_Target_Qbit()
-                gate_edges.add((min(ctrl, tgt), max(ctrl, tgt)))
+            qbits = list(g.get_Involved_Qbits())
+            if len(qbits) < 2:
+                continue
+            for i in range(len(qbits)):
+                for j in range(i + 1, len(qbits)):
+                    gate_edges.add(
+                        (min(qbits[i], qbits[j]), max(qbits[i], qbits[j]))
+                    )
 
         if not gate_edges:
             return np.arange(N)
@@ -3186,59 +3206,105 @@ def _greedy_seeded_layout(self, optimized_partitions, D, N):
         if not interaction_weight:
             return np.arange(N)
 
-        pi = np.arange(N)
-        placed_logical = set()
-        placed_physical = set()
+        logical_degree = defaultdict(float)
+        for (u, v), weight in interaction_weight.items():
+            logical_degree[u] += weight
+            logical_degree[v] += weight
+
+        physical_centrality = np.sum(D, axis=1)
+        ranked_logical_pairs = sorted(
+            interaction_weight.items(),
+            key=lambda item: (
+                -item[1],
+                -(logical_degree[item[0][0]] + logical_degree[item[0][1]]),
+                item[0],
+            ),
+        )
+        seed_pair_top_k = int(
+            self.config.get('initial_layout_seed_pair_top_k', 8)
+        )
+        if seed_pair_top_k > 0:
+            ranked_logical_pairs = ranked_logical_pairs[:seed_pair_top_k]
+
+        physical_edges = sorted(
+            [(int(u), int(v)) for u, v in self.topology],
+            key=lambda edge: (
+                physical_centrality[edge[0]] + physical_centrality[edge[1]],
+                edge,
+            ),
+        )
+
+        def layout_score(pi):
+            total = 0.0
+            for (u, v), weight in interaction_weight.items():
+                total += weight * D[int(pi[u])][int(pi[v])]
+            return total
 
-        (q1, q2), _ = max(interaction_weight.items(), key=lambda x: x[1])
-        p1, p2 = self.topology[0]
+        def build_layout(q1, q2, p1, p2):
+            pi = np.arange(N)
+            placed_logical = {q1, q2}
+            placed_physical = {p1, p2}
 
-        holder1 = np.where(pi == p1)[0][0]
-        pi[q1], pi[holder1] = p1, pi[q1]
-        holder2 = np.where(pi == p2)[0][0]
-        pi[q2], pi[holder2] = p2, pi[q2]
-        placed_logical.update([q1, q2])
-        placed_physical.update([p1, p2])
+            holder1 = int(np.where(pi == p1)[0][0])
+            pi[q1], pi[holder1] = p1, pi[q1]
+            holder2 = int(np.where(pi == p2)[0][0])
+            pi[q2], pi[holder2] = p2, pi[q2]
 
-        remaining = [q for q in range(N) if q not in placed_logical]
+            remaining = [q for q in range(N) if q not in placed_logical]
+
+            def logical_frontier_score(q):
+                return sum(
+                    interaction_weight.get((min(q, pq), max(q, pq)), 0.0)
+                    for pq in placed_logical
+                )
 
-        def _score(q):
-            return sum(
-                interaction_weight.get((min(q, pq), max(q, pq)), 0.0)
-                for pq in placed_logical
+            remaining.sort(
+                key=lambda q: (-logical_frontier_score(q), -logical_degree[q], q)
             )
 
-        remaining.sort(key=_score, reverse=True)
+            for logical_q in remaining:
+                best_physical = None
+                best_key = None
 
-        for logical_q in remaining:
-            best_physical = None
-            best_dist = float('inf')
+                for physical_q in range(N):
+                    if physical_q in placed_physical:
+                        continue
 
-            for physical_q in range(N):
-                if physical_q in placed_physical:
-                    continue
+                    total_dist = 0.0
+                    total_w = 0.0
+                    for other_q in placed_logical:
+                        key = (min(logical_q, other_q), max(logical_q, other_q))
+                        w = interaction_weight.get(key, 0.0)
+                        if w > 0:
+                            total_dist += D[physical_q][pi[other_q]] * w
+                            total_w += w
+
+                    avg = total_dist / total_w if total_w > 0 else 0.0
+                    candidate_key = (avg, physical_centrality[physical_q], physical_q)
+                    if best_key is None or candidate_key < best_key:
+                        best_key = candidate_key
+                        best_physical = physical_q
+
+                if best_physical is not None:
+                    holder = int(np.where(pi == best_physical)[0][0])
+                    pi[logical_q], pi[holder] = best_physical, pi[logical_q]
+                    placed_logical.add(logical_q)
+                    placed_physical.add(best_physical)
 
-                total_dist = 0.0
-                total_w = 0.0
-                for other_q in placed_logical:
-                    key = (min(logical_q, other_q), max(logical_q, other_q))
-                    w = interaction_weight.get(key, 0.0)
-                    if w > 0:
-                        total_dist += D[physical_q][pi[other_q]] * w
-                        total_w += w
-
-                avg = total_dist / total_w if total_w > 0 else 0.0
-                if avg < best_dist:
-                    best_dist = avg
-                    best_physical = physical_q
-
-            if best_physical is not None:
-                holder = np.where(pi == best_physical)[0][0]
-                pi[logical_q], pi[holder] = best_physical, pi[logical_q]
-                placed_logical.add(logical_q)
-                placed_physical.add(best_physical)
+            return pi
 
-        return pi
+        best_pi = None
+        best_score = float('inf')
+        for (q1, q2), _ in ranked_logical_pairs:
+            for p1, p2 in physical_edges:
+                for seed in ((q1, q2, p1, p2), (q1, q2, p2, p1)):
+                    pi = build_layout(*seed)
+                    score = layout_score(pi)
+                    if score < best_score:
+                        best_score = score
+                        best_pi = pi
+
+        return best_pi if best_pi is not None else np.arange(N)
 
 
     def generate_DAG_levels(self, circuit):
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 95e8d4630..ecb9ab049 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -136,7 +136,9 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
         .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight)
         .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width)
-        .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth);
+        .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth)
+        .def_readwrite("layout_trial_boundary_beam_width", &SabreConfig::layout_trial_boundary_beam_width)
+        .def_readwrite("layout_trial_boundary_beam_depth", &SabreConfig::layout_trial_boundary_beam_depth);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From c334197aaa0e7b5a56256dfa98c44e53bb4d29cf Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 9 May 2026 16:57:24 +0200
Subject: [PATCH 200/232] Fix SeqPAM ILP BQSKit block emission

---
 .../qgd_Wide_Circuit_Optimization.py          | 36 +++++++------------
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 10c0ebf1d..82d66e99e 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -150,7 +150,7 @@ def __init__(self, block_size):
         async def run(self, circuit, data):
             from bqskit.ir import Circuit as BQCircuit
             from bqskit.ir.lang.qasm2 import OPENQASM2Language
-            from qiskit import QuantumCircuit as QkCircuit, qasm2 as qasm2_module
+            from qiskit import QuantumCircuit as QkCircuit
             from squander import Qiskit_IO
             from squander.partitioning.ilp import (
                 get_all_partitions, _get_topo_order, ilp_global_optimal,
@@ -179,7 +179,12 @@ async def run(self, circuit, data):
             sqc_prepost = {x[0]: x for x in sq_chains
                            if x[0] in sqc_pre and x[-1] in sqc_post}
 
-            # Build expanded gate_idxs per ILP partition (include surrounding 1q gates)
+            # Build expanded gate_idxs per ILP partition.  The ILP operates on a
+            # graph with single-qubit chains contracted out, so only reinsert
+            # chains that are enclosed by a selected partition.  Do not absorb
+            # every intermediate operation on these qubits: BQSKit sees each
+            # CircuitGate as a synthesis block, and broad expansion creates
+            # large overlapping blocks that are expensive and can duplicate work.
             expanded = {}
             for i in L_parts:
                 part = allparts[i]
@@ -192,32 +197,19 @@ async def run(self, circuit, data):
                 gate_idxs = frozenset.union(part, *(sqc_prepost[v] for v in surrounded))
                 expanded[i] = gate_idxs
 
-            # Further expand: include ALL intermediate gates on partition qubits
-            for i in L_parts:
-                gate_idxs = expanded[i]
-                part_qubits = set()
-                for gi in gate_idxs:
-                    part_qubits.update(gate_dict[gi].get_Involved_Qbits())
-                lo = min(gate_idxs)
-                hi = max(gate_idxs)
-                extra = set()
-                for gi in range(lo, hi + 1):
-                    if gi not in gate_idxs:
-                        gq = set(gate_dict[gi].get_Involved_Qbits())
-                        if gq & part_qubits:
-                            extra.add(gi)
-                if extra:
-                    expanded[i] = gate_idxs | frozenset(extra)
-
             # Sort partitions by their minimum gate index to preserve original order
             seen_parts = set()
             sorted_parts = []
+            claimed_gates = set()
             for i in L_parts:
-                gate_idxs = expanded[i]
+                gate_idxs = expanded[i] - claimed_gates
+                if not gate_idxs:
+                    continue
                 part_key = min(gate_idxs)
                 if part_key not in seen_parts:
                     seen_parts.add(part_key)
                     sorted_parts.append((part_key, gate_idxs))
+                    claimed_gates.update(gate_idxs)
             sorted_parts.sort(key=lambda x: x[0])
 
             # Map gate_idx -> sorted partition index
@@ -254,9 +246,7 @@ async def run(self, circuit, data):
                     partitioned.append_circuit(sub, global_qudits, as_circuit_gate=True)
 
                 elif pidx < 0:
-                    sub_1q = BQCircuit(1)
-                    sub_1q.append_gate(op.gate, [0], op.params)
-                    partitioned.append_circuit(sub_1q, list(op.location), as_circuit_gate=True)
+                    partitioned.append_gate(op.gate, op.location, op.params)
 
             circuit.become(partitioned, False)
 

From 6c38deb4419fdbb1ca2a03c0ac1cb594a2ba795d Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 9 May 2026 17:01:21 +0200
Subject: [PATCH 201/232] Fix SeqPAM ILP PAM permutation blocks

---
 .../qgd_Wide_Circuit_Optimization.py          | 38 ++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index 82d66e99e..e6f17568d 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -218,9 +218,20 @@ async def run(self, circuit, data):
                 for gi in gate_idxs:
                     gate_to_part[gi] = pidx
 
+            # PAMRoutingPass expects permutation data for every non-barrier
+            # operation it may execute.  Keep gates outside ILP blocks wrapped as
+            # CircuitGates too, but group whole unclaimed 1q chains to avoid
+            # spawning one BQSKit task per single-qubit gate.
+            unclaimed_chain_by_gate = {}
+            for chain in sq_chains:
+                if all(gi not in gate_to_part for gi in chain):
+                    for gi in chain:
+                        unclaimed_chain_by_gate[gi] = chain
+
             # Build partitioned circuit by iterating gates in original order
             partitioned = BQCircuit(circuit.num_qudits, circuit.radixes)
             built_parts = set()
+            built_chains = set()
 
             for gi, (_, op) in enumerate(bqskit_ops):
                 pidx = gate_to_part.get(gi, -1)
@@ -246,7 +257,32 @@ async def run(self, circuit, data):
                     partitioned.append_circuit(sub, global_qudits, as_circuit_gate=True)
 
                 elif pidx < 0:
-                    partitioned.append_gate(op.gate, op.location, op.params)
+                    chain = unclaimed_chain_by_gate.get(gi)
+                    if chain is not None:
+                        if chain in built_chains:
+                            continue
+                        built_chains.add(chain)
+                        global_qudits = list(gate_dict[chain[0]].get_Involved_Qbits())
+                        local_map = {gq: l for l, gq in enumerate(global_qudits)}
+                        sub = BQCircuit(len(global_qudits))
+                        for ggi in chain:
+                            _, gop = bqskit_ops[ggi]
+                            sub.append_gate(
+                                gop.gate,
+                                [local_map[q] for q in gop.location],
+                                gop.params,
+                            )
+                        partitioned.append_circuit(
+                            sub, global_qudits, as_circuit_gate=True
+                        )
+                    else:
+                        sub = BQCircuit(len(op.location))
+                        sub.append_gate(
+                            op.gate, list(range(len(op.location))), op.params
+                        )
+                        partitioned.append_circuit(
+                            sub, list(op.location), as_circuit_gate=True
+                        )
 
             circuit.become(partitioned, False)
 

From 824ffa4736a21d1c89d999af35dd08b27958ea3e Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 9 May 2026 19:34:30 +0200
Subject: [PATCH 202/232] Add PartAM routing trace and random trials

---
 .../sabre_router/include/sabre_router.hpp     |  32 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 248 +++++++---
 squander/synthesis/PartAM.py                  | 461 ++++++++++++++++--
 squander/synthesis/bindings.cpp               |   4 +-
 4 files changed, 604 insertions(+), 141 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index ab15c0d27..b6c50d3d0 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -96,6 +96,8 @@ struct SabreConfig {
     int boundary_beam_depth = 1;
     int layout_trial_boundary_beam_width = 1;
     int layout_trial_boundary_beam_depth = 1;
+    bool adaptive_boundary_beam = true;
+    double successor_handoff_weight = 1.0;
 };
 
 struct RouteStep {
@@ -389,6 +391,27 @@ class SabreRouter {
         bool final_route
     ) const;
 
+    int boundary_beam_risk(
+        const std::vector<int>& F_snapshot,
+        const std::vector<const CandidateData*>& candidates,
+        const std::vector<std::vector<int>>& children_graph
+    ) const;
+
+    void collect_immediate_multi_successors(
+        int partition_idx,
+        const std::vector<std::vector<int>>& children_graph,
+        std::vector<int>& successors
+    ) const;
+
+    double successor_handoff_cost(
+        int selected_partition_idx,
+        const std::vector<int>& pi,
+        const std::vector<int>& F_after,
+        bool reverse,
+        const std::vector<std::vector<int>>& children_graph,
+        const std::unordered_map<int, CanonicalEntry>& canonical_data
+    ) const;
+
     // Check if partition is single-qubit
     inline bool partition_is_single(int partition_idx) const {
         return layout_partitions_[partition_idx].is_single;
@@ -402,14 +425,7 @@ class SabreRouter {
     // Random permutation of [0..N-1]
     std::vector<int> random_permutation(int n, std::mt19937& rng) const;
 
-    // Apply a small random walk on topology edges to diversify a seeded layout.
-    std::vector<int> perturb_layout(
-        const std::vector<int>& base,
-        int num_swaps,
-        std::mt19937& rng
-    ) const;
-
-    // Stratified initial-layout sampling with the same total trial budget.
+    // Initial-layout sampling: trial 0 uses the seed, later trials are random.
     std::vector<int> sample_initial_layout(
         int trial_idx,
         int n_trials,
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 90aea9b3a..f5e7c9093 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -168,82 +168,16 @@ std::vector<int> SabreRouter::random_permutation(int n, std::mt19937& rng) const
     return perm;
 }
 
-std::vector<int> SabreRouter::perturb_layout(
-    const std::vector<int>& base,
-    int num_swaps,
-    std::mt19937& rng
-) const {
-    if (num_swaps <= 0 || adj_.empty()) {
-        return base;
-    }
-
-    std::vector<std::pair<int, int>> swaps;
-    swaps.reserve(num_swaps);
-    std::uniform_int_distribution<int> phys_dist(0, N_ - 1);
-
-    for (int step = 0; step < num_swaps; step++) {
-        int phys = phys_dist(rng);
-        int retries = 0;
-        while (adj_[phys].empty() && retries < N_) {
-            phys = (phys + 1) % N_;
-            retries++;
-        }
-        if (adj_[phys].empty()) {
-            break;
-        }
-        std::uniform_int_distribution<int> nb_dist(
-            0, static_cast<int>(adj_[phys].size()) - 1
-        );
-        int nb = adj_[phys][nb_dist(rng)];
-        swaps.push_back({std::min(phys, nb), std::max(phys, nb)});
-    }
-
-    if (swaps.empty()) {
-        return base;
-    }
-
-    return apply_swaps_to_pi(base, swaps);
-}
-
 std::vector<int> SabreRouter::sample_initial_layout(
     int trial_idx,
     int n_trials,
     const std::vector<int>& seeded_pi,
     std::mt19937& rng
 ) const {
-    if (n_trials <= 1) {
+    if (n_trials <= 1 || trial_idx == 0) {
         return seeded_pi;
     }
 
-    std::vector<int> mirrored_pi(N_);
-    for (int q = 0; q < N_; q++) {
-        mirrored_pi[q] = (N_ - 1) - seeded_pi[q];
-    }
-
-    if (trial_idx == 0) {
-        return seeded_pi;
-    }
-    if (trial_idx == 1) {
-        return mirrored_pi;
-    }
-
-    const int local_cutoff = std::max(
-        3, static_cast<int>(std::ceil(n_trials * 0.6))
-    );
-    if (trial_idx < local_cutoff) {
-        const int local_idx = trial_idx - 2;
-        const int band_idx = local_idx / 2;
-        const int local_budget = std::max(1, local_cutoff - 2);
-        const double phase = static_cast<double>(band_idx)
-            / std::max(1, local_budget / 2);
-        const int num_swaps = (phase < 0.5)
-            ? (1 + (band_idx % 3))
-            : (4 + (band_idx % 5));
-        const std::vector<int>& base =
-            (local_idx % 2 == 0) ? seeded_pi : mirrored_pi;
-        return perturb_layout(base, num_swaps, rng);
-    }
-
     return random_permutation(N_, rng);
 }
 
@@ -1425,6 +1359,131 @@ SabreRouter::advance_layout_frontier(
     return {std::move(F_next), std::move(resolved_next)};
 }
 
+void SabreRouter::collect_immediate_multi_successors(
+    int partition_idx,
+    const std::vector<std::vector<int>>& children_graph,
+    std::vector<int>& successors
+) const {
+    successors.clear();
+    std::vector<uint8_t> seen(num_partitions_, 0);
+    std::deque<int> queue;
+    for (int child : children_graph[partition_idx]) {
+        queue.push_back(child);
+    }
+
+    while (!queue.empty()) {
+        const int child = queue.front();
+        queue.pop_front();
+        if (child < 0 || child >= num_partitions_ || seen[child]) {
+            continue;
+        }
+        seen[child] = 1;
+
+        if (layout_partitions_[child].is_single) {
+            for (int grandchild : children_graph[child]) {
+                queue.push_back(grandchild);
+            }
+        } else {
+            successors.push_back(child);
+        }
+    }
+}
+
+int SabreRouter::boundary_beam_risk(
+    const std::vector<int>& F_snapshot,
+    const std::vector<const CandidateData*>& candidates,
+    const std::vector<std::vector<int>>& children_graph
+) const {
+    int risk = 0;
+    for (const auto* cand : candidates) {
+        if (cand->involved_qbits.size() >= 3) {
+            risk = std::max(risk, 1);
+            break;
+        }
+    }
+
+    auto support_turnover = [&](int a, int b) {
+        const auto& as = layout_partitions_[a].involved_qbits;
+        const auto& bs = layout_partitions_[b].involved_qbits;
+        if (as.size() < 2 || bs.size() < 2) {
+            return 0;
+        }
+        int overlap = 0;
+        for (int qa : as) {
+            for (int qb : bs) {
+                if (qa == qb) {
+                    overlap++;
+                    break;
+                }
+            }
+        }
+        const int min_size = static_cast<int>(std::min(as.size(), bs.size()));
+        return min_size - overlap;
+    };
+
+    std::vector<int> successors;
+    for (int p : F_snapshot) {
+        collect_immediate_multi_successors(p, children_graph, successors);
+        for (int child : successors) {
+            const int turnover = support_turnover(p, child);
+            const int min_size = static_cast<int>(std::min(
+                layout_partitions_[p].involved_qbits.size(),
+                layout_partitions_[child].involved_qbits.size()
+            ));
+            if (turnover >= 2 || (min_size >= 3 && turnover >= min_size - 1)) {
+                risk = std::max(risk, 2);
+            }
+        }
+    }
+
+    if (risk > 0 && F_snapshot.size() > 2) {
+        risk = std::max(risk, 2);
+    }
+    return risk;
+}
+
+double SabreRouter::successor_handoff_cost(
+    int selected_partition_idx,
+    const std::vector<int>& pi,
+    const std::vector<int>& F_after,
+    bool reverse,
+    const std::vector<std::vector<int>>& children_graph,
+    const std::unordered_map<int, CanonicalEntry>& canonical_data
+) const {
+    if (config_.successor_handoff_weight <= 0.0 || F_after.empty()) {
+        return 0.0;
+    }
+
+    std::vector<int> successors;
+    collect_immediate_multi_successors(
+        selected_partition_idx,
+        children_graph,
+        successors
+    );
+    if (successors.empty()) {
+        return 0.0;
+    }
+
+    double total = 0.0;
+    int count = 0;
+    for (int child : successors) {
+        if (std::find(F_after.begin(), F_after.end(), child) == F_after.end()) {
+            continue;
+        }
+        const double cost = future_partition_cost(
+            child,
+            pi,
+            reverse,
+            canonical_data
+        );
+        if (std::isfinite(cost)) {
+            total += cost;
+            count++;
+        }
+    }
+    return count > 0 ? total / static_cast<double>(count) : 0.0;
+}
+
 size_t SabreRouter::boundary_beam_select_index(
     const std::vector<const CandidateData*>& candidates,
     const std::vector<double>& scores,
@@ -1446,30 +1505,37 @@ size_t SabreRouter::boundary_beam_select_index(
         }
     }
 
-    const int beam_width = std::max(
+    const int max_beam_width = std::max(
         1,
         final_route
             ? config_.boundary_beam_width
             : config_.layout_trial_boundary_beam_width
     );
-    const int beam_depth = std::max(
+    const int max_beam_depth = std::max(
         1,
         final_route
             ? config_.boundary_beam_depth
             : config_.layout_trial_boundary_beam_depth
     );
-    if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) {
-        return fallback_idx;
-    }
-
-    bool has_three_qubit_candidate = false;
-    for (const auto* cand : candidates) {
-        if (cand->involved_qbits.size() >= 3) {
-            has_three_qubit_candidate = true;
-            break;
+    const int risk = boundary_beam_risk(
+        F_snapshot,
+        candidates,
+        children_graph
+    );
+    int beam_width = max_beam_width;
+    int beam_depth = max_beam_depth;
+    if (config_.adaptive_boundary_beam) {
+        if (risk <= 0) {
+            return fallback_idx;
         }
+        beam_width = (risk >= 2)
+            ? max_beam_width
+            : std::min(max_beam_width, 2);
+        beam_depth = (risk >= 2)
+            ? max_beam_depth
+            : std::min(max_beam_depth, 2);
     }
-    if (!has_three_qubit_candidate) {
+    if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) {
         return fallback_idx;
     }
 
@@ -1507,8 +1573,16 @@ size_t SabreRouter::boundary_beam_select_index(
             parents_graph
         );
         const double trans_cost = transition_cost(cand, idx);
+        const double handoff_cost = successor_handoff_cost(
+            cand.partition_idx,
+            cached_pi[idx],
+            F_next,
+            reverse,
+            children_graph,
+            canonical_data
+        );
         states.push_back(BeamState{
-            scores[idx],
+            scores[idx] + config_.successor_handoff_weight * handoff_cost,
             trans_cost,
             cached_pi[idx],
             std::move(F_next),
@@ -1600,7 +1674,6 @@ size_t SabreRouter::boundary_beam_select_index(
                 );
                 const double future_cost = score - trans_cost;
                 const double new_total = state.total_cost + trans_cost;
-                const double rank_cost = new_total + future_cost;
 
                 auto [F_next, resolved_next] = advance_layout_frontier(
                     cand->partition_idx,
@@ -1609,6 +1682,19 @@ size_t SabreRouter::boundary_beam_select_index(
                     children_graph,
                     parents_graph
                 );
+                const double handoff_cost = successor_handoff_cost(
+                    cand->partition_idx,
+                    output_perm,
+                    F_next,
+                    reverse,
+                    children_graph,
+                    canonical_data
+                );
+                const double rank_cost = (
+                    new_total
+                    + future_cost
+                    + config_.successor_handoff_weight * handoff_cost
+                );
                 expanded.push_back(BeamState{
                     rank_cost,
                     new_total,
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 082adbb99..571b3481c 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1,6 +1,7 @@
 """
 This is an implementation of Partition Aware Mapping.
 """
+import csv
 import logging
 import multiprocessing as mp
 import os
@@ -165,7 +166,11 @@ def __init__(self, config):
         self.config.setdefault('boundary_beam_depth', 1)
         self.config.setdefault('layout_trial_boundary_beam_width', None)
         self.config.setdefault('layout_trial_boundary_beam_depth', None)
+        self.config.setdefault('adaptive_boundary_beam', True)
+        self.config.setdefault('successor_handoff_weight', 1.0)
         self.config.setdefault('initial_layout_seed_pair_top_k', 8)
+        self.config.setdefault('initial_layout_line_ordering', True)
+        self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
@@ -1130,6 +1135,14 @@ def _run_layout_trials_cpp(
             if layout_beam_depth is None:
                 layout_beam_depth = cfg.boundary_beam_depth
             cfg.layout_trial_boundary_beam_depth = layout_beam_depth
+        if hasattr(cfg, 'adaptive_boundary_beam'):
+            cfg.adaptive_boundary_beam = bool(
+                self.config.get('adaptive_boundary_beam', True)
+            )
+        if hasattr(cfg, 'successor_handoff_weight'):
+            cfg.successor_handoff_weight = self.config.get(
+                'successor_handoff_weight', 1.0
+            )
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -1257,6 +1270,244 @@ def _partition_order_from_cpp_steps(
                 partition_order.append(part)
         return partition_order
 
+    @staticmethod
+    def _csv_list(values):
+        return " ".join(str(int(v)) for v in values)
+
+    @staticmethod
+    def _csv_edges(edges):
+        return " ".join(f"{int(u)}-{int(v)}" for u, v in edges)
+
+    @staticmethod
+    def _candidate_physical_nodes(candidate):
+        nodes = set()
+        for u, v in candidate.topology:
+            nodes.add(int(u))
+            nodes.add(int(v))
+        if not nodes:
+            nodes.update(int(v) for v in candidate.node_mapping.values())
+        return sorted(nodes)
+
+    @staticmethod
+    def _apply_candidate_exit_to_pi(pi, candidate):
+        pi_out = [int(x) for x in pi]
+        qbit_map_inverse = {v: k for k, v in candidate.qbit_map.items()}
+        for q_star, mapped_qstar in enumerate(candidate.P_o):
+            if q_star in qbit_map_inverse:
+                logical_q = qbit_map_inverse[q_star]
+                pi_out[logical_q] = candidate.node_mapping[mapped_qstar]
+        return pi_out
+
+    @staticmethod
+    def _immediate_multi_successors(partition_idx, DAG, layout_partitions):
+        successors = []
+        seen = set()
+        queue = deque(DAG[partition_idx])
+        while queue:
+            child = queue.popleft()
+            if child in seen:
+                continue
+            seen.add(child)
+            if layout_partitions[child]["is_single"]:
+                queue.extend(DAG[child])
+            else:
+                successors.append(child)
+        return successors
+
+    @staticmethod
+    def _support_overlap_summary(partition_idx, successors, layout_partitions):
+        support = set(layout_partitions[partition_idx]["involved_qbits"])
+        summary = []
+        max_overlap = 0
+        min_turnover = None
+        for child in successors:
+            child_support = set(layout_partitions[child]["involved_qbits"])
+            overlap = len(support & child_support)
+            turnover = min(len(support), len(child_support)) - overlap
+            max_overlap = max(max_overlap, overlap)
+            min_turnover = (
+                turnover
+                if min_turnover is None
+                else min(min_turnover, turnover)
+            )
+            summary.append(f"{child}:{overlap}/{turnover}")
+        return max_overlap, (0 if min_turnover is None else min_turnover), " ".join(summary)
+
+    @staticmethod
+    def _eligible_multi_frontier(resolved, IDAG, layout_partitions):
+        frontier = []
+        for idx, info in enumerate(layout_partitions):
+            if resolved[idx] or info["is_single"]:
+                continue
+            if all(resolved[parent] for parent in IDAG[idx]):
+                frontier.append(idx)
+        return frontier
+
+    def _write_cpp_routing_trace(
+        self,
+        trace_path,
+        steps,
+        pi_initial,
+        candidate_cache,
+        layout_partitions,
+        DAG,
+        IDAG,
+        N,
+    ):
+        """Write a CSV trace for the final selected C++ route."""
+        if not trace_path:
+            return
+
+        trace_dir = os.path.dirname(os.path.abspath(trace_path))
+        if trace_dir:
+            os.makedirs(trace_dir, exist_ok=True)
+
+        pi = [int(x) for x in pi_initial]
+        resolved = [False] * len(layout_partitions)
+        pending_swaps = []
+        cumulative_swaps = 0
+        cumulative_body_cnot = 0
+        rows = []
+
+        for route_step_idx, step in enumerate(steps):
+            kind = step[0]
+            if kind == "swap":
+                swaps = [(int(u), int(v)) for u, v in step[1]]
+                if swaps:
+                    pending_swaps.extend(swaps)
+                    pi = self._apply_swaps_to_pi(pi, swaps)
+                continue
+
+            if kind == "single":
+                partition_idx = int(step[1])
+                logical_qubits = tuple(layout_partitions[partition_idx]["involved_qbits"])
+                physical_qubit = int(step[2])
+                resolved[partition_idx] = True
+                rows.append({
+                    "row": len(rows),
+                    "route_step": route_step_idx,
+                    "kind": "single",
+                    "partition_idx": partition_idx,
+                    "candidate_idx": "",
+                    "topology_idx": "",
+                    "permutation_idx": "",
+                    "logical_qubits": self._csv_list(logical_qubits),
+                    "physical_nodes": str(physical_qubit),
+                    "topology_edges": "",
+                    "entry_layout": self._csv_list(
+                        pi[q] for q in logical_qubits
+                    ),
+                    "exit_layout": self._csv_list(
+                        pi[q] for q in logical_qubits
+                    ),
+                    "swap_count": 0,
+                    "routing_cnot": 0,
+                    "body_cnot": 0,
+                    "cumulative_swap_count": cumulative_swaps,
+                    "cumulative_routing_cnot": 3 * cumulative_swaps,
+                    "cumulative_body_cnot": cumulative_body_cnot,
+                    "frontier_size": len(
+                        self._eligible_multi_frontier(
+                            resolved, IDAG, layout_partitions
+                        )
+                    ),
+                    "successor_count": 0,
+                    "max_successor_overlap": 0,
+                    "min_successor_turnover": 0,
+                    "successor_overlap": "",
+                    "swaps": "",
+                })
+                continue
+
+            if kind != "partition":
+                continue
+
+            partition_idx = int(step[1])
+            candidate_idx = int(step[2])
+            candidate = candidate_cache[partition_idx][candidate_idx]
+            logical_qubits = tuple(int(q) for q in candidate.involved_qbits)
+            entry_layout = [int(pi[q]) for q in logical_qubits]
+            exit_pi = self._apply_candidate_exit_to_pi(pi, candidate)
+            exit_layout = [int(exit_pi[q]) for q in logical_qubits]
+            physical_nodes = self._candidate_physical_nodes(candidate)
+            successors = self._immediate_multi_successors(
+                partition_idx, DAG, layout_partitions
+            )
+            max_overlap, min_turnover, overlap_summary = (
+                self._support_overlap_summary(
+                    partition_idx, successors, layout_partitions
+                )
+            )
+            frontier_size = len(
+                self._eligible_multi_frontier(
+                    resolved, IDAG, layout_partitions
+                )
+            )
+            swap_count = len(pending_swaps)
+            cumulative_swaps += swap_count
+            cumulative_body_cnot += int(candidate.cnot_count)
+            rows.append({
+                "row": len(rows),
+                "route_step": route_step_idx,
+                "kind": "partition",
+                "partition_idx": partition_idx,
+                "candidate_idx": candidate_idx,
+                "topology_idx": int(candidate.topology_idx),
+                "permutation_idx": int(candidate.permutation_idx),
+                "logical_qubits": self._csv_list(logical_qubits),
+                "physical_nodes": self._csv_list(physical_nodes),
+                "topology_edges": self._csv_edges(candidate.topology),
+                "entry_layout": self._csv_list(entry_layout),
+                "exit_layout": self._csv_list(exit_layout),
+                "swap_count": swap_count,
+                "routing_cnot": 3 * swap_count,
+                "body_cnot": int(candidate.cnot_count),
+                "cumulative_swap_count": cumulative_swaps,
+                "cumulative_routing_cnot": 3 * cumulative_swaps,
+                "cumulative_body_cnot": cumulative_body_cnot,
+                "frontier_size": frontier_size,
+                "successor_count": len(successors),
+                "max_successor_overlap": max_overlap,
+                "min_successor_turnover": min_turnover,
+                "successor_overlap": overlap_summary,
+                "swaps": self._csv_edges(pending_swaps),
+            })
+            resolved[partition_idx] = True
+            pi = exit_pi
+            pending_swaps = []
+
+        fieldnames = [
+            "row",
+            "route_step",
+            "kind",
+            "partition_idx",
+            "candidate_idx",
+            "topology_idx",
+            "permutation_idx",
+            "logical_qubits",
+            "physical_nodes",
+            "topology_edges",
+            "entry_layout",
+            "exit_layout",
+            "swap_count",
+            "routing_cnot",
+            "body_cnot",
+            "cumulative_swap_count",
+            "cumulative_routing_cnot",
+            "cumulative_body_cnot",
+            "frontier_size",
+            "successor_count",
+            "max_successor_overlap",
+            "min_successor_turnover",
+            "successor_overlap",
+            "swaps",
+        ]
+        with open(trace_path, "w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(rows)
+        self._routing_trace_path = trace_path
+
 
     def _rank_layout_trials_by_actual_routing(
         self,
@@ -1359,6 +1610,8 @@ def Partition_Aware_Mapping(
         partition_body_cnot = 0
         routing_elapsed_before_cleanup = None
         cleanup_total = 0.0
+        final_route_steps = None
+        final_route_pi_initial = None
 
         if n_iterations == 0:
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
@@ -1444,6 +1697,7 @@ def Partition_Aware_Mapping(
                 best_pre_cleanup = None
                 best_routing_swap_cnot = 0
                 best_partition_body_cnot = 0
+                best_route_steps = None
 
                 for _, trial_pi, _, trace_pi_init, route_steps in top_layouts:
                     self._restore_single_qubit_circuits(
@@ -1499,6 +1753,7 @@ def Partition_Aware_Mapping(
                         best_pi = pi_out
                         best_routing_swap_cnot = trial_routing_cnot
                         best_partition_body_cnot = trial_partition_cnot
+                        best_route_steps = route_steps
 
                 final_cleanup_config = dict(cleanup_config)
                 final_cleanup_config['use_osr'] = 1
@@ -1517,6 +1772,8 @@ def Partition_Aware_Mapping(
                 pi = best_pi
                 routing_swap_cnot = best_routing_swap_cnot
                 partition_body_cnot = best_partition_body_cnot
+                final_route_steps = best_route_steps
+                final_route_pi_initial = best_pi_init
 
             else:
                 _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
@@ -1533,6 +1790,8 @@ def Partition_Aware_Mapping(
                     )
                     pi = np.asarray(best_pi, dtype=np.int64)
                     pi_initial = np.asarray(trace_pi_init, dtype=np.int64)
+                    final_route_steps = route_steps
+                    final_route_pi_initial = pi_initial.copy()
                 else:
                     F = self.get_initial_layer(IDAG, N, optimized_partitions)
                     partition_order, pi, pi_initial = self.Heuristic_Search(
@@ -1583,6 +1842,25 @@ def Partition_Aware_Mapping(
         self._routing_swap_cnot = routing_swap_cnot
         self._partition_body_cnot = partition_body_cnot
 
+        routing_trace_path = self.config.get("routing_trace_path", None)
+        if routing_trace_path:
+            if final_route_steps is not None and final_route_pi_initial is not None:
+                self._write_cpp_routing_trace(
+                    routing_trace_path,
+                    final_route_steps,
+                    final_route_pi_initial,
+                    candidate_cache,
+                    layout_partitions,
+                    DAG,
+                    IDAG,
+                    N,
+                )
+            else:
+                logging.warning(
+                    "routing_trace_path was set, but no C++ route steps were "
+                    "available for the selected route."
+                )
+
         return final_circuit, final_parameters, pi_initial, pi
 
     # ------------------------------------------------------------------------
@@ -1723,56 +2001,11 @@ def _apply_swaps_to_pi(pi, swaps):
             pi_new[q1], pi_new[q2] = P2, P1
         return pi_new
 
-    def _perturb_layout(self, base_pi, num_swaps, rng):
-        if num_swaps <= 0 or rng is None or not self._adj:
-            return np.asarray(base_pi, dtype=np.int64).copy()
-
-        swaps = []
-        N = len(base_pi)
-        for _ in range(num_swaps):
-            phys = int(rng.randint(N))
-            retries = 0
-            while not self._adj[phys] and retries < N:
-                phys = (phys + 1) % N
-                retries += 1
-            if not self._adj[phys]:
-                break
-            nb = int(self._adj[phys][rng.randint(len(self._adj[phys]))])
-            swaps.append((min(phys, nb), max(phys, nb)))
-
-        if not swaps:
-            return np.asarray(base_pi, dtype=np.int64).copy()
-
-        return np.asarray(
-            self._apply_swaps_to_pi(base_pi, swaps), dtype=np.int64
-        )
-
     def _sample_initial_layout(self, trial_idx, n_trials, seeded_pi, rng):
         seeded_pi = np.asarray(seeded_pi, dtype=np.int64)
-        if n_trials <= 1 or rng is None:
+        if n_trials <= 1 or rng is None or trial_idx == 0:
             return seeded_pi.copy()
 
-        mirrored_pi = (len(seeded_pi) - 1) - seeded_pi
-
-        if trial_idx == 0:
-            return seeded_pi.copy()
-        if trial_idx == 1:
-            return mirrored_pi.copy()
-
-        local_cutoff = max(3, int(np.ceil(n_trials * 0.6)))
-        if trial_idx < local_cutoff:
-            local_idx = trial_idx - 2
-            band_idx = local_idx // 2
-            local_budget = max(1, local_cutoff - 2)
-            phase = band_idx / max(1, local_budget // 2)
-            num_swaps = (
-                1 + (band_idx % 3)
-                if phase < 0.5
-                else 4 + (band_idx % 5)
-            )
-            base = seeded_pi if local_idx % 2 == 0 else mirrored_pi
-            return self._perturb_layout(base, num_swaps, rng)
-
         return rng.permutation(len(seeded_pi))
 
     def _bfs_shortest_path(self, src, dst):
@@ -3240,6 +3473,107 @@ def layout_score(pi):
                 total += weight * D[int(pi[u])][int(pi[v])]
             return total
 
+        def physical_line_orders():
+            if not self.config.get('initial_layout_line_ordering', True):
+                return []
+            adj = defaultdict(list)
+            nodes = set(range(N))
+            for u, v in self.topology:
+                u = int(u)
+                v = int(v)
+                adj[u].append(v)
+                adj[v].append(u)
+                nodes.add(u)
+                nodes.add(v)
+            if any(len(adj[node]) > 2 for node in nodes):
+                return []
+            endpoints = [node for node in range(N) if len(adj[node]) <= 1]
+            if len(endpoints) != 2 and N > 1:
+                return []
+
+            start = endpoints[0] if endpoints else 0
+            order = []
+            prev = None
+            node = start
+            while node is not None:
+                order.append(node)
+                next_nodes = [nb for nb in adj[node] if nb != prev]
+                if len(next_nodes) > 1:
+                    return []
+                prev, node = node, next_nodes[0] if next_nodes else None
+            if len(order) != N:
+                return []
+            return [order, list(reversed(order))]
+
+        def add_unique_order(orders, order):
+            key = tuple(int(q) for q in order)
+            if len(key) != N or set(key) != set(range(N)):
+                return
+            if key not in orders:
+                orders[key] = list(key)
+
+        def spectral_logical_orders():
+            W = np.zeros((N, N), dtype=float)
+            for (u, v), weight in interaction_weight.items():
+                W[u, v] += weight
+                W[v, u] += weight
+            degree = np.sum(W, axis=1)
+            if np.count_nonzero(degree) < 2:
+                return []
+            L = np.diag(degree) - W
+            try:
+                eigvals, eigvecs = np.linalg.eigh(L)
+            except np.linalg.LinAlgError:
+                return []
+            fiedler_idx = 1 if len(eigvals) > 1 else 0
+            fiedler = eigvecs[:, fiedler_idx]
+            order = sorted(
+                range(N),
+                key=lambda q: (float(fiedler[q]), -logical_degree[q], q),
+            )
+            return [order, list(reversed(order))]
+
+        def insertion_logical_orders():
+            if not ranked_logical_pairs:
+                return []
+            (q1, q2), _ = ranked_logical_pairs[0]
+            remaining = [
+                q for q in sorted(
+                    range(N),
+                    key=lambda q: (-logical_degree[q], q),
+                )
+                if q not in (q1, q2)
+            ]
+            orders = []
+            for seed in ([q1, q2], [q2, q1]):
+                order = list(seed)
+                for q in remaining:
+                    best_pos = 0
+                    best_cost = float('inf')
+                    for pos in range(len(order) + 1):
+                        trial = order[:pos] + [q] + order[pos:]
+                        pos_map = {logical: idx for idx, logical in enumerate(trial)}
+                        cost = 0.0
+                        for (u, v), weight in interaction_weight.items():
+                            if u in pos_map and v in pos_map:
+                                cost += weight * abs(pos_map[u] - pos_map[v])
+                        if cost < best_cost:
+                            best_cost = cost
+                            best_pos = pos
+                    order.insert(best_pos, q)
+                orders.append(order)
+            return orders
+
+        def center_out(order):
+            center = (len(order) - 1) / 2.0
+            return sorted(order, key=lambda p: (abs(order.index(p) - center), p))
+
+        def layout_from_orders(logical_order, physical_order):
+            pi = np.arange(N)
+            for logical_q, physical_q in zip(logical_order, physical_order):
+                pi[int(logical_q)] = int(physical_q)
+            return pi
+
         def build_layout(q1, q2, p1, p2):
             pi = np.arange(N)
             placed_logical = {q1, q2}
@@ -3295,14 +3629,39 @@ def logical_frontier_score(q):
 
         best_pi = None
         best_score = float('inf')
+
+        def consider_layout(pi):
+            nonlocal best_pi, best_score
+            score = layout_score(pi)
+            if score < best_score:
+                best_score = score
+                best_pi = pi
+
         for (q1, q2), _ in ranked_logical_pairs:
             for p1, p2 in physical_edges:
                 for seed in ((q1, q2, p1, p2), (q1, q2, p2, p1)):
-                    pi = build_layout(*seed)
-                    score = layout_score(pi)
-                    if score < best_score:
-                        best_score = score
-                        best_pi = pi
+                    consider_layout(build_layout(*seed))
+
+        line_orders = physical_line_orders()
+        if line_orders:
+            logical_orders = {}
+            for order in spectral_logical_orders():
+                add_unique_order(logical_orders, order)
+            for order in insertion_logical_orders():
+                add_unique_order(logical_orders, order)
+
+            degree_order = sorted(
+                range(N),
+                key=lambda q: (-logical_degree[q], q),
+            )
+            for physical_order in line_orders:
+                for logical_order in logical_orders.values():
+                    consider_layout(
+                        layout_from_orders(logical_order, physical_order)
+                    )
+                consider_layout(
+                    layout_from_orders(degree_order, center_out(physical_order))
+                )
 
         return best_pi if best_pi is not None else np.arange(N)
 
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index ecb9ab049..a37c7dfda 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -138,7 +138,9 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width)
         .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth)
         .def_readwrite("layout_trial_boundary_beam_width", &SabreConfig::layout_trial_boundary_beam_width)
-        .def_readwrite("layout_trial_boundary_beam_depth", &SabreConfig::layout_trial_boundary_beam_depth);
+        .def_readwrite("layout_trial_boundary_beam_depth", &SabreConfig::layout_trial_boundary_beam_depth)
+        .def_readwrite("adaptive_boundary_beam", &SabreConfig::adaptive_boundary_beam)
+        .def_readwrite("successor_handoff_weight", &SabreConfig::successor_handoff_weight);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From 7478d420be00d663868793ef5a0fff62dbb1f44b Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 9 May 2026 23:15:54 +0200
Subject: [PATCH 203/232] Skip routing for zero-CNOT PartAM partitions

---
 .../src-cpp/sabre_router/sabre_router.cpp     |  46 +++++++-
 squander/synthesis/PartAM.py                  | 101 ++++++++++++++++--
 2 files changed, 137 insertions(+), 10 deletions(-)

diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index f5e7c9093..767166ca6 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -484,6 +484,10 @@ int SabreRouter::estimate_swap_count(
     const std::vector<int>& pi,
     bool reverse
 ) const {
+    if (cand.cnot_count == 0) {
+        return 0;
+    }
+
     const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
 
     double total = 0.0;
@@ -896,6 +900,26 @@ SabreRouter::transform_pi(
     const NeighborInfo* neighbor_info
 ) const {
     const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
+    const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
+
+    if (cand.cnot_count == 0) {
+        std::vector<int> dynamic_node_mapping(P_route_inv.size(), -1);
+        for (size_t i = 0; i < cand.qbit_map_keys_sorted.size(); i++) {
+            const int logical_q = cand.qbit_map_keys_sorted[i];
+            const int qstar = cand.qbit_map_vals_sorted[i];
+            dynamic_node_mapping[P_route_inv[qstar]] = pi[logical_q];
+        }
+
+        std::vector<int> pi_output = pi;
+        for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
+            if (q_star < cand.qstar_to_q.size()) {
+                const int logical_q = cand.qstar_to_q[q_star];
+                if (logical_q < 0) continue;
+                pi_output[logical_q] = dynamic_node_mapping[P_exit[q_star]];
+            }
+        }
+        return {{}, std::move(pi_output)};
+    }
 
     // Route qubits to input positions
     auto [swaps, pi_routed] = find_constrained_swaps(
@@ -909,7 +933,6 @@ SabreRouter::transform_pi(
     );
 
     // Update output positions using P_exit
-    const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
     std::vector<int> pi_output = pi_routed;
 
     for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
@@ -1089,6 +1112,27 @@ std::vector<int> SabreRouter::estimate_candidate_output_layout(
     const std::vector<int>& pi,
     bool reverse
 ) const {
+    if (cand.cnot_count == 0) {
+        const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
+        const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
+        std::vector<int> dynamic_node_mapping(P_route_inv.size(), -1);
+        for (size_t i = 0; i < cand.qbit_map_keys_sorted.size(); i++) {
+            const int logical_q = cand.qbit_map_keys_sorted[i];
+            const int qstar = cand.qbit_map_vals_sorted[i];
+            dynamic_node_mapping[P_route_inv[qstar]] = pi[logical_q];
+        }
+
+        std::vector<int> pi_output = pi;
+        for (size_t q_star = 0; q_star < P_exit.size(); q_star++) {
+            if (q_star < cand.qstar_to_q.size()) {
+                const int logical_q = cand.qstar_to_q[q_star];
+                if (logical_q < 0) continue;
+                pi_output[logical_q] = dynamic_node_mapping[P_exit[q_star]];
+            }
+        }
+        return pi_output;
+    }
+
     const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
     std::vector<int> pi_output = pi;
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 571b3481c..459a1d70a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -71,6 +71,34 @@ def _available_cpus():
 _routing_worker_state = None
 
 
+class _DynamicMappedPartitionCandidate:
+    """Partition candidate remapped to a route-time physical layout.
+
+    This is used for synthesized partition bodies with zero CNOTs.  They do
+    not require adjacent physical nodes, but their single-qubit gates still
+    need to be emitted on the physical wires occupied by the partition's
+    logical qubits at that point in the route.
+    """
+
+    def __init__(self, candidate, node_mapping):
+        self.candidate = candidate
+        self.partition_idx = candidate.partition_idx
+        self.topology_idx = candidate.topology_idx
+        self.permutation_idx = candidate.permutation_idx
+        self.cnot_count = candidate.cnot_count
+        self.node_mapping = dict(node_mapping)
+
+    def get_final_circuit(self, optimized_partitions, N):
+        partition = optimized_partitions[self.partition_idx]
+        params = partition.synthesised_parameters[
+            self.topology_idx
+        ][self.permutation_idx]
+        circuit = partition.synthesised_circuits[
+            self.topology_idx
+        ][self.permutation_idx].get_Flat_Circuit()
+        return circuit.Remap_Qbits(self.node_mapping, N), params
+
+
 def _init_layout_trial_worker(state):
     global _routing_worker_state
     from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping
@@ -1244,21 +1272,38 @@ def _partition_order_cnot_breakdown(partition_order):
         return routing_cnot, partition_cnot
 
     def _partition_order_from_cpp_steps(
-        self, steps, optimized_partitions, candidate_cache, N
+        self, steps, optimized_partitions, candidate_cache, N, pi_initial=None
     ):
         partition_order = []
+        pi = [int(x) for x in pi_initial] if pi_initial is not None else None
         for step in steps:
             kind = step[0]
             if kind == "swap":
                 swaps = [(int(u), int(v)) for u, v in step[1]]
                 if swaps:
                     partition_order.append(construct_swap_circuit(swaps, N))
+                    if pi is not None:
+                        pi = self._apply_swaps_to_pi(pi, swaps)
             elif kind == "partition":
                 partition_idx = int(step[1])
                 candidate_idx = int(step[2])
-                partition_order.append(
-                    candidate_cache[partition_idx][candidate_idx]
-                )
+                candidate = candidate_cache[partition_idx][candidate_idx]
+                if pi is not None and int(candidate.cnot_count) == 0:
+                    node_mapping = self._zero_cnot_dynamic_node_mapping(
+                        pi, candidate
+                    )
+                    partition_order.append(
+                        _DynamicMappedPartitionCandidate(
+                            candidate, node_mapping
+                        )
+                    )
+                    pi = self._apply_zero_cnot_candidate_exit_to_pi(
+                        pi, candidate, node_mapping
+                    )
+                else:
+                    partition_order.append(candidate)
+                    if pi is not None:
+                        pi = self._apply_candidate_exit_to_pi(pi, candidate)
             elif kind == "single":
                 partition_idx = int(step[1])
                 physical_qubit = int(step[2])
@@ -1298,6 +1343,24 @@ def _apply_candidate_exit_to_pi(pi, candidate):
                 pi_out[logical_q] = candidate.node_mapping[mapped_qstar]
         return pi_out
 
+    @staticmethod
+    def _zero_cnot_dynamic_node_mapping(pi, candidate):
+        P_i_inv = [candidate.P_i.index(i) for i in range(len(candidate.P_i))]
+        node_mapping = {}
+        for logical_q, q_star in candidate.qbit_map.items():
+            node_mapping[P_i_inv[q_star]] = int(pi[int(logical_q)])
+        return node_mapping
+
+    @staticmethod
+    def _apply_zero_cnot_candidate_exit_to_pi(pi, candidate, node_mapping):
+        pi_out = [int(x) for x in pi]
+        qbit_map_inverse = {v: k for k, v in candidate.qbit_map.items()}
+        for q_star, mapped_qstar in enumerate(candidate.P_o):
+            if q_star in qbit_map_inverse:
+                logical_q = qbit_map_inverse[q_star]
+                pi_out[logical_q] = node_mapping[mapped_qstar]
+        return pi_out
+
     @staticmethod
     def _immediate_multi_successors(partition_idx, DAG, layout_partitions):
         successors = []
@@ -1427,9 +1490,21 @@ def _write_cpp_routing_trace(
             candidate = candidate_cache[partition_idx][candidate_idx]
             logical_qubits = tuple(int(q) for q in candidate.involved_qbits)
             entry_layout = [int(pi[q]) for q in logical_qubits]
-            exit_pi = self._apply_candidate_exit_to_pi(pi, candidate)
+            if int(candidate.cnot_count) == 0:
+                dynamic_node_mapping = self._zero_cnot_dynamic_node_mapping(
+                    pi, candidate
+                )
+                exit_pi = self._apply_zero_cnot_candidate_exit_to_pi(
+                    pi, candidate, dynamic_node_mapping
+                )
+                physical_nodes = sorted(dynamic_node_mapping.values())
+                topology_edges = ""
+            else:
+                dynamic_node_mapping = None
+                exit_pi = self._apply_candidate_exit_to_pi(pi, candidate)
+                physical_nodes = self._candidate_physical_nodes(candidate)
+                topology_edges = self._csv_edges(candidate.topology)
             exit_layout = [int(exit_pi[q]) for q in logical_qubits]
-            physical_nodes = self._candidate_physical_nodes(candidate)
             successors = self._immediate_multi_successors(
                 partition_idx, DAG, layout_partitions
             )
@@ -1456,7 +1531,7 @@ def _write_cpp_routing_trace(
                 "permutation_idx": int(candidate.permutation_idx),
                 "logical_qubits": self._csv_list(logical_qubits),
                 "physical_nodes": self._csv_list(physical_nodes),
-                "topology_edges": self._csv_edges(candidate.topology),
+                "topology_edges": topology_edges,
                 "entry_layout": self._csv_list(entry_layout),
                 "exit_layout": self._csv_list(exit_layout),
                 "swap_count": swap_count,
@@ -1705,7 +1780,11 @@ def Partition_Aware_Mapping(
                     )
                     if route_steps is not None:
                         partition_order = self._partition_order_from_cpp_steps(
-                            route_steps, optimized_partitions, candidate_cache, N
+                            route_steps,
+                            optimized_partitions,
+                            candidate_cache,
+                            N,
+                            pi_initial=trace_pi_init,
                         )
                         pi_out = np.asarray(trial_pi, dtype=np.int64)
                         pi_init = np.asarray(trace_pi_init, dtype=np.int64)
@@ -1786,7 +1865,11 @@ def Partition_Aware_Mapping(
                         optimized_partitions, saved_sq_circuits
                     )
                     partition_order = self._partition_order_from_cpp_steps(
-                        route_steps, optimized_partitions, candidate_cache, N
+                        route_steps,
+                        optimized_partitions,
+                        candidate_cache,
+                        N,
+                        pi_initial=trace_pi_init,
                     )
                     pi = np.asarray(best_pi, dtype=np.int64)
                     pi_initial = np.asarray(trace_pi_init, dtype=np.int64)

From 83a8a5d6655b776ab3c444681536502898043f92 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 10 May 2026 12:34:35 +0200
Subject: [PATCH 204/232] Use body structure for transparent PartAM routing

---
 .../sabre_router/include/sabre_router.hpp       |  1 +
 squander/src-cpp/sabre_router/sabre_router.cpp  |  6 +++---
 squander/synthesis/PartAM.py                    | 17 +++++++++++++++--
 squander/synthesis/bindings.cpp                 |  1 +
 4 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index b6c50d3d0..fb4923bed 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -39,6 +39,7 @@ struct CandidateData {
     int permutation_idx;
     int candidate_idx = -1;
     int cnot_count;
+    bool has_multi_qubit_body = true;
 
     // Permutations within the reduced (q*) space
     // P_i[v] = position in Q* space for input routing
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 767166ca6..4aba51929 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -484,7 +484,7 @@ int SabreRouter::estimate_swap_count(
     const std::vector<int>& pi,
     bool reverse
 ) const {
-    if (cand.cnot_count == 0) {
+    if (!cand.has_multi_qubit_body) {
         return 0;
     }
 
@@ -902,7 +902,7 @@ SabreRouter::transform_pi(
     const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
     const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
 
-    if (cand.cnot_count == 0) {
+    if (!cand.has_multi_qubit_body) {
         std::vector<int> dynamic_node_mapping(P_route_inv.size(), -1);
         for (size_t i = 0; i < cand.qbit_map_keys_sorted.size(); i++) {
             const int logical_q = cand.qbit_map_keys_sorted[i];
@@ -1112,7 +1112,7 @@ std::vector<int> SabreRouter::estimate_candidate_output_layout(
     const std::vector<int>& pi,
     bool reverse
 ) const {
-    if (cand.cnot_count == 0) {
+    if (!cand.has_multi_qubit_body) {
         const std::vector<int>& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv;
         const std::vector<int>& P_exit = reverse ? cand.P_i : cand.P_o;
         std::vector<int> dynamic_node_mapping(P_route_inv.size(), -1);
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 459a1d70a..1405f7a76 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1288,7 +1288,10 @@ def _partition_order_from_cpp_steps(
                 partition_idx = int(step[1])
                 candidate_idx = int(step[2])
                 candidate = candidate_cache[partition_idx][candidate_idx]
-                if pi is not None and int(candidate.cnot_count) == 0:
+                if (
+                    pi is not None
+                    and self._candidate_is_layout_transparent(candidate)
+                ):
                     node_mapping = self._zero_cnot_dynamic_node_mapping(
                         pi, candidate
                     )
@@ -1333,6 +1336,16 @@ def _candidate_physical_nodes(candidate):
             nodes.update(int(v) for v in candidate.node_mapping.values())
         return sorted(nodes)
 
+    @staticmethod
+    def _candidate_has_multi_qubit_body(candidate):
+        return bool(getattr(candidate, "circuit_structure", ()))
+
+    @staticmethod
+    def _candidate_is_layout_transparent(candidate):
+        return not qgd_Partition_Aware_Mapping._candidate_has_multi_qubit_body(
+            candidate
+        )
+
     @staticmethod
     def _apply_candidate_exit_to_pi(pi, candidate):
         pi_out = [int(x) for x in pi]
@@ -1490,7 +1503,7 @@ def _write_cpp_routing_trace(
             candidate = candidate_cache[partition_idx][candidate_idx]
             logical_qubits = tuple(int(q) for q in candidate.involved_qbits)
             entry_layout = [int(pi[q]) for q in logical_qubits]
-            if int(candidate.cnot_count) == 0:
+            if self._candidate_is_layout_transparent(candidate):
                 dynamic_node_mapping = self._zero_cnot_dynamic_node_mapping(
                     pi, candidate
                 )
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index a37c7dfda..2df284162 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -25,6 +25,7 @@ static CandidateData extract_candidate(py::handle pc) {
     cd.topology_idx = pc.attr("topology_idx").cast<int>();
     cd.permutation_idx = pc.attr("permutation_idx").cast<int>();
     cd.cnot_count = pc.attr("cnot_count").cast<int>();
+    cd.has_multi_qubit_body = py::len(pc.attr("circuit_structure")) > 0;
 
     // P_i, P_o: tuples of ints
     cd.P_i = pc.attr("P_i").cast<std::vector<int>>();

From 701c6f8aa22b6e287835d7efd20b56ad755b1dd1 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 10 May 2026 13:14:48 +0200
Subject: [PATCH 205/232] Disable adaptive PartAM beam by default

---
 squander/src-cpp/sabre_router/include/sabre_router.hpp | 2 +-
 squander/synthesis/PartAM.py                           | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index fb4923bed..0b2e19b2f 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -97,7 +97,7 @@ struct SabreConfig {
     int boundary_beam_depth = 1;
     int layout_trial_boundary_beam_width = 1;
     int layout_trial_boundary_beam_depth = 1;
-    bool adaptive_boundary_beam = true;
+    bool adaptive_boundary_beam = false;
     double successor_handoff_weight = 1.0;
 };
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 1405f7a76..cc9a1c659 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -194,7 +194,7 @@ def __init__(self, config):
         self.config.setdefault('boundary_beam_depth', 1)
         self.config.setdefault('layout_trial_boundary_beam_width', None)
         self.config.setdefault('layout_trial_boundary_beam_depth', None)
-        self.config.setdefault('adaptive_boundary_beam', True)
+        self.config.setdefault('adaptive_boundary_beam', False)
         self.config.setdefault('successor_handoff_weight', 1.0)
         self.config.setdefault('initial_layout_seed_pair_top_k', 8)
         self.config.setdefault('initial_layout_line_ordering', True)
@@ -1165,7 +1165,7 @@ def _run_layout_trials_cpp(
             cfg.layout_trial_boundary_beam_depth = layout_beam_depth
         if hasattr(cfg, 'adaptive_boundary_beam'):
             cfg.adaptive_boundary_beam = bool(
-                self.config.get('adaptive_boundary_beam', True)
+                self.config.get('adaptive_boundary_beam', False)
             )
         if hasattr(cfg, 'successor_handoff_weight'):
             cfg.successor_handoff_weight = self.config.get(

From a11a1348b7d3c2e3112a8110c0d87ada96a2555a Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 10 May 2026 15:29:12 +0200
Subject: [PATCH 206/232] Revert PartAM seed layout heuristics

---
 squander/synthesis/PartAM.py | 275 +++++++----------------------------
 1 file changed, 49 insertions(+), 226 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index cc9a1c659..73b39ff08 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -196,8 +196,6 @@ def __init__(self, config):
         self.config.setdefault('layout_trial_boundary_beam_depth', None)
         self.config.setdefault('adaptive_boundary_beam', False)
         self.config.setdefault('successor_handoff_weight', 1.0)
-        self.config.setdefault('initial_layout_seed_pair_top_k', 8)
-        self.config.setdefault('initial_layout_line_ordering', True)
         self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
         strategy = self.config['strategy']
@@ -3417,17 +3415,14 @@ def _compute_seeded_layout(self, optimized_partitions, D, N, circ):
         if not self.topology:
             return np.arange(N)
 
-        # --- build gate-level interaction graph from all multi-qubit gates ---
+        # --- build gate-level interaction graph from circuit CNOT pairs ---
         gate_edges = set()
         for g in circ.get_Gates():
-            qbits = list(g.get_Involved_Qbits())
-            if len(qbits) < 2:
-                continue
-            for i in range(len(qbits)):
-                for j in range(i + 1, len(qbits)):
-                    gate_edges.add(
-                        (min(qbits[i], qbits[j]), max(qbits[i], qbits[j]))
-                    )
+            gname = str(type(g).__name__)
+            if 'CNOT' in gname or 'CX' in gname:
+                ctrl = g.get_Control_Qbit()
+                tgt = g.get_Target_Qbit()
+                gate_edges.add((min(ctrl, tgt), max(ctrl, tgt)))
 
         if not gate_edges:
             return np.arange(N)
@@ -3535,231 +3530,59 @@ def _greedy_seeded_layout(self, optimized_partitions, D, N):
         if not interaction_weight:
             return np.arange(N)
 
-        logical_degree = defaultdict(float)
-        for (u, v), weight in interaction_weight.items():
-            logical_degree[u] += weight
-            logical_degree[v] += weight
-
-        physical_centrality = np.sum(D, axis=1)
-        ranked_logical_pairs = sorted(
-            interaction_weight.items(),
-            key=lambda item: (
-                -item[1],
-                -(logical_degree[item[0][0]] + logical_degree[item[0][1]]),
-                item[0],
-            ),
-        )
-        seed_pair_top_k = int(
-            self.config.get('initial_layout_seed_pair_top_k', 8)
-        )
-        if seed_pair_top_k > 0:
-            ranked_logical_pairs = ranked_logical_pairs[:seed_pair_top_k]
-
-        physical_edges = sorted(
-            [(int(u), int(v)) for u, v in self.topology],
-            key=lambda edge: (
-                physical_centrality[edge[0]] + physical_centrality[edge[1]],
-                edge,
-            ),
-        )
+        pi = np.arange(N)
+        placed_logical = set()
+        placed_physical = set()
 
-        def layout_score(pi):
-            total = 0.0
-            for (u, v), weight in interaction_weight.items():
-                total += weight * D[int(pi[u])][int(pi[v])]
-            return total
-
-        def physical_line_orders():
-            if not self.config.get('initial_layout_line_ordering', True):
-                return []
-            adj = defaultdict(list)
-            nodes = set(range(N))
-            for u, v in self.topology:
-                u = int(u)
-                v = int(v)
-                adj[u].append(v)
-                adj[v].append(u)
-                nodes.add(u)
-                nodes.add(v)
-            if any(len(adj[node]) > 2 for node in nodes):
-                return []
-            endpoints = [node for node in range(N) if len(adj[node]) <= 1]
-            if len(endpoints) != 2 and N > 1:
-                return []
-
-            start = endpoints[0] if endpoints else 0
-            order = []
-            prev = None
-            node = start
-            while node is not None:
-                order.append(node)
-                next_nodes = [nb for nb in adj[node] if nb != prev]
-                if len(next_nodes) > 1:
-                    return []
-                prev, node = node, next_nodes[0] if next_nodes else None
-            if len(order) != N:
-                return []
-            return [order, list(reversed(order))]
-
-        def add_unique_order(orders, order):
-            key = tuple(int(q) for q in order)
-            if len(key) != N or set(key) != set(range(N)):
-                return
-            if key not in orders:
-                orders[key] = list(key)
-
-        def spectral_logical_orders():
-            W = np.zeros((N, N), dtype=float)
-            for (u, v), weight in interaction_weight.items():
-                W[u, v] += weight
-                W[v, u] += weight
-            degree = np.sum(W, axis=1)
-            if np.count_nonzero(degree) < 2:
-                return []
-            L = np.diag(degree) - W
-            try:
-                eigvals, eigvecs = np.linalg.eigh(L)
-            except np.linalg.LinAlgError:
-                return []
-            fiedler_idx = 1 if len(eigvals) > 1 else 0
-            fiedler = eigvecs[:, fiedler_idx]
-            order = sorted(
-                range(N),
-                key=lambda q: (float(fiedler[q]), -logical_degree[q], q),
-            )
-            return [order, list(reversed(order))]
-
-        def insertion_logical_orders():
-            if not ranked_logical_pairs:
-                return []
-            (q1, q2), _ = ranked_logical_pairs[0]
-            remaining = [
-                q for q in sorted(
-                    range(N),
-                    key=lambda q: (-logical_degree[q], q),
-                )
-                if q not in (q1, q2)
-            ]
-            orders = []
-            for seed in ([q1, q2], [q2, q1]):
-                order = list(seed)
-                for q in remaining:
-                    best_pos = 0
-                    best_cost = float('inf')
-                    for pos in range(len(order) + 1):
-                        trial = order[:pos] + [q] + order[pos:]
-                        pos_map = {logical: idx for idx, logical in enumerate(trial)}
-                        cost = 0.0
-                        for (u, v), weight in interaction_weight.items():
-                            if u in pos_map and v in pos_map:
-                                cost += weight * abs(pos_map[u] - pos_map[v])
-                        if cost < best_cost:
-                            best_cost = cost
-                            best_pos = pos
-                    order.insert(best_pos, q)
-                orders.append(order)
-            return orders
-
-        def center_out(order):
-            center = (len(order) - 1) / 2.0
-            return sorted(order, key=lambda p: (abs(order.index(p) - center), p))
-
-        def layout_from_orders(logical_order, physical_order):
-            pi = np.arange(N)
-            for logical_q, physical_q in zip(logical_order, physical_order):
-                pi[int(logical_q)] = int(physical_q)
-            return pi
+        (q1, q2), _ = max(interaction_weight.items(), key=lambda x: x[1])
+        p1, p2 = self.topology[0]
 
-        def build_layout(q1, q2, p1, p2):
-            pi = np.arange(N)
-            placed_logical = {q1, q2}
-            placed_physical = {p1, p2}
+        holder1 = np.where(pi == p1)[0][0]
+        pi[q1], pi[holder1] = p1, pi[q1]
+        holder2 = np.where(pi == p2)[0][0]
+        pi[q2], pi[holder2] = p2, pi[q2]
+        placed_logical.update([q1, q2])
+        placed_physical.update([p1, p2])
 
-            holder1 = int(np.where(pi == p1)[0][0])
-            pi[q1], pi[holder1] = p1, pi[q1]
-            holder2 = int(np.where(pi == p2)[0][0])
-            pi[q2], pi[holder2] = p2, pi[q2]
+        remaining = [q for q in range(N) if q not in placed_logical]
 
-            remaining = [q for q in range(N) if q not in placed_logical]
-
-            def logical_frontier_score(q):
-                return sum(
-                    interaction_weight.get((min(q, pq), max(q, pq)), 0.0)
-                    for pq in placed_logical
-                )
-
-            remaining.sort(
-                key=lambda q: (-logical_frontier_score(q), -logical_degree[q], q)
+        def _score(q):
+            return sum(
+                interaction_weight.get((min(q, pq), max(q, pq)), 0.0)
+                for pq in placed_logical
             )
 
-            for logical_q in remaining:
-                best_physical = None
-                best_key = None
+        remaining.sort(key=_score, reverse=True)
 
-                for physical_q in range(N):
-                    if physical_q in placed_physical:
-                        continue
-
-                    total_dist = 0.0
-                    total_w = 0.0
-                    for other_q in placed_logical:
-                        key = (min(logical_q, other_q), max(logical_q, other_q))
-                        w = interaction_weight.get(key, 0.0)
-                        if w > 0:
-                            total_dist += D[physical_q][pi[other_q]] * w
-                            total_w += w
-
-                    avg = total_dist / total_w if total_w > 0 else 0.0
-                    candidate_key = (avg, physical_centrality[physical_q], physical_q)
-                    if best_key is None or candidate_key < best_key:
-                        best_key = candidate_key
-                        best_physical = physical_q
-
-                if best_physical is not None:
-                    holder = int(np.where(pi == best_physical)[0][0])
-                    pi[logical_q], pi[holder] = best_physical, pi[logical_q]
-                    placed_logical.add(logical_q)
-                    placed_physical.add(best_physical)
+        for logical_q in remaining:
+            best_physical = None
+            best_dist = float('inf')
 
-            return pi
+            for physical_q in range(N):
+                if physical_q in placed_physical:
+                    continue
 
-        best_pi = None
-        best_score = float('inf')
-
-        def consider_layout(pi):
-            nonlocal best_pi, best_score
-            score = layout_score(pi)
-            if score < best_score:
-                best_score = score
-                best_pi = pi
-
-        for (q1, q2), _ in ranked_logical_pairs:
-            for p1, p2 in physical_edges:
-                for seed in ((q1, q2, p1, p2), (q1, q2, p2, p1)):
-                    consider_layout(build_layout(*seed))
-
-        line_orders = physical_line_orders()
-        if line_orders:
-            logical_orders = {}
-            for order in spectral_logical_orders():
-                add_unique_order(logical_orders, order)
-            for order in insertion_logical_orders():
-                add_unique_order(logical_orders, order)
-
-            degree_order = sorted(
-                range(N),
-                key=lambda q: (-logical_degree[q], q),
-            )
-            for physical_order in line_orders:
-                for logical_order in logical_orders.values():
-                    consider_layout(
-                        layout_from_orders(logical_order, physical_order)
-                    )
-                consider_layout(
-                    layout_from_orders(degree_order, center_out(physical_order))
-                )
+                total_dist = 0.0
+                total_w = 0.0
+                for other_q in placed_logical:
+                    key = (min(logical_q, other_q), max(logical_q, other_q))
+                    w = interaction_weight.get(key, 0.0)
+                    if w > 0:
+                        total_dist += D[physical_q][pi[other_q]] * w
+                        total_w += w
+
+                avg = total_dist / total_w if total_w > 0 else 0.0
+                if avg < best_dist:
+                    best_dist = avg
+                    best_physical = physical_q
+
+            if best_physical is not None:
+                holder = np.where(pi == best_physical)[0][0]
+                pi[logical_q], pi[holder] = best_physical, pi[logical_q]
+                placed_logical.add(logical_q)
+                placed_physical.add(best_physical)
 
-        return best_pi if best_pi is not None else np.arange(N)
+        return pi
 
 
     def generate_DAG_levels(self, circuit):

From 54354c1220f39065bccc08401ad62ee6423ae268 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 10 May 2026 18:09:16 +0200
Subject: [PATCH 207/232] Restore PartAM routing baseline

---
 .../sabre_router/include/sabre_router.hpp     |  28 +-
 .../src-cpp/sabre_router/sabre_router.cpp     | 196 +----------
 squander/synthesis/PartAM.py                  | 303 +-----------------
 squander/synthesis/bindings.cpp               |   6 +-
 4 files changed, 19 insertions(+), 514 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index 0b2e19b2f..a77eee1c6 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -95,10 +95,6 @@ struct SabreConfig {
     double three_qubit_exit_weight = 1.0;
     int boundary_beam_width = 1;
     int boundary_beam_depth = 1;
-    int layout_trial_boundary_beam_width = 1;
-    int layout_trial_boundary_beam_depth = 1;
-    bool adaptive_boundary_beam = false;
-    double successor_handoff_weight = 1.0;
 };
 
 struct RouteStep {
@@ -388,29 +384,7 @@ class SabreRouter {
         const std::vector<std::vector<int>>& parents_graph,
         bool reverse,
         const std::unordered_map<int, CanonicalEntry>& canonical_data,
-        SwapCache* swap_cache,
-        bool final_route
-    ) const;
-
-    int boundary_beam_risk(
-        const std::vector<int>& F_snapshot,
-        const std::vector<const CandidateData*>& candidates,
-        const std::vector<std::vector<int>>& children_graph
-    ) const;
-
-    void collect_immediate_multi_successors(
-        int partition_idx,
-        const std::vector<std::vector<int>>& children_graph,
-        std::vector<int>& successors
-    ) const;
-
-    double successor_handoff_cost(
-        int selected_partition_idx,
-        const std::vector<int>& pi,
-        const std::vector<int>& F_after,
-        bool reverse,
-        const std::vector<std::vector<int>>& children_graph,
-        const std::unordered_map<int, CanonicalEntry>& canonical_data
+        SwapCache* swap_cache
     ) const;
 
     // Check if partition is single-qubit
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 4aba51929..43f352e8f 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -1403,131 +1403,6 @@ SabreRouter::advance_layout_frontier(
     return {std::move(F_next), std::move(resolved_next)};
 }
 
-void SabreRouter::collect_immediate_multi_successors(
-    int partition_idx,
-    const std::vector<std::vector<int>>& children_graph,
-    std::vector<int>& successors
-) const {
-    successors.clear();
-    std::vector<uint8_t> seen(num_partitions_, 0);
-    std::deque<int> queue;
-    for (int child : children_graph[partition_idx]) {
-        queue.push_back(child);
-    }
-
-    while (!queue.empty()) {
-        const int child = queue.front();
-        queue.pop_front();
-        if (child < 0 || child >= num_partitions_ || seen[child]) {
-            continue;
-        }
-        seen[child] = 1;
-
-        if (layout_partitions_[child].is_single) {
-            for (int grandchild : children_graph[child]) {
-                queue.push_back(grandchild);
-            }
-        } else {
-            successors.push_back(child);
-        }
-    }
-}
-
-int SabreRouter::boundary_beam_risk(
-    const std::vector<int>& F_snapshot,
-    const std::vector<const CandidateData*>& candidates,
-    const std::vector<std::vector<int>>& children_graph
-) const {
-    int risk = 0;
-    for (const auto* cand : candidates) {
-        if (cand->involved_qbits.size() >= 3) {
-            risk = std::max(risk, 1);
-            break;
-        }
-    }
-
-    auto support_turnover = [&](int a, int b) {
-        const auto& as = layout_partitions_[a].involved_qbits;
-        const auto& bs = layout_partitions_[b].involved_qbits;
-        if (as.size() < 2 || bs.size() < 2) {
-            return 0;
-        }
-        int overlap = 0;
-        for (int qa : as) {
-            for (int qb : bs) {
-                if (qa == qb) {
-                    overlap++;
-                    break;
-                }
-            }
-        }
-        const int min_size = static_cast<int>(std::min(as.size(), bs.size()));
-        return min_size - overlap;
-    };
-
-    std::vector<int> successors;
-    for (int p : F_snapshot) {
-        collect_immediate_multi_successors(p, children_graph, successors);
-        for (int child : successors) {
-            const int turnover = support_turnover(p, child);
-            const int min_size = static_cast<int>(std::min(
-                layout_partitions_[p].involved_qbits.size(),
-                layout_partitions_[child].involved_qbits.size()
-            ));
-            if (turnover >= 2 || (min_size >= 3 && turnover >= min_size - 1)) {
-                risk = std::max(risk, 2);
-            }
-        }
-    }
-
-    if (risk > 0 && F_snapshot.size() > 2) {
-        risk = std::max(risk, 2);
-    }
-    return risk;
-}
-
-double SabreRouter::successor_handoff_cost(
-    int selected_partition_idx,
-    const std::vector<int>& pi,
-    const std::vector<int>& F_after,
-    bool reverse,
-    const std::vector<std::vector<int>>& children_graph,
-    const std::unordered_map<int, CanonicalEntry>& canonical_data
-) const {
-    if (config_.successor_handoff_weight <= 0.0 || F_after.empty()) {
-        return 0.0;
-    }
-
-    std::vector<int> successors;
-    collect_immediate_multi_successors(
-        selected_partition_idx,
-        children_graph,
-        successors
-    );
-    if (successors.empty()) {
-        return 0.0;
-    }
-
-    double total = 0.0;
-    int count = 0;
-    for (int child : successors) {
-        if (std::find(F_after.begin(), F_after.end(), child) == F_after.end()) {
-            continue;
-        }
-        const double cost = future_partition_cost(
-            child,
-            pi,
-            reverse,
-            canonical_data
-        );
-        if (std::isfinite(cost)) {
-            total += cost;
-            count++;
-        }
-    }
-    return count > 0 ? total / static_cast<double>(count) : 0.0;
-}
-
 size_t SabreRouter::boundary_beam_select_index(
     const std::vector<const CandidateData*>& candidates,
     const std::vector<double>& scores,
@@ -1539,8 +1414,7 @@ size_t SabreRouter::boundary_beam_select_index(
     const std::vector<std::vector<int>>& parents_graph,
     bool reverse,
     const std::unordered_map<int, CanonicalEntry>& canonical_data,
-    SwapCache* swap_cache,
-    bool final_route
+    SwapCache* swap_cache
 ) const {
     size_t fallback_idx = 0;
     for (size_t i = 1; i < scores.size(); i++) {
@@ -1549,37 +1423,20 @@ size_t SabreRouter::boundary_beam_select_index(
         }
     }
 
-    const int max_beam_width = std::max(
-        1,
-        final_route
-            ? config_.boundary_beam_width
-            : config_.layout_trial_boundary_beam_width
-    );
-    const int max_beam_depth = std::max(
-        1,
-        final_route
-            ? config_.boundary_beam_depth
-            : config_.layout_trial_boundary_beam_depth
-    );
-    const int risk = boundary_beam_risk(
-        F_snapshot,
-        candidates,
-        children_graph
-    );
-    int beam_width = max_beam_width;
-    int beam_depth = max_beam_depth;
-    if (config_.adaptive_boundary_beam) {
-        if (risk <= 0) {
-            return fallback_idx;
+    const int beam_width = std::max(1, config_.boundary_beam_width);
+    const int beam_depth = std::max(1, config_.boundary_beam_depth);
+    if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) {
+        return fallback_idx;
+    }
+
+    bool has_three_qubit_candidate = false;
+    for (const auto* cand : candidates) {
+        if (cand->involved_qbits.size() >= 3) {
+            has_three_qubit_candidate = true;
+            break;
         }
-        beam_width = (risk >= 2)
-            ? max_beam_width
-            : std::min(max_beam_width, 2);
-        beam_depth = (risk >= 2)
-            ? max_beam_depth
-            : std::min(max_beam_depth, 2);
     }
-    if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) {
+    if (!has_three_qubit_candidate) {
         return fallback_idx;
     }
 
@@ -1617,16 +1474,8 @@ size_t SabreRouter::boundary_beam_select_index(
             parents_graph
         );
         const double trans_cost = transition_cost(cand, idx);
-        const double handoff_cost = successor_handoff_cost(
-            cand.partition_idx,
-            cached_pi[idx],
-            F_next,
-            reverse,
-            children_graph,
-            canonical_data
-        );
         states.push_back(BeamState{
-            scores[idx] + config_.successor_handoff_weight * handoff_cost,
+            scores[idx],
             trans_cost,
             cached_pi[idx],
             std::move(F_next),
@@ -1718,6 +1567,7 @@ size_t SabreRouter::boundary_beam_select_index(
                 );
                 const double future_cost = score - trans_cost;
                 const double new_total = state.total_cost + trans_cost;
+                const double rank_cost = new_total + future_cost;
 
                 auto [F_next, resolved_next] = advance_layout_frontier(
                     cand->partition_idx,
@@ -1726,19 +1576,6 @@ size_t SabreRouter::boundary_beam_select_index(
                     children_graph,
                     parents_graph
                 );
-                const double handoff_cost = successor_handoff_cost(
-                    cand->partition_idx,
-                    output_perm,
-                    F_next,
-                    reverse,
-                    children_graph,
-                    canonical_data
-                );
-                const double rank_cost = (
-                    new_total
-                    + future_cost
-                    + config_.successor_handoff_weight * handoff_cost
-                );
                 expanded.push_back(BeamState{
                     rank_cost,
                     new_total,
@@ -1925,8 +1762,7 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
             pg,
             reverse,
             canonical_data,
-            &swap_cache,
-            route_trace != nullptr
+            &swap_cache
         );
         const auto& best = *candidates[best_ci];
 
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 73b39ff08..4bb46200f 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1,7 +1,6 @@
 """
 This is an implementation of Partition Aware Mapping.
 """
-import csv
 import logging
 import multiprocessing as mp
 import os
@@ -72,13 +71,7 @@ def _available_cpus():
 
 
 class _DynamicMappedPartitionCandidate:
-    """Partition candidate remapped to a route-time physical layout.
-
-    This is used for synthesized partition bodies with zero CNOTs.  They do
-    not require adjacent physical nodes, but their single-qubit gates still
-    need to be emitted on the physical wires occupied by the partition's
-    logical qubits at that point in the route.
-    """
+    """Partition candidate remapped to the route-time physical layout."""
 
     def __init__(self, candidate, node_mapping):
         self.candidate = candidate
@@ -192,11 +185,6 @@ def __init__(self, config):
         self.config.setdefault('three_qubit_exit_weight', 1.0)
         self.config.setdefault('boundary_beam_width', 1)
         self.config.setdefault('boundary_beam_depth', 1)
-        self.config.setdefault('layout_trial_boundary_beam_width', None)
-        self.config.setdefault('layout_trial_boundary_beam_depth', None)
-        self.config.setdefault('adaptive_boundary_beam', False)
-        self.config.setdefault('successor_handoff_weight', 1.0)
-        self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
@@ -1147,28 +1135,6 @@ def _run_layout_trials_cpp(
             cfg.boundary_beam_depth = self.config.get(
                 'boundary_beam_depth', 1
             )
-        if hasattr(cfg, 'layout_trial_boundary_beam_width'):
-            layout_beam_width = self.config.get(
-                'layout_trial_boundary_beam_width', None
-            )
-            if layout_beam_width is None:
-                layout_beam_width = cfg.boundary_beam_width
-            cfg.layout_trial_boundary_beam_width = layout_beam_width
-        if hasattr(cfg, 'layout_trial_boundary_beam_depth'):
-            layout_beam_depth = self.config.get(
-                'layout_trial_boundary_beam_depth', None
-            )
-            if layout_beam_depth is None:
-                layout_beam_depth = cfg.boundary_beam_depth
-            cfg.layout_trial_boundary_beam_depth = layout_beam_depth
-        if hasattr(cfg, 'adaptive_boundary_beam'):
-            cfg.adaptive_boundary_beam = bool(
-                self.config.get('adaptive_boundary_beam', False)
-            )
-        if hasattr(cfg, 'successor_handoff_weight'):
-            cfg.successor_handoff_weight = self.config.get(
-                'successor_handoff_weight', 1.0
-            )
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -1316,24 +1282,6 @@ def _partition_order_from_cpp_steps(
                 partition_order.append(part)
         return partition_order
 
-    @staticmethod
-    def _csv_list(values):
-        return " ".join(str(int(v)) for v in values)
-
-    @staticmethod
-    def _csv_edges(edges):
-        return " ".join(f"{int(u)}-{int(v)}" for u, v in edges)
-
-    @staticmethod
-    def _candidate_physical_nodes(candidate):
-        nodes = set()
-        for u, v in candidate.topology:
-            nodes.add(int(u))
-            nodes.add(int(v))
-        if not nodes:
-            nodes.update(int(v) for v in candidate.node_mapping.values())
-        return sorted(nodes)
-
     @staticmethod
     def _candidate_has_multi_qubit_body(candidate):
         return bool(getattr(candidate, "circuit_structure", ()))
@@ -1372,228 +1320,6 @@ def _apply_zero_cnot_candidate_exit_to_pi(pi, candidate, node_mapping):
                 pi_out[logical_q] = node_mapping[mapped_qstar]
         return pi_out
 
-    @staticmethod
-    def _immediate_multi_successors(partition_idx, DAG, layout_partitions):
-        successors = []
-        seen = set()
-        queue = deque(DAG[partition_idx])
-        while queue:
-            child = queue.popleft()
-            if child in seen:
-                continue
-            seen.add(child)
-            if layout_partitions[child]["is_single"]:
-                queue.extend(DAG[child])
-            else:
-                successors.append(child)
-        return successors
-
-    @staticmethod
-    def _support_overlap_summary(partition_idx, successors, layout_partitions):
-        support = set(layout_partitions[partition_idx]["involved_qbits"])
-        summary = []
-        max_overlap = 0
-        min_turnover = None
-        for child in successors:
-            child_support = set(layout_partitions[child]["involved_qbits"])
-            overlap = len(support & child_support)
-            turnover = min(len(support), len(child_support)) - overlap
-            max_overlap = max(max_overlap, overlap)
-            min_turnover = (
-                turnover
-                if min_turnover is None
-                else min(min_turnover, turnover)
-            )
-            summary.append(f"{child}:{overlap}/{turnover}")
-        return max_overlap, (0 if min_turnover is None else min_turnover), " ".join(summary)
-
-    @staticmethod
-    def _eligible_multi_frontier(resolved, IDAG, layout_partitions):
-        frontier = []
-        for idx, info in enumerate(layout_partitions):
-            if resolved[idx] or info["is_single"]:
-                continue
-            if all(resolved[parent] for parent in IDAG[idx]):
-                frontier.append(idx)
-        return frontier
-
-    def _write_cpp_routing_trace(
-        self,
-        trace_path,
-        steps,
-        pi_initial,
-        candidate_cache,
-        layout_partitions,
-        DAG,
-        IDAG,
-        N,
-    ):
-        """Write a CSV trace for the final selected C++ route."""
-        if not trace_path:
-            return
-
-        trace_dir = os.path.dirname(os.path.abspath(trace_path))
-        if trace_dir:
-            os.makedirs(trace_dir, exist_ok=True)
-
-        pi = [int(x) for x in pi_initial]
-        resolved = [False] * len(layout_partitions)
-        pending_swaps = []
-        cumulative_swaps = 0
-        cumulative_body_cnot = 0
-        rows = []
-
-        for route_step_idx, step in enumerate(steps):
-            kind = step[0]
-            if kind == "swap":
-                swaps = [(int(u), int(v)) for u, v in step[1]]
-                if swaps:
-                    pending_swaps.extend(swaps)
-                    pi = self._apply_swaps_to_pi(pi, swaps)
-                continue
-
-            if kind == "single":
-                partition_idx = int(step[1])
-                logical_qubits = tuple(layout_partitions[partition_idx]["involved_qbits"])
-                physical_qubit = int(step[2])
-                resolved[partition_idx] = True
-                rows.append({
-                    "row": len(rows),
-                    "route_step": route_step_idx,
-                    "kind": "single",
-                    "partition_idx": partition_idx,
-                    "candidate_idx": "",
-                    "topology_idx": "",
-                    "permutation_idx": "",
-                    "logical_qubits": self._csv_list(logical_qubits),
-                    "physical_nodes": str(physical_qubit),
-                    "topology_edges": "",
-                    "entry_layout": self._csv_list(
-                        pi[q] for q in logical_qubits
-                    ),
-                    "exit_layout": self._csv_list(
-                        pi[q] for q in logical_qubits
-                    ),
-                    "swap_count": 0,
-                    "routing_cnot": 0,
-                    "body_cnot": 0,
-                    "cumulative_swap_count": cumulative_swaps,
-                    "cumulative_routing_cnot": 3 * cumulative_swaps,
-                    "cumulative_body_cnot": cumulative_body_cnot,
-                    "frontier_size": len(
-                        self._eligible_multi_frontier(
-                            resolved, IDAG, layout_partitions
-                        )
-                    ),
-                    "successor_count": 0,
-                    "max_successor_overlap": 0,
-                    "min_successor_turnover": 0,
-                    "successor_overlap": "",
-                    "swaps": "",
-                })
-                continue
-
-            if kind != "partition":
-                continue
-
-            partition_idx = int(step[1])
-            candidate_idx = int(step[2])
-            candidate = candidate_cache[partition_idx][candidate_idx]
-            logical_qubits = tuple(int(q) for q in candidate.involved_qbits)
-            entry_layout = [int(pi[q]) for q in logical_qubits]
-            if self._candidate_is_layout_transparent(candidate):
-                dynamic_node_mapping = self._zero_cnot_dynamic_node_mapping(
-                    pi, candidate
-                )
-                exit_pi = self._apply_zero_cnot_candidate_exit_to_pi(
-                    pi, candidate, dynamic_node_mapping
-                )
-                physical_nodes = sorted(dynamic_node_mapping.values())
-                topology_edges = ""
-            else:
-                dynamic_node_mapping = None
-                exit_pi = self._apply_candidate_exit_to_pi(pi, candidate)
-                physical_nodes = self._candidate_physical_nodes(candidate)
-                topology_edges = self._csv_edges(candidate.topology)
-            exit_layout = [int(exit_pi[q]) for q in logical_qubits]
-            successors = self._immediate_multi_successors(
-                partition_idx, DAG, layout_partitions
-            )
-            max_overlap, min_turnover, overlap_summary = (
-                self._support_overlap_summary(
-                    partition_idx, successors, layout_partitions
-                )
-            )
-            frontier_size = len(
-                self._eligible_multi_frontier(
-                    resolved, IDAG, layout_partitions
-                )
-            )
-            swap_count = len(pending_swaps)
-            cumulative_swaps += swap_count
-            cumulative_body_cnot += int(candidate.cnot_count)
-            rows.append({
-                "row": len(rows),
-                "route_step": route_step_idx,
-                "kind": "partition",
-                "partition_idx": partition_idx,
-                "candidate_idx": candidate_idx,
-                "topology_idx": int(candidate.topology_idx),
-                "permutation_idx": int(candidate.permutation_idx),
-                "logical_qubits": self._csv_list(logical_qubits),
-                "physical_nodes": self._csv_list(physical_nodes),
-                "topology_edges": topology_edges,
-                "entry_layout": self._csv_list(entry_layout),
-                "exit_layout": self._csv_list(exit_layout),
-                "swap_count": swap_count,
-                "routing_cnot": 3 * swap_count,
-                "body_cnot": int(candidate.cnot_count),
-                "cumulative_swap_count": cumulative_swaps,
-                "cumulative_routing_cnot": 3 * cumulative_swaps,
-                "cumulative_body_cnot": cumulative_body_cnot,
-                "frontier_size": frontier_size,
-                "successor_count": len(successors),
-                "max_successor_overlap": max_overlap,
-                "min_successor_turnover": min_turnover,
-                "successor_overlap": overlap_summary,
-                "swaps": self._csv_edges(pending_swaps),
-            })
-            resolved[partition_idx] = True
-            pi = exit_pi
-            pending_swaps = []
-
-        fieldnames = [
-            "row",
-            "route_step",
-            "kind",
-            "partition_idx",
-            "candidate_idx",
-            "topology_idx",
-            "permutation_idx",
-            "logical_qubits",
-            "physical_nodes",
-            "topology_edges",
-            "entry_layout",
-            "exit_layout",
-            "swap_count",
-            "routing_cnot",
-            "body_cnot",
-            "cumulative_swap_count",
-            "cumulative_routing_cnot",
-            "cumulative_body_cnot",
-            "frontier_size",
-            "successor_count",
-            "max_successor_overlap",
-            "min_successor_turnover",
-            "successor_overlap",
-            "swaps",
-        ]
-        with open(trace_path, "w", newline="") as f:
-            writer = csv.DictWriter(f, fieldnames=fieldnames)
-            writer.writeheader()
-            writer.writerows(rows)
-        self._routing_trace_path = trace_path
-
 
     def _rank_layout_trials_by_actual_routing(
         self,
@@ -1696,8 +1422,6 @@ def Partition_Aware_Mapping(
         partition_body_cnot = 0
         routing_elapsed_before_cleanup = None
         cleanup_total = 0.0
-        final_route_steps = None
-        final_route_pi_initial = None
 
         if n_iterations == 0:
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
@@ -1783,7 +1507,6 @@ def Partition_Aware_Mapping(
                 best_pre_cleanup = None
                 best_routing_swap_cnot = 0
                 best_partition_body_cnot = 0
-                best_route_steps = None
 
                 for _, trial_pi, _, trace_pi_init, route_steps in top_layouts:
                     self._restore_single_qubit_circuits(
@@ -1843,7 +1566,6 @@ def Partition_Aware_Mapping(
                         best_pi = pi_out
                         best_routing_swap_cnot = trial_routing_cnot
                         best_partition_body_cnot = trial_partition_cnot
-                        best_route_steps = route_steps
 
                 final_cleanup_config = dict(cleanup_config)
                 final_cleanup_config['use_osr'] = 1
@@ -1862,8 +1584,6 @@ def Partition_Aware_Mapping(
                 pi = best_pi
                 routing_swap_cnot = best_routing_swap_cnot
                 partition_body_cnot = best_partition_body_cnot
-                final_route_steps = best_route_steps
-                final_route_pi_initial = best_pi_init
 
             else:
                 _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
@@ -1884,8 +1604,6 @@ def Partition_Aware_Mapping(
                     )
                     pi = np.asarray(best_pi, dtype=np.int64)
                     pi_initial = np.asarray(trace_pi_init, dtype=np.int64)
-                    final_route_steps = route_steps
-                    final_route_pi_initial = pi_initial.copy()
                 else:
                     F = self.get_initial_layer(IDAG, N, optimized_partitions)
                     partition_order, pi, pi_initial = self.Heuristic_Search(
@@ -1936,25 +1654,6 @@ def Partition_Aware_Mapping(
         self._routing_swap_cnot = routing_swap_cnot
         self._partition_body_cnot = partition_body_cnot
 
-        routing_trace_path = self.config.get("routing_trace_path", None)
-        if routing_trace_path:
-            if final_route_steps is not None and final_route_pi_initial is not None:
-                self._write_cpp_routing_trace(
-                    routing_trace_path,
-                    final_route_steps,
-                    final_route_pi_initial,
-                    candidate_cache,
-                    layout_partitions,
-                    DAG,
-                    IDAG,
-                    N,
-                )
-            else:
-                logging.warning(
-                    "routing_trace_path was set, but no C++ route steps were "
-                    "available for the selected route."
-                )
-
         return final_circuit, final_parameters, pi_initial, pi
 
     # ------------------------------------------------------------------------
diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 2df284162..76d5a3452 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -137,11 +137,7 @@ PYBIND11_MODULE(_sabre_router, m) {
         .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight)
         .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight)
         .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width)
-        .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth)
-        .def_readwrite("layout_trial_boundary_beam_width", &SabreConfig::layout_trial_boundary_beam_width)
-        .def_readwrite("layout_trial_boundary_beam_depth", &SabreConfig::layout_trial_boundary_beam_depth)
-        .def_readwrite("adaptive_boundary_beam", &SabreConfig::adaptive_boundary_beam)
-        .def_readwrite("successor_handoff_weight", &SabreConfig::successor_handoff_weight);
+        .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth);
 
     // Bind SabreRouter with data-converting constructor
     py::class_<SabreRouter>(m, "SabreRouter")

From 7950f7f547aa20a732c240284200c83ba442c861 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 10 May 2026 22:30:25 +0200
Subject: [PATCH 208/232] Parallelize PartAM actual route ranking

---
 squander/synthesis/PartAM.py | 31 ++++++++++++++++++++++++++++---
 1 file changed, 28 insertions(+), 3 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 4bb46200f..2cf9d6747 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1196,12 +1196,37 @@ def _run_layout_trials_cpp(
             actual_rank_top_k = len(heuristic_ranked)
         actual_rank_top_k = min(int(actual_rank_top_k), len(heuristic_ranked))
 
-        ranked = []
-        for heuristic_cost, trial_pi in heuristic_ranked[:actual_rank_top_k]:
+        actual_rank_inputs = heuristic_ranked[:actual_rank_top_k]
+
+        def route_rank_input(item):
+            heuristic_cost, trial_pi = item
             actual_cnot, pi_out, pi_init, steps = router.route_forward(
                 [int(x) for x in trial_pi]
             )
-            ranked.append((actual_cnot, pi_out, heuristic_cost, pi_init, steps))
+            return (actual_cnot, pi_out, heuristic_cost, pi_init, steps)
+
+        use_parallel_actual_routing = (
+            self.config.get("parallel_layout_trials", False)
+            and len(actual_rank_inputs) > 1
+        )
+        if use_parallel_actual_routing:
+            from concurrent.futures import ThreadPoolExecutor
+            workers = self.config.get("layout_trial_workers", 0)
+            if workers <= 0:
+                workers = min(len(actual_rank_inputs), _available_cpus())
+
+            with ThreadPoolExecutor(max_workers=workers) as pool:
+                futures = [
+                    pool.submit(route_rank_input, item)
+                    for item in actual_rank_inputs
+                ]
+                ranked = [f.result() for f in futures]
+        else:
+            ranked = [
+                route_rank_input(item)
+                for item in actual_rank_inputs
+            ]
+
         ranked.sort(key=lambda x: (x[0], x[2]))
         ranked.extend(
             (float("inf"), pi, cost, None, None)

From b204545eff72ed92d1ea8d64529a27b087c948d4 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 11 May 2026 00:12:59 +0200
Subject: [PATCH 209/232] Split PartAM layout and final routing beams

---
 squander/synthesis/PartAM.py | 123 +++++++++++++++++++++++++----------
 1 file changed, 88 insertions(+), 35 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2cf9d6747..291b48cbe 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -185,6 +185,8 @@ def __init__(self, config):
         self.config.setdefault('three_qubit_exit_weight', 1.0)
         self.config.setdefault('boundary_beam_width', 1)
         self.config.setdefault('boundary_beam_depth', 1)
+        self.config.setdefault('layout_boundary_beam_width', None)
+        self.config.setdefault('layout_boundary_beam_depth', None)
         self.config['partition_weight_model'] = 'window_turnover'
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
@@ -1102,39 +1104,65 @@ def _run_layout_trials_cpp(
     ):
         from squander.synthesis._sabre_router import SabreRouter, SabreConfig
 
-        cfg = SabreConfig()
-        cfg.prefilter_top_k = self.config.get('prefilter_top_k', 50)
-        if hasattr(cfg, 'prefilter_min_per_partition'):
-            cfg.prefilter_min_per_partition = self.config.get(
-                'prefilter_min_per_partition', 2
-            )
-        if hasattr(cfg, 'prefilter_min_3q'):
-            cfg.prefilter_min_3q = self.config.get('prefilter_min_3q', 12)
-        cfg.max_E_size = self.config.get('max_E_size', 20)
-        cfg.max_lookahead = self.config.get('max_lookahead', 4)
-        cfg.E_weight = self.config.get('E_weight', 0.5)
-        cfg.E_alpha = self.config.get('E_alpha', 1.0)
-        cfg.cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
-        cfg.sabre_iterations = n_iterations
-        cfg.n_layout_trials = max(1, n_trials)
-        cfg.random_seed = random_seed
-        cfg.decay_delta = self.config.get('decay_delta', 0.001)
-        cfg.swap_burst_budget = self.config.get('swap_burst_budget', 5)
-        cfg.path_tiebreak_weight = self.config.get(
-            'path_tiebreak_weight', 0.2
+        route_beam_width = self.config.get('boundary_beam_width', 1)
+        route_beam_depth = self.config.get('boundary_beam_depth', 1)
+        layout_beam_width = self.config.get(
+            'layout_boundary_beam_width', route_beam_width
         )
-        if hasattr(cfg, 'three_qubit_exit_weight'):
-            cfg.three_qubit_exit_weight = self.config.get(
-                'three_qubit_exit_weight', 1.0
-            )
-        if hasattr(cfg, 'boundary_beam_width'):
-            cfg.boundary_beam_width = self.config.get(
-                'boundary_beam_width', 1
-            )
-        if hasattr(cfg, 'boundary_beam_depth'):
-            cfg.boundary_beam_depth = self.config.get(
-                'boundary_beam_depth', 1
+        layout_beam_depth = self.config.get(
+            'layout_boundary_beam_depth', route_beam_depth
+        )
+        if layout_beam_width is None:
+            layout_beam_width = route_beam_width
+        if layout_beam_depth is None:
+            layout_beam_depth = route_beam_depth
+
+        def make_cpp_config(beam_width, beam_depth):
+            cfg = SabreConfig()
+            cfg.prefilter_top_k = self.config.get('prefilter_top_k', 50)
+            if hasattr(cfg, 'prefilter_min_per_partition'):
+                cfg.prefilter_min_per_partition = self.config.get(
+                    'prefilter_min_per_partition', 2
+                )
+            if hasattr(cfg, 'prefilter_min_3q'):
+                cfg.prefilter_min_3q = self.config.get('prefilter_min_3q', 12)
+            cfg.max_E_size = self.config.get('max_E_size', 20)
+            cfg.max_lookahead = self.config.get('max_lookahead', 4)
+            cfg.E_weight = self.config.get('E_weight', 0.5)
+            cfg.E_alpha = self.config.get('E_alpha', 1.0)
+            cfg.cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0)
+            cfg.sabre_iterations = n_iterations
+            cfg.n_layout_trials = max(1, n_trials)
+            cfg.random_seed = random_seed
+            cfg.decay_delta = self.config.get('decay_delta', 0.001)
+            cfg.swap_burst_budget = self.config.get('swap_burst_budget', 5)
+            cfg.path_tiebreak_weight = self.config.get(
+                'path_tiebreak_weight', 0.2
             )
+            if hasattr(cfg, 'three_qubit_exit_weight'):
+                cfg.three_qubit_exit_weight = self.config.get(
+                    'three_qubit_exit_weight', 1.0
+                )
+            if hasattr(cfg, 'boundary_beam_width'):
+                cfg.boundary_beam_width = beam_width
+            if hasattr(cfg, 'boundary_beam_depth'):
+                cfg.boundary_beam_depth = beam_depth
+            return cfg
+
+        layout_cfg = make_cpp_config(layout_beam_width, layout_beam_depth)
+        route_cfg = make_cpp_config(route_beam_width, route_beam_depth)
+        use_distinct_route_router = (
+            layout_beam_width != route_beam_width
+            or layout_beam_depth != route_beam_depth
+        )
+        self._routing_layout_boundary_beam = (
+            int(layout_beam_width),
+            int(layout_beam_depth),
+        )
+        self._routing_boundary_beam = (
+            int(route_beam_width),
+            int(route_beam_depth),
+        )
         canonical_fwd = self._build_canonical_neighbor_data(
             scoring_partitions, reverse=False
         )
@@ -1151,11 +1179,18 @@ def _run_layout_trials_cpp(
             for lp in layout_partitions
         ]
 
-        router = SabreRouter(
-            cfg, D, self._adj, DAG, IDAG,
+        trial_router = SabreRouter(
+            layout_cfg, D, self._adj, DAG, IDAG,
             candidate_cache_lists, layout_partitions_lists,
             canonical_fwd, canonical_rev,
         )
+        router = trial_router
+        if use_distinct_route_router:
+            router = SabreRouter(
+                route_cfg, D, self._adj, DAG, IDAG,
+                candidate_cache_lists, layout_partitions_lists,
+                canonical_fwd, canonical_rev,
+            )
 
         seeded_pi_list = [int(x) for x in seeded_pi]
         n_trials_actual = max(1, n_trials)
@@ -1167,8 +1202,12 @@ def _run_layout_trials_cpp(
         )
 
         if not use_parallel:
+            self._routing_layout_trial_workers = 1
+            layout_trials_t0 = time.time()
             trial_results = [
-                router.run_trial(idx, seeded_pi_list, n_iterations, n_trials_actual)
+                trial_router.run_trial(
+                    idx, seeded_pi_list, n_iterations, n_trials_actual
+                )
                 for idx in trial_indices
             ]
         else:
@@ -1177,12 +1216,21 @@ def _run_layout_trials_cpp(
             if workers <= 0:
                 workers = min(n_trials_actual, _available_cpus())
 
+            self._routing_layout_trial_workers = workers
+            layout_trials_t0 = time.time()
             with ThreadPoolExecutor(max_workers=workers) as pool:
                 futures = [
-                    pool.submit(router.run_trial, idx, seeded_pi_list, n_iterations, n_trials_actual)
+                    pool.submit(
+                        trial_router.run_trial,
+                        idx,
+                        seeded_pi_list,
+                        n_iterations,
+                        n_trials_actual,
+                    )
                     for idx in trial_indices
                 ]
                 trial_results = [f.result() for f in futures]
+        self._routing_layout_trials_time = time.time() - layout_trials_t0
 
         heuristic_ranked = sorted(trial_results, key=lambda x: x[0])
         actual_rank_default = min(
@@ -1197,6 +1245,7 @@ def _run_layout_trials_cpp(
         actual_rank_top_k = min(int(actual_rank_top_k), len(heuristic_ranked))
 
         actual_rank_inputs = heuristic_ranked[:actual_rank_top_k]
+        self._routing_actual_rank_count = len(actual_rank_inputs)
 
         def route_rank_input(item):
             heuristic_cost, trial_pi = item
@@ -1209,12 +1258,14 @@ def route_rank_input(item):
             self.config.get("parallel_layout_trials", False)
             and len(actual_rank_inputs) > 1
         )
+        actual_rank_t0 = time.time()
         if use_parallel_actual_routing:
             from concurrent.futures import ThreadPoolExecutor
             workers = self.config.get("layout_trial_workers", 0)
             if workers <= 0:
                 workers = min(len(actual_rank_inputs), _available_cpus())
 
+            self._routing_actual_rank_workers = workers
             with ThreadPoolExecutor(max_workers=workers) as pool:
                 futures = [
                     pool.submit(route_rank_input, item)
@@ -1222,10 +1273,12 @@ def route_rank_input(item):
                 ]
                 ranked = [f.result() for f in futures]
         else:
+            self._routing_actual_rank_workers = 1
             ranked = [
                 route_rank_input(item)
                 for item in actual_rank_inputs
             ]
+        self._routing_actual_rank_time = time.time() - actual_rank_t0
 
         ranked.sort(key=lambda x: (x[0], x[2]))
         ranked.extend(

From deafd781ca5572e075d5c84c2cf31d8305addb87 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 11 May 2026 11:01:15 +0200
Subject: [PATCH 210/232] Restore seqpam ILP partitioning

---
 .../qgd_Wide_Circuit_Optimization.py          | 160 +++++-------------
 1 file changed, 38 insertions(+), 122 deletions(-)

diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
index e6f17568d..423038a3b 100644
--- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py
+++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py
@@ -142,7 +142,7 @@ def generate_squander_seqpam(squander_config, block_size):
     from bqskit.compiler import Workflow, BasePass
 
     class SquanderILPPartitioner(BasePass):
-        """Partition a bqskit circuit using squander's ILP."""
+        """Partition a bqskit circuit using Squander's ILP partitioner."""
 
         def __init__(self, block_size):
             self.block_size = block_size
@@ -150,11 +150,10 @@ def __init__(self, block_size):
         async def run(self, circuit, data):
             from bqskit.ir import Circuit as BQCircuit
             from bqskit.ir.lang.qasm2 import OPENQASM2Language
+            from qiskit import qasm2
             from qiskit import QuantumCircuit as QkCircuit
             from squander import Qiskit_IO
-            from squander.partitioning.ilp import (
-                get_all_partitions, _get_topo_order, ilp_global_optimal,
-            )
+            from squander.partitioning.partition import PartitionCircuit
 
             # Unfold any CircuitGate blocks (e.g. from a prior SubtopologySelectionPass)
             # so that bqskit op indices align 1:1 with squander gate indices after the
@@ -164,125 +163,42 @@ async def run(self, circuit, data):
 
             qasm_str = OPENQASM2Language().encode(flat_circuit)
             qk_circ = QkCircuit.from_qasm_str(qasm_str)
-            sqdr_circ, _ = Qiskit_IO.convert_Qiskit_to_Squander(qk_circ)
-
-            allparts, g, go, rgo, sq_chains, gate_to_qubit, _ = \
-                get_all_partitions(sqdr_circ, self.block_size)
-            gate_dict = {i: gate for i, gate in enumerate(sqdr_circ.get_Gates())}
-
-            L_parts, _ = ilp_global_optimal(allparts, g)
-
-            bqskit_ops = list(flat_circuit.operations_with_cycles())
-
-            sqc_pre     = {x[0]: x for x in sq_chains if rgo[x[0]]}
-            sqc_post    = {x[-1]: x for x in sq_chains if go[x[-1]]}
-            sqc_prepost = {x[0]: x for x in sq_chains
-                           if x[0] in sqc_pre and x[-1] in sqc_post}
-
-            # Build expanded gate_idxs per ILP partition.  The ILP operates on a
-            # graph with single-qubit chains contracted out, so only reinsert
-            # chains that are enclosed by a selected partition.  Do not absorb
-            # every intermediate operation on these qubits: BQSKit sees each
-            # CircuitGate as a synthesis block, and broad expansion creates
-            # large overlapping blocks that are expensive and can duplicate work.
-            expanded = {}
-            for i in L_parts:
-                part = allparts[i]
-                surrounded = {
-                    t for s in part for t in go[s]
-                    if t in sqc_prepost
-                    and go[sqc_prepost[t][-1]]
-                    and next(iter(go[sqc_prepost[t][-1]])) in part
-                }
-                gate_idxs = frozenset.union(part, *(sqc_prepost[v] for v in surrounded))
-                expanded[i] = gate_idxs
-
-            # Sort partitions by their minimum gate index to preserve original order
-            seen_parts = set()
-            sorted_parts = []
-            claimed_gates = set()
-            for i in L_parts:
-                gate_idxs = expanded[i] - claimed_gates
-                if not gate_idxs:
-                    continue
-                part_key = min(gate_idxs)
-                if part_key not in seen_parts:
-                    seen_parts.add(part_key)
-                    sorted_parts.append((part_key, gate_idxs))
-                    claimed_gates.update(gate_idxs)
-            sorted_parts.sort(key=lambda x: x[0])
-
-            # Map gate_idx -> sorted partition index
-            gate_to_part = {}
-            for pidx, (_, gate_idxs) in enumerate(sorted_parts):
-                for gi in gate_idxs:
-                    gate_to_part[gi] = pidx
-
-            # PAMRoutingPass expects permutation data for every non-barrier
-            # operation it may execute.  Keep gates outside ILP blocks wrapped as
-            # CircuitGates too, but group whole unclaimed 1q chains to avoid
-            # spawning one BQSKit task per single-qubit gate.
-            unclaimed_chain_by_gate = {}
-            for chain in sq_chains:
-                if all(gi not in gate_to_part for gi in chain):
-                    for gi in chain:
-                        unclaimed_chain_by_gate[gi] = chain
-
-            # Build partitioned circuit by iterating gates in original order
+            sqdr_circ, sqdr_parameters = Qiskit_IO.convert_Qiskit_to_Squander(
+                qk_circ
+            )
+            partitioned_circuit, parameters, _ = PartitionCircuit(
+                sqdr_circ,
+                sqdr_parameters,
+                self.block_size,
+                strategy="ilp",
+            )
+
             partitioned = BQCircuit(circuit.num_qudits, circuit.radixes)
-            built_parts = set()
-            built_chains = set()
-
-            for gi, (_, op) in enumerate(bqskit_ops):
-                pidx = gate_to_part.get(gi, -1)
-
-                if pidx >= 0 and pidx not in built_parts:
-                    built_parts.add(pidx)
-                    _, gate_idxs = sorted_parts[pidx]
-                    global_qudits = sorted({
-                        q for ggi in gate_idxs
-                        for q in gate_dict[ggi].get_Involved_Qbits()
-                    })
-                    local_map = {gq: l for l, gq in enumerate(global_qudits)}
-
-                    topo = _get_topo_order(
-                        {x: go[x] & gate_idxs for x in gate_idxs},
-                        {x: rgo[x] & gate_idxs for x in gate_idxs},
-                        gate_to_qubit,
-                    )
-                    sub = BQCircuit(len(global_qudits))
-                    for ggi in topo:
-                        _, gop = bqskit_ops[ggi]
-                        sub.append_gate(gop.gate, [local_map[q] for q in gop.location], gop.params)
-                    partitioned.append_circuit(sub, global_qudits, as_circuit_gate=True)
-
-                elif pidx < 0:
-                    chain = unclaimed_chain_by_gate.get(gi)
-                    if chain is not None:
-                        if chain in built_chains:
-                            continue
-                        built_chains.add(chain)
-                        global_qudits = list(gate_dict[chain[0]].get_Involved_Qbits())
-                        local_map = {gq: l for l, gq in enumerate(global_qudits)}
-                        sub = BQCircuit(len(global_qudits))
-                        for ggi in chain:
-                            _, gop = bqskit_ops[ggi]
-                            sub.append_gate(
-                                gop.gate,
-                                [local_map[q] for q in gop.location],
-                                gop.params,
-                            )
-                        partitioned.append_circuit(
-                            sub, global_qudits, as_circuit_gate=True
-                        )
-                    else:
-                        sub = BQCircuit(len(op.location))
-                        sub.append_gate(
-                            op.gate, list(range(len(op.location))), op.params
-                        )
-                        partitioned.append_circuit(
-                            sub, list(op.location), as_circuit_gate=True
-                        )
+            qasm = OPENQASM2Language()
+
+            for subcircuit in partitioned_circuit.get_Gates():
+                global_qudits = list(subcircuit.get_Qbits())
+                if not global_qudits:
+                    continue
+
+                start = subcircuit.get_Parameter_Start_Index()
+                stop = start + subcircuit.get_Parameter_Num()
+                sub_parameters = parameters[start:stop]
+                local_map = {q: i for i, q in enumerate(global_qudits)}
+                local_subcircuit = subcircuit.Remap_Qbits(
+                    local_map,
+                    len(global_qudits),
+                )
+                local_qiskit = Qiskit_IO.get_Qiskit_Circuit(
+                    local_subcircuit,
+                    sub_parameters,
+                )
+                local_bqskit = qasm.decode(qasm2.dumps(local_qiskit))
+                partitioned.append_circuit(
+                    local_bqskit,
+                    global_qudits,
+                    as_circuit_gate=True,
+                )
 
             circuit.become(partitioned, False)
 

From 8e31cd305ab4ac7fe9b783df8bee5f0366bc4ed0 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Mon, 11 May 2026 11:09:15 +0200
Subject: [PATCH 211/232] Use flat circuit structure for PartAM candidates

---
 squander/synthesis/PartAM_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 591722d5f..724ab4fb0 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -443,11 +443,12 @@ def __init__(self, N, mini_topologies, involved_qbits, qubit_map, topology=None,
         self._topology_cache = topology_cache
 
     def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
+        flat_circuit = synthesised_circuit.get_Flat_Circuit()
         self.permutations_pairs[topology_idx].append(permutations_pair)
         self.synthesised_circuits[topology_idx].append(synthesised_circuit)
         self.synthesised_parameters[topology_idx].append(synthesised_parameters)
-        self.cnot_counts[topology_idx].append(synthesised_circuit.get_Gate_Nums().get('CNOT', 0))
-        self.circuit_structures[topology_idx].append(self.extract_circuit_structure(synthesised_circuit))
+        self.cnot_counts[topology_idx].append(flat_circuit.get_Gate_Nums().get('CNOT', 0))
+        self.circuit_structures[topology_idx].append(self.extract_circuit_structure(flat_circuit))
     
     def extract_circuit_structure(self, circuit):
         circuit_structure = []

From 29c3bfd650417b3d4feec301e0a53a02ceb4221f Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 12 May 2026 11:30:47 +0200
Subject: [PATCH 212/232] fix topology check

---
 squander/synthesis/PartAM_utils.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 724ab4fb0..fad346b79 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -634,24 +634,28 @@ class PartitionScoreData:
 
 def check_circuit_compatibility(circuit: Circuit, topology):
     circuit_topology = []
-    gates = circuit.get_Gates()
-    for gate in gates:
+
+    def collect_two_qubit_edges(gate):
+        if isinstance(gate, Circuit):
+            for subgate in gate.get_Gates():
+                collect_two_qubit_edges(subgate)
+            return
+
         qubits = gate.get_Involved_Qbits()
         if len(qubits) == 1:
-            continue
-        elif len(qubits) == 2:
+            return
+        if len(qubits) == 2:
             qubits = tuple(qubits)
             if qubits not in circuit_topology and qubits[::-1] not in circuit_topology:
                 circuit_topology.append(qubits)
-        else:
-            gates_new = gate.get_Gates()
-            for gate_new in gates_new:
-                qubits_new = gate_new.get_Involved_Qbits()
-                if len(qubits_new)==1:
-                    continue
-                qubits_new = tuple(qubits_new)
-                if qubits_new not in circuit_topology and qubits_new[::-1] not in circuit_topology:
-                    circuit_topology.append(qubits_new)
+            return
+
+        for subgate in gate.get_Gates():
+            collect_two_qubit_edges(subgate)
+
+    for gate in circuit.get_Gates():
+        collect_two_qubit_edges(gate)
+
     for qubits in circuit_topology:
         if qubits not in topology and qubits[::-1] not in topology:
             return False

From d58979b2670791d3077bce07d10553007214ac35 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 12 May 2026 12:07:28 +0200
Subject: [PATCH 213/232] Restore PartAM routing trace

---
 squander/synthesis/PartAM.py       | 280 +++++++++++++++++++++++++++++
 squander/synthesis/PartAM_utils.py |   2 +
 2 files changed, 282 insertions(+)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 291b48cbe..cab2e8930 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -1,6 +1,7 @@
 """
 This is an implementation of Partition Aware Mapping.
 """
+import csv
 import logging
 import multiprocessing as mp
 import os
@@ -187,6 +188,7 @@ def __init__(self, config):
         self.config.setdefault('boundary_beam_depth', 1)
         self.config.setdefault('layout_boundary_beam_width', None)
         self.config.setdefault('layout_boundary_beam_depth', None)
+        self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
@@ -1360,6 +1362,24 @@ def _partition_order_from_cpp_steps(
                 partition_order.append(part)
         return partition_order
 
+    @staticmethod
+    def _csv_list(values):
+        return " ".join(str(int(v)) for v in values)
+
+    @staticmethod
+    def _csv_edges(edges):
+        return " ".join(f"{int(u)}-{int(v)}" for u, v in edges)
+
+    @staticmethod
+    def _candidate_physical_nodes(candidate):
+        nodes = set()
+        for u, v in candidate.topology:
+            nodes.add(int(u))
+            nodes.add(int(v))
+        if not nodes:
+            nodes.update(int(v) for v in candidate.node_mapping.values())
+        return sorted(nodes)
+
     @staticmethod
     def _candidate_has_multi_qubit_body(candidate):
         return bool(getattr(candidate, "circuit_structure", ()))
@@ -1398,6 +1418,233 @@ def _apply_zero_cnot_candidate_exit_to_pi(pi, candidate, node_mapping):
                 pi_out[logical_q] = node_mapping[mapped_qstar]
         return pi_out
 
+    @staticmethod
+    def _immediate_multi_successors(partition_idx, DAG, layout_partitions):
+        successors = []
+        seen = set()
+        queue = deque(DAG[partition_idx])
+        while queue:
+            child = queue.popleft()
+            if child in seen:
+                continue
+            seen.add(child)
+            if layout_partitions[child]["is_single"]:
+                queue.extend(DAG[child])
+            else:
+                successors.append(child)
+        return successors
+
+    @staticmethod
+    def _support_overlap_summary(partition_idx, successors, layout_partitions):
+        support = set(layout_partitions[partition_idx]["involved_qbits"])
+        summary = []
+        max_overlap = 0
+        min_turnover = None
+        for child in successors:
+            child_support = set(layout_partitions[child]["involved_qbits"])
+            overlap = len(support & child_support)
+            turnover = min(len(support), len(child_support)) - overlap
+            max_overlap = max(max_overlap, overlap)
+            min_turnover = (
+                turnover
+                if min_turnover is None
+                else min(min_turnover, turnover)
+            )
+            summary.append(f"{child}:{overlap}/{turnover}")
+        return (
+            max_overlap,
+            0 if min_turnover is None else min_turnover,
+            " ".join(summary),
+        )
+
+    @staticmethod
+    def _eligible_multi_frontier(resolved, IDAG, layout_partitions):
+        frontier = []
+        for idx, info in enumerate(layout_partitions):
+            if resolved[idx] or info["is_single"]:
+                continue
+            if all(resolved[parent] for parent in IDAG[idx]):
+                frontier.append(idx)
+        return frontier
+
+    def _write_cpp_routing_trace(
+        self,
+        trace_path,
+        steps,
+        pi_initial,
+        candidate_cache,
+        layout_partitions,
+        DAG,
+        IDAG,
+        N,
+    ):
+        """Write a CSV trace for the final selected C++ route."""
+        if not trace_path:
+            return
+
+        trace_dir = os.path.dirname(os.path.abspath(trace_path))
+        if trace_dir:
+            os.makedirs(trace_dir, exist_ok=True)
+
+        pi = [int(x) for x in pi_initial]
+        resolved = [False] * len(layout_partitions)
+        pending_swaps = []
+        cumulative_swaps = 0
+        cumulative_body_cnot = 0
+        rows = []
+
+        for route_step_idx, step in enumerate(steps):
+            kind = step[0]
+            if kind == "swap":
+                swaps = [(int(u), int(v)) for u, v in step[1]]
+                if swaps:
+                    pending_swaps.extend(swaps)
+                    pi = self._apply_swaps_to_pi(pi, swaps)
+                continue
+
+            if kind == "single":
+                partition_idx = int(step[1])
+                logical_qubits = tuple(
+                    layout_partitions[partition_idx]["involved_qbits"]
+                )
+                physical_qubit = int(step[2])
+                resolved[partition_idx] = True
+                rows.append({
+                    "row": len(rows),
+                    "route_step": route_step_idx,
+                    "kind": "single",
+                    "partition_idx": partition_idx,
+                    "candidate_idx": "",
+                    "topology_idx": "",
+                    "permutation_idx": "",
+                    "logical_qubits": self._csv_list(logical_qubits),
+                    "physical_nodes": str(physical_qubit),
+                    "topology_edges": "",
+                    "entry_layout": self._csv_list(
+                        pi[q] for q in logical_qubits
+                    ),
+                    "exit_layout": self._csv_list(
+                        pi[q] for q in logical_qubits
+                    ),
+                    "swap_count": 0,
+                    "routing_cnot": 0,
+                    "body_cnot": 0,
+                    "cumulative_swap_count": cumulative_swaps,
+                    "cumulative_routing_cnot": 3 * cumulative_swaps,
+                    "cumulative_body_cnot": cumulative_body_cnot,
+                    "frontier_size": len(
+                        self._eligible_multi_frontier(
+                            resolved, IDAG, layout_partitions
+                        )
+                    ),
+                    "successor_count": 0,
+                    "max_successor_overlap": 0,
+                    "min_successor_turnover": 0,
+                    "successor_overlap": "",
+                    "swaps": "",
+                })
+                continue
+
+            if kind != "partition":
+                continue
+
+            partition_idx = int(step[1])
+            candidate_idx = int(step[2])
+            candidate = candidate_cache[partition_idx][candidate_idx]
+            logical_qubits = tuple(int(q) for q in candidate.involved_qbits)
+            entry_layout = [int(pi[q]) for q in logical_qubits]
+            if self._candidate_is_layout_transparent(candidate):
+                dynamic_node_mapping = self._zero_cnot_dynamic_node_mapping(
+                    pi, candidate
+                )
+                exit_pi = self._apply_zero_cnot_candidate_exit_to_pi(
+                    pi, candidate, dynamic_node_mapping
+                )
+                physical_nodes = sorted(dynamic_node_mapping.values())
+                topology_edges = ""
+            else:
+                exit_pi = self._apply_candidate_exit_to_pi(pi, candidate)
+                physical_nodes = self._candidate_physical_nodes(candidate)
+                topology_edges = self._csv_edges(candidate.topology)
+            exit_layout = [int(exit_pi[q]) for q in logical_qubits]
+            successors = self._immediate_multi_successors(
+                partition_idx, DAG, layout_partitions
+            )
+            max_overlap, min_turnover, overlap_summary = (
+                self._support_overlap_summary(
+                    partition_idx, successors, layout_partitions
+                )
+            )
+            frontier_size = len(
+                self._eligible_multi_frontier(
+                    resolved, IDAG, layout_partitions
+                )
+            )
+            swap_count = len(pending_swaps)
+            cumulative_swaps += swap_count
+            cumulative_body_cnot += int(candidate.cnot_count)
+            rows.append({
+                "row": len(rows),
+                "route_step": route_step_idx,
+                "kind": "partition",
+                "partition_idx": partition_idx,
+                "candidate_idx": candidate_idx,
+                "topology_idx": int(candidate.topology_idx),
+                "permutation_idx": int(candidate.permutation_idx),
+                "logical_qubits": self._csv_list(logical_qubits),
+                "physical_nodes": self._csv_list(physical_nodes),
+                "topology_edges": topology_edges,
+                "entry_layout": self._csv_list(entry_layout),
+                "exit_layout": self._csv_list(exit_layout),
+                "swap_count": swap_count,
+                "routing_cnot": 3 * swap_count,
+                "body_cnot": int(candidate.cnot_count),
+                "cumulative_swap_count": cumulative_swaps,
+                "cumulative_routing_cnot": 3 * cumulative_swaps,
+                "cumulative_body_cnot": cumulative_body_cnot,
+                "frontier_size": frontier_size,
+                "successor_count": len(successors),
+                "max_successor_overlap": max_overlap,
+                "min_successor_turnover": min_turnover,
+                "successor_overlap": overlap_summary,
+                "swaps": self._csv_edges(pending_swaps),
+            })
+            resolved[partition_idx] = True
+            pi = exit_pi
+            pending_swaps = []
+
+        fieldnames = [
+            "row",
+            "route_step",
+            "kind",
+            "partition_idx",
+            "candidate_idx",
+            "topology_idx",
+            "permutation_idx",
+            "logical_qubits",
+            "physical_nodes",
+            "topology_edges",
+            "entry_layout",
+            "exit_layout",
+            "swap_count",
+            "routing_cnot",
+            "body_cnot",
+            "cumulative_swap_count",
+            "cumulative_routing_cnot",
+            "cumulative_body_cnot",
+            "frontier_size",
+            "successor_count",
+            "max_successor_overlap",
+            "min_successor_turnover",
+            "successor_overlap",
+            "swaps",
+        ]
+        with open(trace_path, "w", newline="") as f:
+            writer = csv.DictWriter(f, fieldnames=fieldnames)
+            writer.writeheader()
+            writer.writerows(rows)
+        self._routing_trace_path = trace_path
+
 
     def _rank_layout_trials_by_actual_routing(
         self,
@@ -1500,6 +1747,8 @@ def Partition_Aware_Mapping(
         partition_body_cnot = 0
         routing_elapsed_before_cleanup = None
         cleanup_total = 0.0
+        final_route_steps = None
+        final_route_pi_initial = None
 
         if n_iterations == 0:
             F = self.get_initial_layer(IDAG, N, optimized_partitions)
@@ -1585,6 +1834,8 @@ def Partition_Aware_Mapping(
                 best_pre_cleanup = None
                 best_routing_swap_cnot = 0
                 best_partition_body_cnot = 0
+                best_route_steps = None
+                best_route_pi_initial = None
 
                 for _, trial_pi, _, trace_pi_init, route_steps in top_layouts:
                     self._restore_single_qubit_circuits(
@@ -1644,6 +1895,12 @@ def Partition_Aware_Mapping(
                         best_pi = pi_out
                         best_routing_swap_cnot = trial_routing_cnot
                         best_partition_body_cnot = trial_partition_cnot
+                        best_route_steps = route_steps
+                        best_route_pi_initial = (
+                            pi_init.copy()
+                            if hasattr(pi_init, "copy")
+                            else list(pi_init)
+                        )
 
                 final_cleanup_config = dict(cleanup_config)
                 final_cleanup_config['use_osr'] = 1
@@ -1662,6 +1919,8 @@ def Partition_Aware_Mapping(
                 pi = best_pi
                 routing_swap_cnot = best_routing_swap_cnot
                 partition_body_cnot = best_partition_body_cnot
+                final_route_steps = best_route_steps
+                final_route_pi_initial = best_route_pi_initial
 
             else:
                 _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
@@ -1682,6 +1941,8 @@ def Partition_Aware_Mapping(
                     )
                     pi = np.asarray(best_pi, dtype=np.int64)
                     pi_initial = np.asarray(trace_pi_init, dtype=np.int64)
+                    final_route_steps = route_steps
+                    final_route_pi_initial = pi_initial.copy()
                 else:
                     F = self.get_initial_layer(IDAG, N, optimized_partitions)
                     partition_order, pi, pi_initial = self.Heuristic_Search(
@@ -1732,6 +1993,25 @@ def Partition_Aware_Mapping(
         self._routing_swap_cnot = routing_swap_cnot
         self._partition_body_cnot = partition_body_cnot
 
+        routing_trace_path = self.config.get("routing_trace_path", None)
+        if routing_trace_path:
+            if final_route_steps is not None and final_route_pi_initial is not None:
+                self._write_cpp_routing_trace(
+                    routing_trace_path,
+                    final_route_steps,
+                    final_route_pi_initial,
+                    candidate_cache,
+                    layout_partitions,
+                    DAG,
+                    IDAG,
+                    N,
+                )
+            else:
+                logging.warning(
+                    "routing_trace_path was set, but no C++ route steps were "
+                    "available for the selected route."
+                )
+
         return final_circuit, final_parameters, pi_initial, pi
 
     # ------------------------------------------------------------------------
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index fad346b79..51eba9e59 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -453,6 +453,8 @@ def add_result(self, permutations_pair, synthesised_circuit, synthesised_paramet
     def extract_circuit_structure(self, circuit):
         circuit_structure = []
         for gate in circuit.get_Gates():
+            if gate.get_Name() == "Permutation":
+                continue
             involved_qbits = gate.get_Involved_Qbits()
             if len(involved_qbits) != 1:
                 circuit_structure.append(involved_qbits)

From 90ed99d049fad0d28d4fb15b3630f79b0c54a86c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 12 May 2026 14:32:02 +0200
Subject: [PATCH 214/232] Normalize PartAM partitions to CNOT basis

---
 squander/synthesis/PartAM_utils.py | 19 ++++++++++++++++++-
 squander/utils.py                  | 21 +++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 51eba9e59..727d19061 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -443,9 +443,26 @@ def __init__(self, N, mini_topologies, involved_qbits, qubit_map, topology=None,
         self._topology_cache = topology_cache
 
     def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx):
+        from squander.utils import circuit_to_CNOT_basis
+
         flat_circuit = synthesised_circuit.get_Flat_Circuit()
+        flat_circuit, synthesised_parameters = circuit_to_CNOT_basis(
+            flat_circuit,
+            np.asarray(synthesised_parameters),
+        )
+        unsupported_multi = [
+            gate.get_Name()
+            for gate in flat_circuit.get_Gates()
+            if len(gate.get_Involved_Qbits()) > 1
+            and gate.get_Name() != "CNOT"
+        ]
+        if unsupported_multi:
+            raise ValueError(
+                "Partition synthesis produced non-CNOT multi-qubit gates "
+                f"after CNOT-basis conversion: {unsupported_multi}"
+            )
         self.permutations_pairs[topology_idx].append(permutations_pair)
-        self.synthesised_circuits[topology_idx].append(synthesised_circuit)
+        self.synthesised_circuits[topology_idx].append(flat_circuit)
         self.synthesised_parameters[topology_idx].append(synthesised_parameters)
         self.cnot_counts[topology_idx].append(flat_circuit.get_Gate_Nums().get('CNOT', 0))
         self.circuit_structures[topology_idx].append(self.extract_circuit_structure(flat_circuit))
diff --git a/squander/utils.py b/squander/utils.py
index a434702c6..1ea558a51 100644
--- a/squander/utils.py
+++ b/squander/utils.py
@@ -460,6 +460,7 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray):
         RXX,
         RYY,
         RZZ,
+        Permutation,
     )
 
     gates = circ.get_Gates()
@@ -652,6 +653,24 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray):
             circuit.add_CNOT(t2, t1)
             circuit.add_CNOT(t1, t2)
             params.append([])
+        elif isinstance(gate, Permutation):
+            pattern = list(gate.get_Pattern())
+            inverse_pattern = [0] * len(pattern)
+            for idx, mapped_idx in enumerate(pattern):
+                inverse_pattern[mapped_idx] = idx
+            current = list(range(len(pattern)))
+            for idx, target in enumerate(inverse_pattern):
+                swap_idx = current.index(target)
+                if swap_idx == idx:
+                    continue
+                circuit.add_CNOT(idx, swap_idx)
+                circuit.add_CNOT(swap_idx, idx)
+                circuit.add_CNOT(idx, swap_idx)
+                current[idx], current[swap_idx] = (
+                    current[swap_idx],
+                    current[idx],
+                )
+            params.append([])
         elif isinstance(gate, RXX):
             t1, t2 = gate.get_Target_Qbits()
             circuit.add_CNOT(t1, t2)
@@ -695,6 +714,8 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray):
                 ]
             )
 
+    if not params:
+        return circuit, np.array([])
     return circuit, np.concatenate(params)
 
 

From b4786eae95f587e001ff243ef660084016d0e5b2 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 13 May 2026 18:14:19 +0200
Subject: [PATCH 215/232] Add ilp partition weight model

---
 squander/synthesis/PartAM.py | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index cab2e8930..f3699a61e 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -189,13 +189,20 @@ def __init__(self, config):
         self.config.setdefault('layout_boundary_beam_width', None)
         self.config.setdefault('layout_boundary_beam_depth', None)
         self.config.setdefault('routing_trace_path', None)
-        self.config['partition_weight_model'] = 'window_turnover'
+        self.config.setdefault('partition_weight_model', 'window_turnover')
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
         allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive']
         if not strategy in allowed_strategies:
             raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.")
+        allowed_partition_weight_models = ['window_turnover', 'ilp']
+        if self.config['partition_weight_model'] not in allowed_partition_weight_models:
+            raise Exception(
+                f"The partition_weight_model should be either of "
+                f"{allowed_partition_weight_models}, got "
+                f"{self.config['partition_weight_model']}."
+            )
         
         # Initialize caches for performance optimization
         self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
@@ -653,13 +660,20 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
-        # PartAM keeps one partitioning strategy: window_turnover.
-        ilp_weights = self._parts_to_window_turnover_weights(
-            allparts,
-            gate_dict,
-            g,
-            pack_credit_weight=self.config['pack_credit_weight'],
-        )
+        partition_weight_model = self.config['partition_weight_model']
+        if partition_weight_model == 'ilp':
+            ilp_weights = None
+        elif partition_weight_model == 'window_turnover':
+            ilp_weights = self._parts_to_window_turnover_weights(
+                allparts,
+                gate_dict,
+                g,
+                pack_credit_weight=self.config['pack_credit_weight'],
+            )
+        else:
+            raise Exception(
+                f"Unknown partition_weight_model: {partition_weight_model}."
+            )
         L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----

From 2713ea7c82847ed87d8108342dd0ac1312f9e57e Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 14 May 2026 11:59:18 +0200
Subject: [PATCH 216/232] Auto stash before merge of "jn_partam_final" and
 "origin/jn_partam_final"

---
 squander/synthesis/PartAM.py | 363 ++++++++++++++++++++++++++++-------
 1 file changed, 291 insertions(+), 72 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index f3699a61e..1f39362b0 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -7,7 +7,7 @@
 import os
 import time
 from collections import deque, defaultdict
-from itertools import permutations
+from itertools import combinations, permutations
 from multiprocessing import Pool
 from typing import List, Optional
 
@@ -189,7 +189,19 @@ def __init__(self, config):
         self.config.setdefault('layout_boundary_beam_width', None)
         self.config.setdefault('layout_boundary_beam_depth', None)
         self.config.setdefault('routing_trace_path', None)
-        self.config.setdefault('partition_weight_model', 'window_turnover')
+        self.config['partition_weight_model'] = 'window_turnover'
+        self.config.setdefault('partition_density_weight', 4.0)
+        self.config.setdefault('partition_boundary_weight', 0.9)
+        self.config.setdefault('partition_depth_balance_weight', 0.25)
+        self.config.setdefault('partition_triangle_weight', 2.5)
+        self.config.setdefault('partition_triangle_threshold', 0.6)
+        self.config.setdefault('partition_triangle_window_radius', 8)
+        self.config.setdefault('partition_synthesis_cost_weight', 1.0)
+        self.config.setdefault('partition_min_cost', 0.05)
+        self.config.setdefault(
+            'partition_width_penalties',
+            {1: 0.25, 2: 1.0, 3: 4.0, 4: 16.0},
+        )
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -259,6 +271,147 @@ def _part_support_and_active_pairs(part, gate_dict):
                     )
         return frozenset(qubits_in_part), frozenset(active_pairs)
 
+    @staticmethod
+    def _two_qubit_gate_pair(gate):
+        qbs = list(gate.get_Involved_Qbits())
+        if len(qbs) != 2:
+            return None
+        return (min(qbs[0], qbs[1]), max(qbs[0], qbs[1]))
+
+    @staticmethod
+    def _part_two_qubit_gate_count(part, gate_dict):
+        count = 0
+        for gate_idx in part:
+            gate = gate_dict.get(gate_idx)
+            if gate is None:
+                continue
+            if qgd_Partition_Aware_Mapping._two_qubit_gate_pair(gate) is not None:
+                count += 1
+        return count
+
+    @staticmethod
+    def _synthesis_capacity(width):
+        return float(4 ** max(int(width), 1))
+
+    @staticmethod
+    def _configured_width_penalty(width, penalties):
+        if penalties is None:
+            penalties = {1: 0.25, 2: 1.0, 3: 4.0, 4: 16.0}
+
+        exact = None
+        if isinstance(penalties, dict):
+            exact = penalties.get(width)
+            if exact is None:
+                exact = penalties.get(str(width))
+        if exact is not None:
+            return float(exact)
+
+        if width <= 1:
+            return 0.25
+        if width == 2:
+            return 1.0
+        if width == 3:
+            return 4.0
+        if width == 4:
+            return 16.0
+        return 16.0 * (4.0 ** (width - 4))
+
+    @staticmethod
+    def _restricted_longest_path_depth(nodes, g, rg, topo_order):
+        nodes = set(nodes)
+        if not nodes:
+            return 0
+
+        depth = {}
+        best = 0
+        for gate_idx in topo_order:
+            if gate_idx not in nodes:
+                continue
+            pred_depth = 0
+            for pred in rg.get(gate_idx, ()):
+                if pred in nodes:
+                    pred_depth = max(pred_depth, depth.get(pred, 0))
+            depth[gate_idx] = pred_depth + 1
+            best = max(best, depth[gate_idx])
+        return best
+
+    @staticmethod
+    def _boundary_two_qubit_gate_count(part, support, g, rg, gate_dict,
+                                       max_partition_size):
+        """Count adjacent 2q gates that this candidate leaves over a boundary."""
+        support = set(support)
+        boundary_gates = set()
+        for gate_idx in part:
+            neighbors = set(g.get(gate_idx, ())) | set(rg.get(gate_idx, ()))
+            for other_idx in neighbors:
+                if other_idx in part:
+                    continue
+                gate = gate_dict.get(other_idx)
+                if gate is None:
+                    continue
+                if qgd_Partition_Aware_Mapping._two_qubit_gate_pair(gate) is None:
+                    continue
+                other_support = set(gate.get_Involved_Qbits())
+                if not (support & other_support):
+                    continue
+                if (
+                    max_partition_size is not None
+                    and len(support | other_support) > max_partition_size
+                ):
+                    continue
+                boundary_gates.add(other_idx)
+        return len(boundary_gates)
+
+    @staticmethod
+    def _pair_counts_in_topological_window(part, topo_order, topo_index,
+                                           gate_dict, radius):
+        if not part:
+            return defaultdict(int)
+
+        positions = [topo_index[g] for g in part if g in topo_index]
+        if not positions:
+            return defaultdict(int)
+
+        lo = max(0, min(positions) - radius)
+        hi = min(len(topo_order) - 1, max(positions) + radius)
+        pair_counts = defaultdict(int)
+        for pos in range(lo, hi + 1):
+            gate = gate_dict.get(topo_order[pos])
+            if gate is None:
+                continue
+            pair = qgd_Partition_Aware_Mapping._two_qubit_gate_pair(gate)
+            if pair is not None:
+                pair_counts[pair] += 1
+        return pair_counts
+
+    @staticmethod
+    def _triangle_density_from_pair_counts(support, pair_counts):
+        """Return a balanced local triangle score in [0, 1].
+
+        A chain has density zero because one triangle edge is missing.  A
+        balanced three-edge interaction has density one, while skewed triangles
+        are discounted by the weakest edge's share of the local interactions.
+        """
+        support = sorted(support)
+        if len(support) < 3:
+            return 0.0
+
+        best_density = 0.0
+        for a, b, c in combinations(support, 3):
+            counts = [
+                pair_counts.get((min(a, b), max(a, b)), 0),
+                pair_counts.get((min(a, c), max(a, c)), 0),
+                pair_counts.get((min(b, c), max(b, c)), 0),
+            ]
+            if min(counts) <= 0:
+                continue
+            total = sum(counts)
+            if total <= 0:
+                continue
+            density = (3.0 * min(counts)) / float(total)
+            best_density = max(best_density, min(density, 1.0))
+        return best_density
+
     @staticmethod
     def _turnover_between_supports(support_a, support_b):
         if len(support_a) < 2 or len(support_b) < 2:
@@ -290,86 +443,154 @@ def _average_turnover(part_idx, part, neighbor_gate_sets,
         return sum(turnovers) / len(turnovers)
 
     @staticmethod
-    def _parts_to_window_turnover_weights(allparts, gate_dict, g, pack_credit_weight=0.0):
-        """Linear ILP weights for 3q window continuity.
-
-        Dense 3q blocks are only routing-friendly when their local qubit window
-        persists into adjacent work.  A block like (0, i, j) followed by
-        (0, k, l) replaces two qubits in the 3q window, which is exactly the
-        expensive pattern on a line.  This cost keeps 2q parts at conceptual
-        cost one and charges 3q parts for active-pair count plus average
-        predecessor/successor window turnover.
+    def _parts_to_window_turnover_weights(allparts, gate_dict, g,
+                                          pack_credit_weight=0.0,
+                                          config=None,
+                                          max_partition_size=None):
+        """Linear ILP weights for local block quality.
+
+        The ILP accepts one linear cost per candidate part, so pairwise boundary
+        effects are approximated locally.  Lower cost is better.  The model
+        rewards dense blocks, penalizes adjacent 2q gates left across a block
+        boundary, charges a nonlinear synthesis-width cost, gives 3q blocks a
+        local triangle incentive only above a density threshold, and adds a
+        light critical-path depth balance penalty.
         """
+        cfg = {} if config is None else config
+        if max_partition_size is None:
+            max_partition_size = cfg.get("max_partition_size")
+
+        density_weight = float(cfg.get("partition_density_weight", 4.0))
+        boundary_weight = float(cfg.get("partition_boundary_weight", 0.9))
+        depth_balance_weight = float(
+            cfg.get("partition_depth_balance_weight", 0.25)
+        )
+        triangle_weight = float(cfg.get("partition_triangle_weight", 2.5))
+        triangle_threshold = float(
+            cfg.get("partition_triangle_threshold", 0.6)
+        )
+        triangle_threshold = min(max(triangle_threshold, 0.0), 1.0)
+        triangle_window_radius = max(
+            int(cfg.get("partition_triangle_window_radius", 8)),
+            0,
+        )
+        synthesis_cost_weight = float(
+            cfg.get("partition_synthesis_cost_weight", 1.0)
+        )
+        min_cost = float(cfg.get("partition_min_cost", 0.05))
+        width_penalties = cfg.get("partition_width_penalties")
+
         N = max(len(allparts), 1)
         supports = []
-        active_pairs_by_part = []
         for part in allparts:
-            support, active_pairs = (
+            support, _ = (
                 qgd_Partition_Aware_Mapping._part_support_and_active_pairs(
                     part,
                     gate_dict,
                 )
             )
             supports.append(support)
-            active_pairs_by_part.append(active_pairs)
 
-        gate_to_parts = defaultdict(list)
-        for part_idx, part in enumerate(allparts):
-            for gate_idx in part:
-                gate_to_parts[gate_idx].append(part_idx)
-
-        rg = defaultdict(set)
+        rg = {gate_idx: set() for gate_idx in g}
         for src, dsts in g.items():
             for dst in dsts:
-                rg[dst].add(src)
+                rg.setdefault(dst, set()).add(src)
+
+        gate_to_qubit = {
+            gate_idx: set(gate.get_Involved_Qbits())
+            for gate_idx, gate in gate_dict.items()
+            if gate is not None
+        }
+        topo_order = _get_topo_order(g, rg, gate_to_qubit) if g else []
+        topo_index = {gate_idx: idx for idx, gate_idx in enumerate(topo_order)}
+        global_depth = max(
+            qgd_Partition_Aware_Mapping._restricted_longest_path_depth(
+                set(g), g, rg, topo_order
+            ),
+            1,
+        )
 
         weights = []
         for part_idx, part in enumerate(allparts):
             support = supports[part_idx]
-            active_pairs = active_pairs_by_part[part_idx]
-            if len(support) < 3:
-                weights.append(0.0)
-                continue
-
-            succ_gate_sets = [g.get(gate_idx, set()) for gate_idx in part]
-            pred_gate_sets = [rg.get(gate_idx, set()) for gate_idx in part]
-            succ_turnover = qgd_Partition_Aware_Mapping._average_turnover(
-                part_idx,
-                part,
-                succ_gate_sets,
-                gate_to_parts,
-                allparts,
-                supports,
+            width = len(support)
+            width_penalty = (
+                qgd_Partition_Aware_Mapping._configured_width_penalty(
+                    width, width_penalties
+                )
+            )
+            two_qubit_gate_count = (
+                qgd_Partition_Aware_Mapping._part_two_qubit_gate_count(
+                    part, gate_dict
+                )
             )
-            pred_turnover = qgd_Partition_Aware_Mapping._average_turnover(
-                part_idx,
-                part,
-                pred_gate_sets,
-                gate_to_parts,
-                allparts,
-                supports,
+            block_density = (
+                two_qubit_gate_count
+                / qgd_Partition_Aware_Mapping._synthesis_capacity(width)
             )
-            boundary_turnover = len(support)
-            if succ_turnover is None:
-                succ_turnover = boundary_turnover
-            if pred_turnover is None:
-                pred_turnover = boundary_turnover
-            conceptual_cost = (
-                max(len(support), len(active_pairs), 1)
-                + succ_turnover
-                + pred_turnover
+            boundary_crossings = (
+                qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_count(
+                    part,
+                    support,
+                    g,
+                    rg,
+                    gate_dict,
+                    max_partition_size,
+                )
             )
+            pair_counts = (
+                qgd_Partition_Aware_Mapping._pair_counts_in_topological_window(
+                    part,
+                    topo_order,
+                    topo_index,
+                    gate_dict,
+                    triangle_window_radius,
+                )
+            )
+            triangle_density = (
+                qgd_Partition_Aware_Mapping._triangle_density_from_pair_counts(
+                    support,
+                    pair_counts,
+                )
+            )
+            if triangle_threshold >= 1.0:
+                triangle_bonus = 0.0
+            else:
+                triangle_bonus = triangle_weight * max(
+                    triangle_density - triangle_threshold,
+                    0.0,
+                ) / (1.0 - triangle_threshold)
+
+            internal_depth = (
+                qgd_Partition_Aware_Mapping._restricted_longest_path_depth(
+                    part, g, rg, topo_order
+                )
+            )
+            depth_fraction = internal_depth / float(global_depth)
+            depth_penalty = (
+                depth_balance_weight
+                * depth_fraction
+                * depth_fraction
+                * max(width_penalty, 1.0)
+            )
+
+            density_bonus = density_weight * block_density
             if pack_credit_weight:
-                k = len(support)
-                full_clique_pairs = k * (k - 1) // 2
-                if len(active_pairs) == full_clique_pairs:
-                    multi_qubit_gate_count = sum(
-                        1 for gate_idx in part
-                        if gate_dict.get(gate_idx) is not None
-                        and len(gate_dict[gate_idx].get_Involved_Qbits()) >= 2
-                    )
-                    conceptual_cost -= pack_credit_weight * max(multi_qubit_gate_count - 1, 0)
-            weights.append(max((conceptual_cost - 1.0) / N, 0.0))
+                density_bonus += (
+                    pack_credit_weight
+                    * block_density
+                    * max(two_qubit_gate_count - 1, 0)
+                )
+
+            conceptual_cost = (
+                synthesis_cost_weight * width_penalty
+                + boundary_weight * boundary_crossings
+                + depth_penalty
+                - density_bonus
+                - triangle_bonus
+            )
+            conceptual_cost = max(conceptual_cost, min_cost)
+            weights.append((conceptual_cost - 1.0) / N)
         return weights
 
     @staticmethod
@@ -660,20 +881,18 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post}
 
         # ---- Phase 2: ILP partition selection ----
+        # PartAM keeps one partitioning strategy: window_turnover.
+        ilp_weights = self._parts_to_window_turnover_weights(
+            allparts,
+            gate_dict,
+            g,
+            pack_credit_weight=self.config['pack_credit_weight'],
+            config=self.config,
+            max_partition_size=self.config["max_partition_size"],
+        )
         partition_weight_model = self.config['partition_weight_model']
         if partition_weight_model == 'ilp':
             ilp_weights = None
-        elif partition_weight_model == 'window_turnover':
-            ilp_weights = self._parts_to_window_turnover_weights(
-                allparts,
-                gate_dict,
-                g,
-                pack_credit_weight=self.config['pack_credit_weight'],
-            )
-        else:
-            raise Exception(
-                f"Unknown partition_weight_model: {partition_weight_model}."
-            )
         L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights)
 
         # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ----

From 43f14236e65cb4e546584214d97f5dfb000fd4b6 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 14 May 2026 15:49:52 +0200
Subject: [PATCH 217/232] Add routing aware partitioning

---
 squander/synthesis/PartAM.py | 38 +++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 1f39362b0..9b358de2c 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -197,6 +197,11 @@ def __init__(self, config):
         self.config.setdefault('partition_triangle_threshold', 0.6)
         self.config.setdefault('partition_triangle_window_radius', 8)
         self.config.setdefault('partition_synthesis_cost_weight', 1.0)
+        # Calibrated so a width-3 block whose three qubits are mutually
+        # non-adjacent on a sparse grid (sum_extra ≈ 3) takes ~+1.5 cost,
+        # enough to overpower a saturated triangle bonus (≤ 2.5) and pull
+        # the ILP back to width-2 unless the block is topology-aligned.
+        self.config.setdefault('partition_routing_span_weight', 0.5)
         self.config.setdefault('partition_min_cost', 0.05)
         self.config.setdefault(
             'partition_width_penalties',
@@ -446,7 +451,8 @@ def _average_turnover(part_idx, part, neighbor_gate_sets,
     def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                                           pack_credit_weight=0.0,
                                           config=None,
-                                          max_partition_size=None):
+                                          max_partition_size=None,
+                                          topology_distances=None):
         """Linear ILP weights for local block quality.
 
         The ILP accepts one linear cost per candidate part, so pairwise boundary
@@ -454,7 +460,10 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         rewards dense blocks, penalizes adjacent 2q gates left across a block
         boundary, charges a nonlinear synthesis-width cost, gives 3q blocks a
         local triangle incentive only above a density threshold, and adds a
-        light critical-path depth balance penalty.
+        light critical-path depth balance penalty.  When ``topology_distances``
+        is supplied, also adds ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over
+        the part's active 2q pairs, capturing the SWAP overhead of bringing
+        interacting qubits adjacent on the device coupling map.
         """
         cfg = {} if config is None else config
         if max_partition_size is None:
@@ -477,19 +486,31 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         synthesis_cost_weight = float(
             cfg.get("partition_synthesis_cost_weight", 1.0)
         )
+        routing_span_weight = float(
+            cfg.get("partition_routing_span_weight", 0.0)
+        )
         min_cost = float(cfg.get("partition_min_cost", 0.05))
         width_penalties = cfg.get("partition_width_penalties")
 
+        use_routing_span = (
+            topology_distances is not None and routing_span_weight
+        )
+        inf_distance_cap = float(
+            max(len(topology_distances) - 1, 1)
+        ) if topology_distances is not None else 0.0
+
         N = max(len(allparts), 1)
         supports = []
+        active_pairs_list = []
         for part in allparts:
-            support, _ = (
+            support, active_pairs = (
                 qgd_Partition_Aware_Mapping._part_support_and_active_pairs(
                     part,
                     gate_dict,
                 )
             )
             supports.append(support)
+            active_pairs_list.append(active_pairs)
 
         rg = {gate_idx: set() for gate_idx in g}
         for src, dsts in g.items():
@@ -538,6 +559,15 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                     max_partition_size,
                 )
             )
+            if use_routing_span:
+                span_cost = 0.0
+                for u, v in active_pairs_list[part_idx]:
+                    d = topology_distances[u][v]
+                    if not np.isfinite(d):
+                        d = inf_distance_cap
+                    span_cost += max(float(d) - 1.0, 0.0)
+            else:
+                span_cost = 0.0
             pair_counts = (
                 qgd_Partition_Aware_Mapping._pair_counts_in_topological_window(
                     part,
@@ -585,6 +615,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             conceptual_cost = (
                 synthesis_cost_weight * width_penalty
                 + boundary_weight * boundary_crossings
+                + routing_span_weight * span_cost
                 + depth_penalty
                 - density_bonus
                 - triangle_bonus
@@ -889,6 +920,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             pack_credit_weight=self.config['pack_credit_weight'],
             config=self.config,
             max_partition_size=self.config["max_partition_size"],
+            topology_distances=D,
         )
         partition_weight_model = self.config['partition_weight_model']
         if partition_weight_model == 'ilp':

From 752a2646dd0545f1a15b0d7209bba1784917e572 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 14 May 2026 16:38:01 +0200
Subject: [PATCH 218/232] recalibrate

---
 squander/synthesis/PartAM.py | 53 +++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 9b358de2c..bc455241d 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -190,7 +190,12 @@ def __init__(self, config):
         self.config.setdefault('layout_boundary_beam_depth', None)
         self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
-        self.config.setdefault('partition_density_weight', 4.0)
+        # With partition_synthesis_capacity set to CNOT lower bounds
+        # ({2:3, 3:14, 4:61}), block_density saturates at 1.0 for every
+        # width, so density_weight 1.0 gives a max bonus that exactly
+        # cancels the width-2 penalty (1.0) and leaves wider widths to
+        # earn the rest via triangle/topology terms.
+        self.config.setdefault('partition_density_weight', 1.0)
         self.config.setdefault('partition_boundary_weight', 0.9)
         self.config.setdefault('partition_depth_balance_weight', 0.25)
         self.config.setdefault('partition_triangle_weight', 2.5)
@@ -207,6 +212,14 @@ def __init__(self, config):
             'partition_width_penalties',
             {1: 0.25, 2: 1.0, 3: 4.0, 4: 16.0},
         )
+        # CNOT lower-bound synthesis budgets (Vidal–Dawson for w=2,
+        # Shende–Markov–Bullock for w=3, QSD for w=4). Sets block_density
+        # to "fraction of synthesis budget used" with consistent semantics
+        # across widths — saturation reward matches across all w.
+        self.config.setdefault(
+            'partition_synthesis_capacity',
+            {1: 1, 2: 3, 3: 14, 4: 61},
+        )
         strategy = self.config['strategy']
         self.config.setdefault('parallel_layout_trials', False)
         self.config.setdefault('layout_trial_workers', 0)
@@ -295,8 +308,37 @@ def _part_two_qubit_gate_count(part, gate_dict):
         return count
 
     @staticmethod
-    def _synthesis_capacity(width):
-        return float(4 ** max(int(width), 1))
+    def _synthesis_capacity(width, capacities=None):
+        """CNOT lower-bound budget for a generic width-qubit unitary.
+
+        Defaults reflect known synthesis bounds:
+          width 2 → 3   (Vidal–Dawson 2004, tight)
+          width 3 → 14  (Shende–Markov–Bullock 2004 counting bound)
+          width 4 → 61  (QSD recursion, practical upper bound)
+        Width 1 → 1 (no 2q gates; avoids division by zero in block_density).
+        Widths beyond 4 extrapolate as 61 · 4^(w−4), matching the asymptotic
+        (23/48)·4^w scaling of QSD.
+        """
+        if capacities is None:
+            capacities = {1: 1, 2: 3, 3: 14, 4: 61}
+
+        exact = None
+        if isinstance(capacities, dict):
+            exact = capacities.get(width)
+            if exact is None:
+                exact = capacities.get(str(width))
+        if exact is not None:
+            return float(max(exact, 1))
+
+        if width <= 1:
+            return 1.0
+        if width == 2:
+            return 3.0
+        if width == 3:
+            return 14.0
+        if width == 4:
+            return 61.0
+        return 61.0 * (4.0 ** (width - 4))
 
     @staticmethod
     def _configured_width_penalty(width, penalties):
@@ -491,6 +533,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         )
         min_cost = float(cfg.get("partition_min_cost", 0.05))
         width_penalties = cfg.get("partition_width_penalties")
+        synthesis_capacities = cfg.get("partition_synthesis_capacity")
 
         use_routing_span = (
             topology_distances is not None and routing_span_weight
@@ -547,7 +590,9 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             )
             block_density = (
                 two_qubit_gate_count
-                / qgd_Partition_Aware_Mapping._synthesis_capacity(width)
+                / qgd_Partition_Aware_Mapping._synthesis_capacity(
+                    width, synthesis_capacities
+                )
             )
             boundary_crossings = (
                 qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_count(

From 47179218e20ae50496c0212af457a54b82d5ab06 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 14 May 2026 23:21:35 +0200
Subject: [PATCH 219/232] refine partitioning

---
 squander/synthesis/PartAM.py | 52 ++++++++++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index bc455241d..2e3cb7b15 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -198,7 +198,12 @@ def __init__(self, config):
         self.config.setdefault('partition_density_weight', 1.0)
         self.config.setdefault('partition_boundary_weight', 0.9)
         self.config.setdefault('partition_depth_balance_weight', 0.25)
-        self.config.setdefault('partition_triangle_weight', 2.5)
+        # Triangle bonus rebalanced against the new density baseline:
+        # density_bonus already saturates at 1.0 for every width under the
+        # CNOT-budget capacities, so triangle should be a tie-breaker not a
+        # second density signal. 1.5 keeps grover-style adjacent triples
+        # competitive while letting span_cost suppress spread-out 3q blocks.
+        self.config.setdefault('partition_triangle_weight', 1.5)
         self.config.setdefault('partition_triangle_threshold', 0.6)
         self.config.setdefault('partition_triangle_window_radius', 8)
         self.config.setdefault('partition_synthesis_cost_weight', 1.0)
@@ -207,6 +212,12 @@ def __init__(self, config):
         # enough to overpower a saturated triangle bonus (≤ 2.5) and pull
         # the ILP back to width-2 unless the block is topology-aligned.
         self.config.setdefault('partition_routing_span_weight', 0.5)
+        # Averaged turnover with DAG successor partitions: penalises blocks
+        # whose support has little qubit overlap with the candidate parts
+        # immediately downstream. Captures inter-block routing churn that
+        # routing_span (intra-block spread) misses. Linear in ILP vars
+        # since each candidate gets a precomputed scalar.
+        self.config.setdefault('partition_turnover_weight', 0.5)
         self.config.setdefault('partition_min_cost', 0.05)
         self.config.setdefault(
             'partition_width_penalties',
@@ -505,7 +516,12 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         light critical-path depth balance penalty.  When ``topology_distances``
         is supplied, also adds ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over
         the part's active 2q pairs, capturing the SWAP overhead of bringing
-        interacting qubits adjacent on the device coupling map.
+        interacting qubits adjacent on the device coupling map.  Finally adds
+        ``turnover_weight · avg_turnover`` where ``avg_turnover`` averages
+        ``min(|supp_p|, |supp_q|) − |supp_p ∩ supp_q|`` over candidate
+        partitions ``q`` that immediately follow ``p`` in the gate DAG —
+        penalising blocks whose downstream neighbours have little qubit
+        overlap (high inter-block routing churn).
         """
         cfg = {} if config is None else config
         if max_partition_size is None:
@@ -531,6 +547,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         routing_span_weight = float(
             cfg.get("partition_routing_span_weight", 0.0)
         )
+        turnover_weight = float(cfg.get("partition_turnover_weight", 0.0))
         min_cost = float(cfg.get("partition_min_cost", 0.05))
         width_penalties = cfg.get("partition_width_penalties")
         synthesis_capacities = cfg.get("partition_synthesis_capacity")
@@ -560,6 +577,22 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             for dst in dsts:
                 rg.setdefault(dst, set()).add(src)
 
+        use_turnover = turnover_weight != 0.0
+        if use_turnover:
+            gate_to_parts = defaultdict(list)
+            for idx, part in enumerate(allparts):
+                for gate_idx in part:
+                    gate_to_parts[gate_idx].append(idx)
+            successor_gate_sets = []
+            for part in allparts:
+                downstream = set()
+                for gate_idx in part:
+                    downstream.update(g.get(gate_idx, ()))
+                successor_gate_sets.append(downstream)
+        else:
+            gate_to_parts = None
+            successor_gate_sets = None
+
         gate_to_qubit = {
             gate_idx: set(gate.get_Involved_Qbits())
             for gate_idx, gate in gate_dict.items()
@@ -613,6 +646,20 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                     span_cost += max(float(d) - 1.0, 0.0)
             else:
                 span_cost = 0.0
+            if use_turnover:
+                avg_turnover = (
+                    qgd_Partition_Aware_Mapping._average_turnover(
+                        part_idx,
+                        part,
+                        [successor_gate_sets[part_idx]],
+                        gate_to_parts,
+                        allparts,
+                        supports,
+                    )
+                )
+                turnover_cost = 0.0 if avg_turnover is None else float(avg_turnover)
+            else:
+                turnover_cost = 0.0
             pair_counts = (
                 qgd_Partition_Aware_Mapping._pair_counts_in_topological_window(
                     part,
@@ -661,6 +708,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                 synthesis_cost_weight * width_penalty
                 + boundary_weight * boundary_crossings
                 + routing_span_weight * span_cost
+                + turnover_weight * turnover_cost
                 + depth_penalty
                 - density_bonus
                 - triangle_bonus

From 905be03275801be6b0d313cb43519a460fdd0f58 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 15 May 2026 12:05:30 +0200
Subject: [PATCH 220/232] vft seeded layout

---
 squander/synthesis/PartAM.py | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 2e3cb7b15..61ae4e8b4 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -505,7 +505,8 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                                           pack_credit_weight=0.0,
                                           config=None,
                                           max_partition_size=None,
-                                          topology_distances=None):
+                                          topology_distances=None,
+                                          seed_layout=None):
         """Linear ILP weights for local block quality.
 
         The ILP accepts one linear cost per candidate part, so pairwise boundary
@@ -516,7 +517,11 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         light critical-path depth balance penalty.  When ``topology_distances``
         is supplied, also adds ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over
         the part's active 2q pairs, capturing the SWAP overhead of bringing
-        interacting qubits adjacent on the device coupling map.  Finally adds
+        interacting qubits adjacent on the device coupling map.  When
+        ``seed_layout`` is also supplied, ``D`` is permuted through the layout
+        so the span penalty reflects *physical* qubit distance under the
+        routing layer's chosen placement, not abstract logical distance.
+        Finally adds
         ``turnover_weight · avg_turnover`` where ``avg_turnover`` averages
         ``min(|supp_p|, |supp_q|) − |supp_p ∩ supp_q|`` over candidate
         partitions ``q`` that immediately follow ``p`` in the gate DAG —
@@ -555,9 +560,14 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         use_routing_span = (
             topology_distances is not None and routing_span_weight
         )
+        if topology_distances is not None and seed_layout is not None:
+            pi_arr = np.asarray(seed_layout, dtype=int)
+            layout_distances = topology_distances[np.ix_(pi_arr, pi_arr)]
+        else:
+            layout_distances = topology_distances
         inf_distance_cap = float(
-            max(len(topology_distances) - 1, 1)
-        ) if topology_distances is not None else 0.0
+            max(len(layout_distances) - 1, 1)
+        ) if layout_distances is not None else 0.0
 
         N = max(len(allparts), 1)
         supports = []
@@ -640,7 +650,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             if use_routing_span:
                 span_cost = 0.0
                 for u, v in active_pairs_list[part_idx]:
-                    d = topology_distances[u][v]
+                    d = layout_distances[u][v]
                     if not np.isfinite(d):
                         d = inf_distance_cap
                     span_cost += max(float(d) - 1.0, 0.0)
@@ -995,6 +1005,12 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
         # ---- Phase 0: Compute distance matrix ----
         D = self.compute_distances_bfs(qbit_num)
 
+        # ---- Phase 0b: Compute seed layout for layout-aware scoring ----
+        # Empty partitions list makes _compute_seeded_layout skip the
+        # partition-weighted greedy fallback; it returns identity if VF2
+        # and SabrePreLayout-augmented VF2 both fail (safe no-op).
+        seed_layout = self._compute_seeded_layout([], D, qbit_num, working_circ)
+
         # ---- Phase 1: Partition enumeration ----
         allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
         qbit_num_orig_circuit = working_circ.get_Qbit_Num()
@@ -1014,6 +1030,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             config=self.config,
             max_partition_size=self.config["max_partition_size"],
             topology_distances=D,
+            seed_layout=seed_layout,
         )
         partition_weight_model = self.config['partition_weight_model']
         if partition_weight_model == 'ilp':

From 370087748ca2b054ba01b990f970eae37e50817c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 15 May 2026 15:59:55 +0200
Subject: [PATCH 221/232] reowrk

---
 squander/synthesis/PartAM.py | 172 +++++++++++++++++++++++------------
 1 file changed, 116 insertions(+), 56 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 61ae4e8b4..02d99eafd 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -190,19 +190,17 @@ def __init__(self, config):
         self.config.setdefault('layout_boundary_beam_depth', None)
         self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
-        # With partition_synthesis_capacity set to CNOT lower bounds
-        # ({2:3, 3:14, 4:61}), block_density saturates at 1.0 for every
-        # width, so density_weight 1.0 gives a max bonus that exactly
-        # cancels the width-2 penalty (1.0) and leaves wider widths to
-        # earn the rest via triangle/topology terms.
+        # Absorption credit: each absorbed 2q gate pays a flat 1.0 credit
+        # regardless of partition width. With width_penalty[2]=1.0, a
+        # saturated w-2 (k=3) gets bonus 3 → net synth cost −2; a w-3 needs
+        # to absorb ≥4 gates to beat width_penalty[3]=4. Narrow vs wide is
+        # decided by total absorbed work + topology, not by normalisation.
         self.config.setdefault('partition_density_weight', 1.0)
         self.config.setdefault('partition_boundary_weight', 0.9)
         self.config.setdefault('partition_depth_balance_weight', 0.25)
-        # Triangle bonus rebalanced against the new density baseline:
-        # density_bonus already saturates at 1.0 for every width under the
-        # CNOT-budget capacities, so triangle should be a tie-breaker not a
-        # second density signal. 1.5 keeps grover-style adjacent triples
-        # competitive while letting span_cost suppress spread-out 3q blocks.
+        # Triangle is a small tie-breaker on top of absorption-credit density
+        # and discounted boundary. A truly triangular 3q block (Toffoli, etc.)
+        # gets a modest extra reward; chain-shaped 3q blocks don't.
         self.config.setdefault('partition_triangle_weight', 1.5)
         self.config.setdefault('partition_triangle_threshold', 0.6)
         self.config.setdefault('partition_triangle_window_radius', 8)
@@ -394,9 +392,9 @@ def _restricted_longest_path_depth(nodes, g, rg, topo_order):
         return best
 
     @staticmethod
-    def _boundary_two_qubit_gate_count(part, support, g, rg, gate_dict,
-                                       max_partition_size):
-        """Count adjacent 2q gates that this candidate leaves over a boundary."""
+    def _boundary_two_qubit_gate_set(part, support, g, rg, gate_dict,
+                                     max_partition_size):
+        """Return the set of adjacent 2q gates this candidate leaves over a boundary."""
         support = set(support)
         boundary_gates = set()
         for gate_idx in part:
@@ -418,7 +416,17 @@ def _boundary_two_qubit_gate_count(part, support, g, rg, gate_dict,
                 ):
                     continue
                 boundary_gates.add(other_idx)
-        return len(boundary_gates)
+        return boundary_gates
+
+    @staticmethod
+    def _boundary_two_qubit_gate_count(part, support, g, rg, gate_dict,
+                                       max_partition_size):
+        """Count adjacent 2q gates that this candidate leaves over a boundary."""
+        return len(
+            qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_set(
+                part, support, g, rg, gate_dict, max_partition_size,
+            )
+        )
 
     @staticmethod
     def _pair_counts_in_topological_window(part, topo_order, topo_index,
@@ -509,24 +517,35 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                                           seed_layout=None):
         """Linear ILP weights for local block quality.
 
-        The ILP accepts one linear cost per candidate part, so pairwise boundary
-        effects are approximated locally.  Lower cost is better.  The model
-        rewards dense blocks, penalizes adjacent 2q gates left across a block
-        boundary, charges a nonlinear synthesis-width cost, gives 3q blocks a
-        local triangle incentive only above a density threshold, and adds a
-        light critical-path depth balance penalty.  When ``topology_distances``
-        is supplied, also adds ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over
-        the part's active 2q pairs, capturing the SWAP overhead of bringing
-        interacting qubits adjacent on the device coupling map.  When
-        ``seed_layout`` is also supplied, ``D`` is permuted through the layout
-        so the span penalty reflects *physical* qubit distance under the
-        routing layer's chosen placement, not abstract logical distance.
-        Finally adds
-        ``turnover_weight · avg_turnover`` where ``avg_turnover`` averages
+        The ILP accepts one linear cost per candidate part, so pairwise
+        interactions are approximated locally.  Lower cost is better.
+
+        Core cost terms:
+          * ``synthesis_cost_weight · width_penalty[width]`` — non-linear
+            penalty for synthesising a wider unitary block.
+          * ``− density_weight · k_2q`` — *absorption credit*: each 2q gate the
+            partition absorbs pays a flat credit, independent of width.  This
+            puts narrow and wide partitions on equal per-gate footing; wider
+            partitions only win when they absorb enough gates to amortise the
+            larger synthesis cost.
+          * ``boundary_weight · effective_boundary_crossings`` — only counts
+            adjacent 2q gates that have *no* candidate home Q (disjoint from
+            this part) with base_cost(Q) ≤ base_cost(this part).  In other
+            words, gates that another comparably-cheap partition will absorb
+            are not double-penalised here.
+          * Triangle bonus (only above a density threshold), depth-balance
+            penalty, optional routing-span penalty, and optional turnover
+            penalty as documented per knob below.
+
+        When ``topology_distances`` is supplied, also adds
+        ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over the part's active 2q
+        pairs.  When ``seed_layout`` is also supplied, ``D`` is permuted
+        through the layout so the span penalty reflects *physical* qubit
+        distance under the routing layer's chosen placement.  When
+        ``turnover_weight`` is non-zero, also adds
+        ``turnover_weight · avg_turnover`` averaging
         ``min(|supp_p|, |supp_q|) − |supp_p ∩ supp_q|`` over candidate
-        partitions ``q`` that immediately follow ``p`` in the gate DAG —
-        penalising blocks whose downstream neighbours have little qubit
-        overlap (high inter-block routing churn).
+        partitions ``q`` immediately downstream of ``p`` in the gate DAG.
         """
         cfg = {} if config is None else config
         if max_partition_size is None:
@@ -587,12 +606,15 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             for dst in dsts:
                 rg.setdefault(dst, set()).add(src)
 
+        # Always build gate_to_parts — needed by the discounted boundary
+        # check below as well as by turnover.
+        gate_to_parts = defaultdict(list)
+        for idx, part in enumerate(allparts):
+            for gate_idx in part:
+                gate_to_parts[gate_idx].append(idx)
+
         use_turnover = turnover_weight != 0.0
         if use_turnover:
-            gate_to_parts = defaultdict(list)
-            for idx, part in enumerate(allparts):
-                for gate_idx in part:
-                    gate_to_parts[gate_idx].append(idx)
             successor_gate_sets = []
             for part in allparts:
                 downstream = set()
@@ -600,7 +622,6 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                     downstream.update(g.get(gate_idx, ()))
                 successor_gate_sets.append(downstream)
         else:
-            gate_to_parts = None
             successor_gate_sets = None
 
         gate_to_qubit = {
@@ -617,28 +638,50 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             1,
         )
 
-        weights = []
+        # Pre-pass: cache per-partition width_penalty, k_2q, and base_cost.
+        # base_cost = synthesis_cost - absorption_credit, the "fundamental"
+        # local cost used by the discounted boundary check to decide whether
+        # a boundary gate has a cheaper-or-equal home elsewhere in the
+        # candidate space.  Each absorbed 2q gate pays a flat credit of
+        # ``density_weight`` regardless of partition width — wider partitions
+        # only win when they absorb enough gates to amortize their (larger)
+        # synthesis cost, not because of arithmetic from a capacity-based
+        # normalisation.
+        two_qubit_gate_counts = []
+        width_penalties_cache = []
+        base_costs = []
         for part_idx, part in enumerate(allparts):
-            support = supports[part_idx]
-            width = len(support)
-            width_penalty = (
-                qgd_Partition_Aware_Mapping._configured_width_penalty(
-                    width, width_penalties
-                )
+            width = len(supports[part_idx])
+            wp = qgd_Partition_Aware_Mapping._configured_width_penalty(
+                width, width_penalties
             )
-            two_qubit_gate_count = (
-                qgd_Partition_Aware_Mapping._part_two_qubit_gate_count(
-                    part, gate_dict
-                )
+            k_2q = qgd_Partition_Aware_Mapping._part_two_qubit_gate_count(
+                part, gate_dict
             )
-            block_density = (
-                two_qubit_gate_count
-                / qgd_Partition_Aware_Mapping._synthesis_capacity(
-                    width, synthesis_capacities
+            width_penalties_cache.append(wp)
+            two_qubit_gate_counts.append(k_2q)
+            density_bonus_local = density_weight * k_2q
+            if pack_credit_weight:
+                density_bonus_local += (
+                    pack_credit_weight * k_2q * max(k_2q - 1, 0)
                 )
-            )
-            boundary_crossings = (
-                qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_count(
+            base_costs.append(synthesis_cost_weight * wp - density_bonus_local)
+
+        weights = []
+        for part_idx, part in enumerate(allparts):
+            support = supports[part_idx]
+            width = len(support)
+            width_penalty = width_penalties_cache[part_idx]
+            two_qubit_gate_count = two_qubit_gate_counts[part_idx]
+            part_base_cost = base_costs[part_idx]
+
+            # Discounted boundary: skip boundary gates that have a candidate
+            # home Q (disjoint from this part) with base_cost(Q) ≤ base_cost(P).
+            # Those gates aren't actually "stranded" by selecting P — a
+            # comparably cheap Q is available in the candidate space to absorb
+            # them, so the ILP would naturally pick it.
+            boundary_gate_set = (
+                qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_set(
                     part,
                     support,
                     g,
@@ -647,6 +690,20 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                     max_partition_size,
                 )
             )
+            boundary_crossings = 0
+            for other_idx in boundary_gate_set:
+                has_home = False
+                for q_idx in gate_to_parts.get(other_idx, ()):
+                    if q_idx == part_idx:
+                        continue
+                    q_part = allparts[q_idx]
+                    if q_part & part:
+                        continue
+                    if base_costs[q_idx] <= part_base_cost:
+                        has_home = True
+                        break
+                if not has_home:
+                    boundary_crossings += 1
             if use_routing_span:
                 span_cost = 0.0
                 for u, v in active_pairs_list[part_idx]:
@@ -706,11 +763,14 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                 * max(width_penalty, 1.0)
             )
 
-            density_bonus = density_weight * block_density
+            # Absorption-credit density: flat per-absorbed-gate reward,
+            # independent of partition width.  Pre-pass already used this same
+            # formula to compute base_costs for the boundary discount.
+            density_bonus = density_weight * two_qubit_gate_count
             if pack_credit_weight:
                 density_bonus += (
                     pack_credit_weight
-                    * block_density
+                    * two_qubit_gate_count
                     * max(two_qubit_gate_count - 1, 0)
                 )
 

From 49835f60c6edfe4d6bfaa9997cfa1262b4e7db94 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Fri, 15 May 2026 19:54:32 +0200
Subject: [PATCH 222/232] partitioning change

---
 squander/synthesis/PartAM.py | 60 ++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 27 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 02d99eafd..50bd16a27 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -190,11 +190,12 @@ def __init__(self, config):
         self.config.setdefault('layout_boundary_beam_depth', None)
         self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
-        # Absorption credit: each absorbed 2q gate pays a flat 1.0 credit
-        # regardless of partition width. With width_penalty[2]=1.0, a
-        # saturated w-2 (k=3) gets bonus 3 → net synth cost −2; a w-3 needs
-        # to absorb ≥4 gates to beat width_penalty[3]=4. Narrow vs wide is
-        # decided by total absorbed work + topology, not by normalisation.
+        # Capacity-normalised density: block_density = k_2q / capacity[w].
+        # With CNOT-budget capacities {2:3, 3:14, 4:61}, density saturates at
+        # 1.0 for every width, so density_weight 1.0 gives a max bonus that
+        # exactly cancels the width-2 penalty (1.0). Wider widths need extra
+        # signal (triangle / discounted boundary / negative span_cost) to
+        # justify selection.
         self.config.setdefault('partition_density_weight', 1.0)
         self.config.setdefault('partition_boundary_weight', 0.9)
         self.config.setdefault('partition_depth_balance_weight', 0.25)
@@ -523,16 +524,16 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         Core cost terms:
           * ``synthesis_cost_weight · width_penalty[width]`` — non-linear
             penalty for synthesising a wider unitary block.
-          * ``− density_weight · k_2q`` — *absorption credit*: each 2q gate the
-            partition absorbs pays a flat credit, independent of width.  This
-            puts narrow and wide partitions on equal per-gate footing; wider
-            partitions only win when they absorb enough gates to amortise the
-            larger synthesis cost.
+          * ``− density_weight · (k_2q / synthesis_capacity[width])`` —
+            capacity-normalised density reward.  Each width has the same
+            saturation level (1.0), implicitly pricing that wider partitions
+            don't compress to zero body CNOTs.
           * ``boundary_weight · effective_boundary_crossings`` — only counts
             adjacent 2q gates that have *no* candidate home Q (disjoint from
             this part) with base_cost(Q) ≤ base_cost(this part).  In other
             words, gates that another comparably-cheap partition will absorb
-            are not double-penalised here.
+            are not double-penalised here.  base_cost = synthesis_cost −
+            density_bonus, the same quantity precomputed in the pre-pass.
           * Triangle bonus (only above a density threshold), depth-balance
             penalty, optional routing-span penalty, and optional turnover
             penalty as documented per knob below.
@@ -638,17 +639,18 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             1,
         )
 
-        # Pre-pass: cache per-partition width_penalty, k_2q, and base_cost.
-        # base_cost = synthesis_cost - absorption_credit, the "fundamental"
-        # local cost used by the discounted boundary check to decide whether
-        # a boundary gate has a cheaper-or-equal home elsewhere in the
-        # candidate space.  Each absorbed 2q gate pays a flat credit of
-        # ``density_weight`` regardless of partition width — wider partitions
-        # only win when they absorb enough gates to amortize their (larger)
-        # synthesis cost, not because of arithmetic from a capacity-based
-        # normalisation.
+        # Pre-pass: cache per-partition width_penalty, k_2q, block_density,
+        # and base_cost.  base_cost = synthesis_cost − density_bonus is the
+        # fundamental local cost used by the discounted boundary check below
+        # to decide whether a boundary gate has a cheaper-or-equal home
+        # elsewhere in the candidate space.  Density is capacity-normalised
+        # (k_2q / synthesis_capacity[w]) so wider partitions need
+        # proportionally more absorbed gates to claim equal credit — this
+        # implicitly prices that a saturated w-3 doesn't compress to zero
+        # body CNOTs, it compresses to its lower-bound synthesis count.
         two_qubit_gate_counts = []
         width_penalties_cache = []
+        block_densities_cache = []
         base_costs = []
         for part_idx, part in enumerate(allparts):
             width = len(supports[part_idx])
@@ -658,12 +660,16 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             k_2q = qgd_Partition_Aware_Mapping._part_two_qubit_gate_count(
                 part, gate_dict
             )
+            block_density = k_2q / qgd_Partition_Aware_Mapping._synthesis_capacity(
+                width, synthesis_capacities
+            )
             width_penalties_cache.append(wp)
             two_qubit_gate_counts.append(k_2q)
-            density_bonus_local = density_weight * k_2q
+            block_densities_cache.append(block_density)
+            density_bonus_local = density_weight * block_density
             if pack_credit_weight:
                 density_bonus_local += (
-                    pack_credit_weight * k_2q * max(k_2q - 1, 0)
+                    pack_credit_weight * block_density * max(k_2q - 1, 0)
                 )
             base_costs.append(synthesis_cost_weight * wp - density_bonus_local)
 
@@ -763,14 +769,14 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                 * max(width_penalty, 1.0)
             )
 
-            # Absorption-credit density: flat per-absorbed-gate reward,
-            # independent of partition width.  Pre-pass already used this same
-            # formula to compute base_costs for the boundary discount.
-            density_bonus = density_weight * two_qubit_gate_count
+            # Capacity-normalised density: pre-pass already used this same
+            # formula when filling base_costs for the boundary discount.
+            block_density = block_densities_cache[part_idx]
+            density_bonus = density_weight * block_density
             if pack_credit_weight:
                 density_bonus += (
                     pack_credit_weight
-                    * two_qubit_gate_count
+                    * block_density
                     * max(two_qubit_gate_count - 1, 0)
                 )
 

From e0fdddd8b4a38facbc59fe70f7945ca931222d55 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sat, 16 May 2026 13:09:48 +0200
Subject: [PATCH 223/232] boundary rework

---
 squander/synthesis/PartAM.py | 106 ++++++++++-------------------------
 1 file changed, 29 insertions(+), 77 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 50bd16a27..da4ad181a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -210,7 +210,7 @@ def __init__(self, config):
         # non-adjacent on a sparse grid (sum_extra ≈ 3) takes ~+1.5 cost,
         # enough to overpower a saturated triangle bonus (≤ 2.5) and pull
         # the ILP back to width-2 unless the block is topology-aligned.
-        self.config.setdefault('partition_routing_span_weight', 0.5)
+        self.config.setdefault('partition_routing_span_weight', 2.0)
         # Averaged turnover with DAG successor partitions: penalises blocks
         # whose support has little qubit overlap with the candidate parts
         # immediately downstream. Captures inter-block routing churn that
@@ -528,15 +528,12 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             capacity-normalised density reward.  Each width has the same
             saturation level (1.0), implicitly pricing that wider partitions
             don't compress to zero body CNOTs.
-          * ``boundary_weight · effective_boundary_crossings`` — only counts
-            adjacent 2q gates that have *no* candidate home Q (disjoint from
-            this part) with base_cost(Q) ≤ base_cost(this part).  In other
-            words, gates that another comparably-cheap partition will absorb
-            are not double-penalised here.  base_cost = synthesis_cost −
-            density_bonus, the same quantity precomputed in the pre-pass.
+          * ``boundary_weight · boundary_crossings`` — penalises adjacent 2q
+            gates left across this candidate's boundary.
           * Triangle bonus (only above a density threshold), depth-balance
-            penalty, optional routing-span penalty, and optional turnover
-            penalty as documented per knob below.
+            penalty, optional routing-span penalty (heavily weighted to make
+            topology-spread wide partitions visibly expensive), and optional
+            turnover penalty as documented per knob below.
 
         When ``topology_distances`` is supplied, also adds
         ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over the part's active 2q
@@ -607,15 +604,12 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             for dst in dsts:
                 rg.setdefault(dst, set()).add(src)
 
-        # Always build gate_to_parts — needed by the discounted boundary
-        # check below as well as by turnover.
-        gate_to_parts = defaultdict(list)
-        for idx, part in enumerate(allparts):
-            for gate_idx in part:
-                gate_to_parts[gate_idx].append(idx)
-
         use_turnover = turnover_weight != 0.0
         if use_turnover:
+            gate_to_parts = defaultdict(list)
+            for idx, part in enumerate(allparts):
+                for gate_idx in part:
+                    gate_to_parts[gate_idx].append(idx)
             successor_gate_sets = []
             for part in allparts:
                 downstream = set()
@@ -623,6 +617,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                     downstream.update(g.get(gate_idx, ()))
                 successor_gate_sets.append(downstream)
         else:
+            gate_to_parts = None
             successor_gate_sets = None
 
         gate_to_qubit = {
@@ -639,55 +634,29 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             1,
         )
 
-        # Pre-pass: cache per-partition width_penalty, k_2q, block_density,
-        # and base_cost.  base_cost = synthesis_cost − density_bonus is the
-        # fundamental local cost used by the discounted boundary check below
-        # to decide whether a boundary gate has a cheaper-or-equal home
-        # elsewhere in the candidate space.  Density is capacity-normalised
-        # (k_2q / synthesis_capacity[w]) so wider partitions need
-        # proportionally more absorbed gates to claim equal credit — this
-        # implicitly prices that a saturated w-3 doesn't compress to zero
-        # body CNOTs, it compresses to its lower-bound synthesis count.
-        two_qubit_gate_counts = []
-        width_penalties_cache = []
-        block_densities_cache = []
-        base_costs = []
-        for part_idx, part in enumerate(allparts):
-            width = len(supports[part_idx])
-            wp = qgd_Partition_Aware_Mapping._configured_width_penalty(
-                width, width_penalties
-            )
-            k_2q = qgd_Partition_Aware_Mapping._part_two_qubit_gate_count(
-                part, gate_dict
-            )
-            block_density = k_2q / qgd_Partition_Aware_Mapping._synthesis_capacity(
-                width, synthesis_capacities
-            )
-            width_penalties_cache.append(wp)
-            two_qubit_gate_counts.append(k_2q)
-            block_densities_cache.append(block_density)
-            density_bonus_local = density_weight * block_density
-            if pack_credit_weight:
-                density_bonus_local += (
-                    pack_credit_weight * block_density * max(k_2q - 1, 0)
-                )
-            base_costs.append(synthesis_cost_weight * wp - density_bonus_local)
 
         weights = []
         for part_idx, part in enumerate(allparts):
             support = supports[part_idx]
             width = len(support)
-            width_penalty = width_penalties_cache[part_idx]
-            two_qubit_gate_count = two_qubit_gate_counts[part_idx]
-            part_base_cost = base_costs[part_idx]
-
-            # Discounted boundary: skip boundary gates that have a candidate
-            # home Q (disjoint from this part) with base_cost(Q) ≤ base_cost(P).
-            # Those gates aren't actually "stranded" by selecting P — a
-            # comparably cheap Q is available in the candidate space to absorb
-            # them, so the ILP would naturally pick it.
-            boundary_gate_set = (
-                qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_set(
+            width_penalty = (
+                qgd_Partition_Aware_Mapping._configured_width_penalty(
+                    width, width_penalties
+                )
+            )
+            two_qubit_gate_count = (
+                qgd_Partition_Aware_Mapping._part_two_qubit_gate_count(
+                    part, gate_dict
+                )
+            )
+            block_density = (
+                two_qubit_gate_count
+                / qgd_Partition_Aware_Mapping._synthesis_capacity(
+                    width, synthesis_capacities
+                )
+            )
+            boundary_crossings = (
+                qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_count(
                     part,
                     support,
                     g,
@@ -696,20 +665,6 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                     max_partition_size,
                 )
             )
-            boundary_crossings = 0
-            for other_idx in boundary_gate_set:
-                has_home = False
-                for q_idx in gate_to_parts.get(other_idx, ()):
-                    if q_idx == part_idx:
-                        continue
-                    q_part = allparts[q_idx]
-                    if q_part & part:
-                        continue
-                    if base_costs[q_idx] <= part_base_cost:
-                        has_home = True
-                        break
-                if not has_home:
-                    boundary_crossings += 1
             if use_routing_span:
                 span_cost = 0.0
                 for u, v in active_pairs_list[part_idx]:
@@ -769,9 +724,6 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                 * max(width_penalty, 1.0)
             )
 
-            # Capacity-normalised density: pre-pass already used this same
-            # formula when filling base_costs for the boundary discount.
-            block_density = block_densities_cache[part_idx]
             density_bonus = density_weight * block_density
             if pack_credit_weight:
                 density_bonus += (

From c5c1644794636e261748762259e2a5934b5e1010 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Sun, 17 May 2026 14:03:44 +0200
Subject: [PATCH 224/232] Add anti-chain penalty for wide low-triangle
 partitions

---
 squander/synthesis/PartAM.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index da4ad181a..81dbc9ad5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -217,6 +217,13 @@ def __init__(self, config):
         # routing_span (intra-block spread) misses. Linear in ILP vars
         # since each candidate gets a precomputed scalar.
         self.config.setdefault('partition_turnover_weight', 0.5)
+        # Anti-chain penalty: a width>=3 block whose qubits are wired as a
+        # chain (low triangle density) synthesises to more body CNOTs than
+        # the equivalent 2q blocks and adds a routing boundary with no
+        # entanglement payoff. triangle_bonus only ever rewards, so without
+        # this term such blocks are picked purely on boundary absorption.
+        # 0.0 disables it (recovers prior behaviour). Sweepable.
+        self.config.setdefault('partition_chain_penalty_weight', 2.0)
         self.config.setdefault('partition_min_cost', 0.05)
         self.config.setdefault(
             'partition_width_penalties',
@@ -570,6 +577,9 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             cfg.get("partition_routing_span_weight", 0.0)
         )
         turnover_weight = float(cfg.get("partition_turnover_weight", 0.0))
+        chain_penalty_weight = float(
+            cfg.get("partition_chain_penalty_weight", 0.0)
+        )
         min_cost = float(cfg.get("partition_min_cost", 0.05))
         width_penalties = cfg.get("partition_width_penalties")
         synthesis_capacities = cfg.get("partition_synthesis_capacity")
@@ -711,6 +721,20 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                     0.0,
                 ) / (1.0 - triangle_threshold)
 
+            if (
+                chain_penalty_weight
+                and width >= 3
+                and triangle_threshold > 0.0
+            ):
+                chain_deficit = max(
+                    triangle_threshold - triangle_density, 0.0
+                ) / triangle_threshold
+                chain_penalty = (
+                    chain_penalty_weight * chain_deficit * (width - 2)
+                )
+            else:
+                chain_penalty = 0.0
+
             internal_depth = (
                 qgd_Partition_Aware_Mapping._restricted_longest_path_depth(
                     part, g, rg, topo_order
@@ -737,6 +761,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
                 + boundary_weight * boundary_crossings
                 + routing_span_weight * span_cost
                 + turnover_weight * turnover_cost
+                + chain_penalty
                 + depth_penalty
                 - density_bonus
                 - triangle_bonus

From 33bf947c3687b59c0e2ba087e60befb54cd0a2f6 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Tue, 19 May 2026 12:35:38 +0200
Subject: [PATCH 225/232] Add separate chain penalty threshold knob

---
 squander/synthesis/PartAM.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 81dbc9ad5..850edd5e1 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -224,6 +224,13 @@ def __init__(self, config):
         # this term such blocks are picked purely on boundary absorption.
         # 0.0 disables it (recovers prior behaviour). Sweepable.
         self.config.setdefault('partition_chain_penalty_weight', 2.0)
+        # Knee of the anti-chain penalty. None reuses
+        # partition_triangle_threshold (the triangle-bonus knee), which
+        # leaves a dead zone: 3q blocks with mid triangle density get
+        # penalised yet earn no bonus. Set a lower value to spare
+        # moderately-entangled 3q blocks while still nuking near-pure
+        # chains (triD ~ 0). Sweepable.
+        self.config.setdefault('partition_chain_penalty_threshold', None)
         self.config.setdefault('partition_min_cost', 0.05)
         self.config.setdefault(
             'partition_width_penalties',
@@ -580,6 +587,13 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         chain_penalty_weight = float(
             cfg.get("partition_chain_penalty_weight", 0.0)
         )
+        chain_penalty_threshold = cfg.get("partition_chain_penalty_threshold")
+        if chain_penalty_threshold is None:
+            chain_penalty_threshold = triangle_threshold
+        else:
+            chain_penalty_threshold = min(
+                max(float(chain_penalty_threshold), 0.0), 1.0
+            )
         min_cost = float(cfg.get("partition_min_cost", 0.05))
         width_penalties = cfg.get("partition_width_penalties")
         synthesis_capacities = cfg.get("partition_synthesis_capacity")
@@ -724,11 +738,11 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             if (
                 chain_penalty_weight
                 and width >= 3
-                and triangle_threshold > 0.0
+                and chain_penalty_threshold > 0.0
             ):
                 chain_deficit = max(
-                    triangle_threshold - triangle_density, 0.0
-                ) / triangle_threshold
+                    chain_penalty_threshold - triangle_density, 0.0
+                ) / chain_penalty_threshold
                 chain_penalty = (
                     chain_penalty_weight * chain_deficit * (width - 2)
                 )

From efe98c1a041c0a431c79622bc7da4644210c4443 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 20 May 2026 17:21:44 +0200
Subject: [PATCH 226/232] Revert "Add separate chain penalty threshold knob"

This reverts commit 33bf947c3687b59c0e2ba087e60befb54cd0a2f6.
---
 squander/synthesis/PartAM.py | 20 +++-----------------
 1 file changed, 3 insertions(+), 17 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 850edd5e1..81dbc9ad5 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -224,13 +224,6 @@ def __init__(self, config):
         # this term such blocks are picked purely on boundary absorption.
         # 0.0 disables it (recovers prior behaviour). Sweepable.
         self.config.setdefault('partition_chain_penalty_weight', 2.0)
-        # Knee of the anti-chain penalty. None reuses
-        # partition_triangle_threshold (the triangle-bonus knee), which
-        # leaves a dead zone: 3q blocks with mid triangle density get
-        # penalised yet earn no bonus. Set a lower value to spare
-        # moderately-entangled 3q blocks while still nuking near-pure
-        # chains (triD ~ 0). Sweepable.
-        self.config.setdefault('partition_chain_penalty_threshold', None)
         self.config.setdefault('partition_min_cost', 0.05)
         self.config.setdefault(
             'partition_width_penalties',
@@ -587,13 +580,6 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         chain_penalty_weight = float(
             cfg.get("partition_chain_penalty_weight", 0.0)
         )
-        chain_penalty_threshold = cfg.get("partition_chain_penalty_threshold")
-        if chain_penalty_threshold is None:
-            chain_penalty_threshold = triangle_threshold
-        else:
-            chain_penalty_threshold = min(
-                max(float(chain_penalty_threshold), 0.0), 1.0
-            )
         min_cost = float(cfg.get("partition_min_cost", 0.05))
         width_penalties = cfg.get("partition_width_penalties")
         synthesis_capacities = cfg.get("partition_synthesis_capacity")
@@ -738,11 +724,11 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             if (
                 chain_penalty_weight
                 and width >= 3
-                and chain_penalty_threshold > 0.0
+                and triangle_threshold > 0.0
             ):
                 chain_deficit = max(
-                    chain_penalty_threshold - triangle_density, 0.0
-                ) / chain_penalty_threshold
+                    triangle_threshold - triangle_density, 0.0
+                ) / triangle_threshold
                 chain_penalty = (
                     chain_penalty_weight * chain_deficit * (width - 2)
                 )

From 5c999b6345b769b0650d2447537beb478b591f47 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Wed, 20 May 2026 17:22:00 +0200
Subject: [PATCH 227/232] depth balance exponetnial

---
 squander/synthesis/PartAM.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 81dbc9ad5..91f17332a 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -199,6 +199,13 @@ def __init__(self, config):
         self.config.setdefault('partition_density_weight', 1.0)
         self.config.setdefault('partition_boundary_weight', 0.9)
         self.config.setdefault('partition_depth_balance_weight', 0.25)
+        # Shape of the depth-balance penalty: depth_balance_weight *
+        # depth_fraction**exponent * max(width_penalty, 1). 2.0 (current
+        # squared form) over-penalises tall partitions in deep circuits
+        # (qft, multiplier) so they get fragmented into shallow wide
+        # slices, creating extra boundaries and routing SWAPs. Lower to
+        # 1.0 for a linear penalty. Sweepable.
+        self.config.setdefault('partition_depth_balance_exponent', 2.0)
         # Triangle is a small tie-breaker on top of absorption-credit density
         # and discounted boundary. A truly triangular 3q block (Toffoli, etc.)
         # gets a modest extra reward; chain-shaped 3q blocks don't.
@@ -561,6 +568,9 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
         depth_balance_weight = float(
             cfg.get("partition_depth_balance_weight", 0.25)
         )
+        depth_balance_exponent = float(
+            cfg.get("partition_depth_balance_exponent", 2.0)
+        )
         triangle_weight = float(cfg.get("partition_triangle_weight", 2.5))
         triangle_threshold = float(
             cfg.get("partition_triangle_threshold", 0.6)
@@ -743,8 +753,7 @@ def _parts_to_window_turnover_weights(allparts, gate_dict, g,
             depth_fraction = internal_depth / float(global_depth)
             depth_penalty = (
                 depth_balance_weight
-                * depth_fraction
-                * depth_fraction
+                * (depth_fraction ** depth_balance_exponent)
                 * max(width_penalty, 1.0)
             )
 

From 2378d3b925cd3be135f6aacb0ce1e4958dd4e6cf Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 4 Jun 2026 12:51:07 +0200
Subject: [PATCH 228/232] Clean up PartAM and simplify cleanup phase

---
 .../sabre_router/include/sabre_router.hpp     |  16 +-
 .../src-cpp/sabre_router/sabre_router.cpp     |  27 +-
 squander/synthesis/PartAM.py                  | 350 ++++--------------
 squander/synthesis/PartAM_utils.py            |  15 +-
 4 files changed, 104 insertions(+), 304 deletions(-)

diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp
index a77eee1c6..0e037a9f9 100644
--- a/squander/src-cpp/sabre_router/include/sabre_router.hpp
+++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp
@@ -9,7 +9,6 @@ You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 
 C++ backend for the SABRE-style partition-aware routing engine.
-Ported from squander/synthesis/PartAM.py and PartAM_utils.py.
 */
 
 #include <cstdint>
@@ -228,8 +227,8 @@ class SabreRouter {
         return D_[phys_u * N_ + phys_v];
     }
 
-    // Heuristic search (port of _heuristic_search_layout_only)
-    // children_graph/parents_graph: swapped for backward passes
+    // Main heuristic search loop.
+    // children_graph/parents_graph are swapped for backward passes.
     std::pair<std::vector<int>, double> heuristic_search(
         const std::vector<int>& F_init,
         std::vector<int> pi,
@@ -241,7 +240,7 @@ class SabreRouter {
         ForwardRouteResult* route_trace = nullptr
     ) const;
 
-    // A* constrained swap search (port of find_constrained_swaps_partial)
+    // A* constrained swap search over the k-dimensional partition state space.
     std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
     find_constrained_swaps(
         const std::vector<int>& pi,
@@ -253,14 +252,14 @@ class SabreRouter {
         const NeighborInfo* neighbor_info = nullptr
     ) const;
 
-    // Lower-bound swap estimate (port of estimate_swap_count)
+    // Lower-bound swap estimate for routing the candidate's partition qubits.
     int estimate_swap_count(
         const CandidateData& cand,
         const std::vector<int>& pi,
         bool reverse
     ) const;
 
-    // BFS lookahead (port of generate_extended_set)
+    // BFS lookahead: multi-qubit partitions near the front layer.
     std::vector<std::pair<int,int>> generate_extended_set(
         const std::vector<int>& F,
         const std::vector<uint8_t>& resolved,
@@ -268,7 +267,7 @@ class SabreRouter {
         const std::vector<std::vector<int>>& parents_graph
     ) const;
 
-    // LightSABRE scoring (port of score_partition_candidate)
+    // LightSABRE relative scoring (arXiv:2409.08368, eq. 1).
     double score_candidate(
         const CandidateData& cand,
         const std::vector<int>& F_snapshot,
@@ -283,7 +282,8 @@ class SabreRouter {
         const NeighborInfo* cached_neighbor_info = nullptr
     ) const;
 
-    // Route and update layout for a candidate (port of transform_pi)
+    // Route a candidate's partition qubits to their input positions and
+    // update pi for the exit positions.
     std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
     transform_pi(
         const CandidateData& cand,
diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp
index 43f352e8f..8f1770ae5 100644
--- a/squander/src-cpp/sabre_router/sabre_router.cpp
+++ b/squander/src-cpp/sabre_router/sabre_router.cpp
@@ -151,12 +151,6 @@ SabreRouter::SabreRouter(
     }
 }
 
-// ---------------------------------------------------------------------------
-// run_trial (stub for Phase A)
-// ---------------------------------------------------------------------------
-
-// run_trial implemented below (after all private methods)
-
 // ---------------------------------------------------------------------------
 // Helper: random permutation
 // ---------------------------------------------------------------------------
@@ -506,7 +500,6 @@ int SabreRouter::estimate_swap_count(
 
 // ---------------------------------------------------------------------------
 // find_constrained_swaps (A* over k-dimensional state space)
-// Port of find_constrained_swaps_partial from PartAM_utils.py
 // ---------------------------------------------------------------------------
 
 std::pair<std::vector<std::pair<int,int>>, std::vector<int>>
@@ -970,7 +963,8 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
         if (static_cast<int>(E.size()) >= config_.max_E_size) break;
 
         std::deque<BFSNode> queue;
-        // EXACT Python logic: No pre-checks before pushing!
+        // Push without pre-checking; eligibility is tested when popped so a
+        // single-qubit partition can act as a transparent transit node.
         for (int child : children_graph[front_idx]) {
             queue.push_back({child, 1});
         }
@@ -992,7 +986,8 @@ std::vector<std::pair<int,int>> SabreRouter::generate_extended_set(
             if (!parents_ok) continue;
 
             if (layout_partitions_[part].is_single) {
-                // EXACT Python logic: blindly push grandchildren!
+                // Single-qubit partitions act as transparent transit nodes:
+                // forward their grandchildren at the same depth.
                 for (int child : children_graph[part]) {
                     queue.push_back({child, depth});
                 }
@@ -1603,7 +1598,6 @@ size_t SabreRouter::boundary_beam_select_index(
     return std::min_element(states.begin(), states.end(), sort_states)->first_idx;
 }
 
-// ---------------------------------------------------------------------------
 // ---------------------------------------------------------------------------
 // heuristic_search (main loop)
 // ---------------------------------------------------------------------------
@@ -1815,7 +1809,7 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
                 for (int par : pg[child]) {
                     if (!resolved[par]) { parents_ok = false; break; }
                 }
-                
+
                 if (parents_ok) {
                     if (layout_partitions_[child].is_single) {
                         resolved[child] = 1;
@@ -1830,11 +1824,11 @@ std::pair<std::vector<int>, double> SabreRouter::heuristic_search(
                         }
                         std::vector<int> stack;
                         for (int gc : cg[child]) stack.push_back(gc);
-                        
+
                         while (!stack.empty()) {
                             int gc = stack.back();
                             stack.pop_back();
-                            
+
                             if (!resolved[gc] && !in_F[gc]) {
                                 bool gc_parents_ok = true;
                                 for (int p_gc : pg[gc]) {
@@ -1927,11 +1921,12 @@ TrialResult SabreRouter::run_trial(
         }
     }
 
-    // Final evaluation pass (deterministic, no RNG)
-    auto eval_result = heuristic_search(F_fwd, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_); // Evaluates cost using a copy under the hood
+    // Deterministic evaluation pass on a copy of pi to score the trial.
+    auto eval_result = heuristic_search(F_fwd, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_);
     double cost = eval_result.second;
 
-    return TrialResult{std::move(pi), cost}; // Return the pi from AFTER the backward pass, BEFORE the eval pass
+    // Return the layout from AFTER the backward pass, BEFORE the eval pass.
+    return TrialResult{std::move(pi), cost};
 }
 
 } // namespace squander::routing
diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py
index 91f17332a..24156b791 100644
--- a/squander/synthesis/PartAM.py
+++ b/squander/synthesis/PartAM.py
@@ -172,9 +172,8 @@ def __init__(self, config):
         self.config.setdefault('decay_delta', 0.001)  # Qiskit LightSABRE DECAY_RATE
         self.config.setdefault('swap_burst_budget', 5)  # Qiskit LightSABRE DECAY_RESET_INTERVAL
         self.config.setdefault('path_tiebreak_weight', 0.2)
-        # The neighbor heuristic is normalized to [0, 1] and added to A*'s f-value.
-        # g-deltas are integer and h-deltas are half-integer, so preserving
-        # swap-count optimality requires weight < 0.5.
+        # Neighbor tie-breaker is added to A* f-values normalised to [0, 1];
+        # must stay < 0.5 to preserve swap-count optimality.
         if self.config['path_tiebreak_weight'] >= 0.5:
             logging.warning(
                 "path_tiebreak_weight=%.3f ≥ 0.5 may override SWAP-count "
@@ -190,56 +189,29 @@ def __init__(self, config):
         self.config.setdefault('layout_boundary_beam_depth', None)
         self.config.setdefault('routing_trace_path', None)
         self.config['partition_weight_model'] = 'window_turnover'
-        # Capacity-normalised density: block_density = k_2q / capacity[w].
-        # With CNOT-budget capacities {2:3, 3:14, 4:61}, density saturates at
-        # 1.0 for every width, so density_weight 1.0 gives a max bonus that
-        # exactly cancels the width-2 penalty (1.0). Wider widths need extra
-        # signal (triangle / discounted boundary / negative span_cost) to
-        # justify selection.
+        # ILP partition-selection weights. See _parts_to_window_turnover_weights
+        # for the full cost formula; defaults are calibrated against the
+        # synthesis-capacity / width-penalty pair below so saturation rewards
+        # match across widths.
         self.config.setdefault('partition_density_weight', 1.0)
         self.config.setdefault('partition_boundary_weight', 0.9)
         self.config.setdefault('partition_depth_balance_weight', 0.25)
-        # Shape of the depth-balance penalty: depth_balance_weight *
-        # depth_fraction**exponent * max(width_penalty, 1). 2.0 (current
-        # squared form) over-penalises tall partitions in deep circuits
-        # (qft, multiplier) so they get fragmented into shallow wide
-        # slices, creating extra boundaries and routing SWAPs. Lower to
-        # 1.0 for a linear penalty. Sweepable.
         self.config.setdefault('partition_depth_balance_exponent', 2.0)
-        # Triangle is a small tie-breaker on top of absorption-credit density
-        # and discounted boundary. A truly triangular 3q block (Toffoli, etc.)
-        # gets a modest extra reward; chain-shaped 3q blocks don't.
         self.config.setdefault('partition_triangle_weight', 1.5)
         self.config.setdefault('partition_triangle_threshold', 0.6)
         self.config.setdefault('partition_triangle_window_radius', 8)
         self.config.setdefault('partition_synthesis_cost_weight', 1.0)
-        # Calibrated so a width-3 block whose three qubits are mutually
-        # non-adjacent on a sparse grid (sum_extra ≈ 3) takes ~+1.5 cost,
-        # enough to overpower a saturated triangle bonus (≤ 2.5) and pull
-        # the ILP back to width-2 unless the block is topology-aligned.
         self.config.setdefault('partition_routing_span_weight', 2.0)
-        # Averaged turnover with DAG successor partitions: penalises blocks
-        # whose support has little qubit overlap with the candidate parts
-        # immediately downstream. Captures inter-block routing churn that
-        # routing_span (intra-block spread) misses. Linear in ILP vars
-        # since each candidate gets a precomputed scalar.
         self.config.setdefault('partition_turnover_weight', 0.5)
-        # Anti-chain penalty: a width>=3 block whose qubits are wired as a
-        # chain (low triangle density) synthesises to more body CNOTs than
-        # the equivalent 2q blocks and adds a routing boundary with no
-        # entanglement payoff. triangle_bonus only ever rewards, so without
-        # this term such blocks are picked purely on boundary absorption.
-        # 0.0 disables it (recovers prior behaviour). Sweepable.
+        # Penalises chain-shaped width>=3 blocks; 0.0 disables it.
         self.config.setdefault('partition_chain_penalty_weight', 2.0)
         self.config.setdefault('partition_min_cost', 0.05)
         self.config.setdefault(
             'partition_width_penalties',
             {1: 0.25, 2: 1.0, 3: 4.0, 4: 16.0},
         )
-        # CNOT lower-bound synthesis budgets (Vidal–Dawson for w=2,
-        # Shende–Markov–Bullock for w=3, QSD for w=4). Sets block_density
-        # to "fraction of synthesis budget used" with consistent semantics
-        # across widths — saturation reward matches across all w.
+        # CNOT lower-bound synthesis budgets (Vidal–Dawson w=2,
+        # Shende–Markov–Bullock w=3, QSD w=4).
         self.config.setdefault(
             'partition_synthesis_capacity',
             {1: 1, 2: 3, 3: 14, 4: 61},
@@ -257,7 +229,7 @@ def __init__(self, config):
                 f"{allowed_partition_weight_models}, got "
                 f"{self.config['partition_weight_model']}."
             )
-        
+
         # Initialize caches for performance optimization
         self._topology_cache = {}  # {frozenset(edges): [topology_candidates]}
         self._swap_cache = {}     # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)}
@@ -273,7 +245,7 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
         Cached version of get_subtopologies_of_type.
         Uses canonical form of mini_topology as cache key.
         """
-        
+
         # Create canonical form key
         target_qubits = set()
         for u, v in mini_topology:
@@ -281,13 +253,13 @@ def _get_subtopologies_of_type_cached(self, mini_topology):
             target_qubits.add(v)
         if not target_qubits:
             return []
-        
+
         # Use canonical form as cache key
         canonical_key = get_canonical_form(target_qubits, mini_topology)
-        
+
         if canonical_key not in self._topology_cache:
             self._topology_cache[canonical_key] = get_subtopologies_of_type(self.topology, mini_topology)
-        
+
         return self._topology_cache[canonical_key]
 
     # ------------------------------------------------------------------------
@@ -1065,7 +1037,6 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
 
         # ---- Phase 1: Partition enumeration ----
         allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"])
-        qbit_num_orig_circuit = working_circ.get_Qbit_Num()
         gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())}
 
         single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]}
@@ -1122,19 +1093,19 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             selected_multi = sum(
                 count for size, count in size_counts.items() if size > 1
             )
-            print(
-                "Selected partitions: "
-                f"2-qubit={size_counts.get(2, 0)}, "
-                f"3-qubit={size_counts.get(3, 0)}, "
-                f"total_multi={selected_multi}"
+            logging.info(
+                "Selected partitions: 2-qubit=%d, 3-qubit=%d, total_multi=%d",
+                size_counts.get(2, 0),
+                size_counts.get(3, 0),
+                selected_multi,
             )
 
         # ---- Phase 4: Assemble partitioned circuit from selected partitions only ----
-        partitioned_circuit = Circuit(qbit_num_orig_circuit)
+        partitioned_circuit = Circuit(qbit_num)
         params = []
 
         for gates in selected_parts_gates[:n_multi]:
-            c = Circuit(qbit_num_orig_circuit)
+            c = Circuit(qbit_num)
             for gate_idx in _get_topo_order({x: go[x] & gates for x in gates},
                                             {x: rgo[x] & gates for x in gates},
                                             gate_to_qubit):
@@ -1144,7 +1115,7 @@ def SynthesizeWideCircuit(self, circ, orig_parameters):
             partitioned_circuit.add_Circuit(c)
 
         for chain in standalone_chains:
-            c = Circuit(qbit_num_orig_circuit)
+            c = Circuit(qbit_num)
             for gate_idx in chain:
                 c.add_Gate(gate_dict[gate_idx])
                 start = gate_dict[gate_idx].get_Parameter_Start_Index()
@@ -1715,7 +1686,7 @@ def route_rank_input(item):
             for cost, pi in heuristic_ranked[actual_rank_top_k:]
         )
         return ranked
-        
+
     @staticmethod
     def _snapshot_single_qubit_circuits(optimized_partitions):
         return {
@@ -2228,6 +2199,42 @@ def Partition_Aware_Mapping(
             )
             routing_elapsed_before_cleanup = time.time() - routing_start
 
+            # Pick the best trial (already ranked by actual routing).
+            _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
+
+            if route_steps is not None:
+                partition_order = self._partition_order_from_cpp_steps(
+                    route_steps,
+                    optimized_partitions,
+                    candidate_cache,
+                    N,
+                    pi_initial=trace_pi_init,
+                )
+                pi = np.asarray(best_pi, dtype=np.int64)
+                pi_initial = np.asarray(trace_pi_init, dtype=np.int64)
+                final_route_steps = route_steps
+                final_route_pi_initial = pi_initial.copy()
+            else:
+                F = self.get_initial_layer(IDAG, N, optimized_partitions)
+                partition_order, pi, pi_initial = self.Heuristic_Search(
+                    F,
+                    best_pi.copy(),
+                    DAG,
+                    IDAG,
+                    optimized_partitions,
+                    scoring_partitions,
+                    D,
+                    candidate_cache=candidate_cache,
+                )
+
+            trial_circuit, trial_params = self.Construct_circuit_from_HS(
+                partition_order, optimized_partitions, N
+            )
+            routing_swap_cnot, partition_body_cnot = (
+                self._partition_order_cnot_breakdown(partition_order)
+            )
+            pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0)
+
             if do_cleanup:
                 from squander.decomposition.qgd_Wide_Circuit_Optimization import (
                     qgd_Wide_Circuit_Optimization,
@@ -2239,160 +2246,25 @@ def Partition_Aware_Mapping(
                 cleanup_config['test_subcircuits'] = False
                 cleanup_config['test_final_circuit'] = False
                 cleanup_config['global_min'] = True
-                cleanup_config['use_osr'] = 0
-                cleanup_config['use_graph_search'] = 0
-                cleanup_config['part_size_end'] = 3
-                cleanup_config['max_partition_size'] = 3
+                cleanup_config['use_osr'] = 1
+                cleanup_config['use_graph_search'] = 1
+                cleanup_config['max_partition_size'] = 4
 
                 wco = qgd_Wide_Circuit_Optimization(cleanup_config)
 
-                saved_sq_circuits = self._snapshot_single_qubit_circuits(
-                    optimized_partitions
-                )
-
-                cleanup_top_k = self.config.get('cleanup_top_k', 3)
-                top_layouts = trial_results[:cleanup_top_k]
-
-                best_circuit = None
-                best_params = None
-                best_pi_init = None
-                best_pi = None
-                best_cost = float('inf')
-                best_pre_cleanup = None
-                best_routing_swap_cnot = 0
-                best_partition_body_cnot = 0
-                best_route_steps = None
-                best_route_pi_initial = None
-
-                for _, trial_pi, _, trace_pi_init, route_steps in top_layouts:
-                    self._restore_single_qubit_circuits(
-                        optimized_partitions, saved_sq_circuits
-                    )
-                    if route_steps is not None:
-                        partition_order = self._partition_order_from_cpp_steps(
-                            route_steps,
-                            optimized_partitions,
-                            candidate_cache,
-                            N,
-                            pi_initial=trace_pi_init,
-                        )
-                        pi_out = np.asarray(trial_pi, dtype=np.int64)
-                        pi_init = np.asarray(trace_pi_init, dtype=np.int64)
-                    else:
-                        F_trial = self.get_initial_layer(
-                            IDAG, N, optimized_partitions
-                        )
-                        partition_order, pi_out, pi_init = self.Heuristic_Search(
-                            F_trial,
-                            trial_pi.copy(),
-                            DAG,
-                            IDAG,
-                            optimized_partitions,
-                            scoring_partitions,
-                            D,
-                            candidate_cache=candidate_cache,
-                        )
-
-                    trial_circuit, trial_params = self.Construct_circuit_from_HS(
-                        partition_order, optimized_partitions, N
-                    )
-                    pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get(
-                        'CNOT', 0
-                    )
-                    trial_routing_cnot, trial_partition_cnot = (
-                        self._partition_order_cnot_breakdown(partition_order)
-                    )
-
-                    cleanup_t0 = time.time()
-                    cleaned_circuit, cleaned_params = wco.OptimizeWideCircuit(
-                        trial_circuit.get_Flat_Circuit(),
-                        trial_params,
-                    )
-                    cleaned_cost = cleaned_circuit.get_Gate_Nums().get(
-                        'CNOT', 0
-                    )
-                    cleanup_total += time.time() - cleanup_t0
-
-                    if cleaned_cost < best_cost:
-                        best_cost = cleaned_cost
-                        best_pre_cleanup = pre_cleanup_cnots
-                        best_circuit = cleaned_circuit
-                        best_params = cleaned_params
-                        best_pi_init = pi_init
-                        best_pi = pi_out
-                        best_routing_swap_cnot = trial_routing_cnot
-                        best_partition_body_cnot = trial_partition_cnot
-                        best_route_steps = route_steps
-                        best_route_pi_initial = (
-                            pi_init.copy()
-                            if hasattr(pi_init, "copy")
-                            else list(pi_init)
-                        )
-
-                final_cleanup_config = dict(cleanup_config)
-                final_cleanup_config['use_osr'] = 1
-                final_cleanup_config['use_graph_search'] = 1
-                final_cleanup_config['part_size_end'] = 4
-
-                wco = qgd_Wide_Circuit_Optimization(final_cleanup_config)
-
                 cleanup_t0 = time.time()
                 final_circuit, final_parameters = wco.OptimizeWideCircuit(
-                    best_circuit.get_Flat_Circuit(),
-                    best_params,
+                    trial_circuit.get_Flat_Circuit(),
+                    trial_params,
                 )
                 cleanup_total += time.time() - cleanup_t0
-                pi_initial = best_pi_init
-                pi = best_pi
-                routing_swap_cnot = best_routing_swap_cnot
-                partition_body_cnot = best_partition_body_cnot
-                final_route_steps = best_route_steps
-                final_route_pi_initial = best_route_pi_initial
-
             else:
-                _, best_pi, _, trace_pi_init, route_steps = trial_results[0]
-
-                if route_steps is not None:
-                    saved_sq_circuits = self._snapshot_single_qubit_circuits(
-                        optimized_partitions
-                    )
-                    self._restore_single_qubit_circuits(
-                        optimized_partitions, saved_sq_circuits
-                    )
-                    partition_order = self._partition_order_from_cpp_steps(
-                        route_steps,
-                        optimized_partitions,
-                        candidate_cache,
-                        N,
-                        pi_initial=trace_pi_init,
-                    )
-                    pi = np.asarray(best_pi, dtype=np.int64)
-                    pi_initial = np.asarray(trace_pi_init, dtype=np.int64)
-                    final_route_steps = route_steps
-                    final_route_pi_initial = pi_initial.copy()
-                else:
-                    F = self.get_initial_layer(IDAG, N, optimized_partitions)
-                    partition_order, pi, pi_initial = self.Heuristic_Search(
-                        F,
-                        best_pi.copy(),
-                        DAG,
-                        IDAG,
-                        optimized_partitions,
-                        scoring_partitions,
-                        D,
-                        candidate_cache=candidate_cache,
-                    )
-                final_circuit, final_parameters = self.Construct_circuit_from_HS(
-                    partition_order, optimized_partitions, N
-                )
-                routing_swap_cnot, partition_body_cnot = (
-                    self._partition_order_cnot_breakdown(partition_order)
-                )
+                final_circuit, final_parameters = trial_circuit, trial_params
 
         if do_cleanup and n_iterations > 0:
             self._routing_time = routing_elapsed_before_cleanup
             self._cleanup_time = cleanup_total
-            self._cnot_pre_cleanup = best_pre_cleanup
+            self._cnot_pre_cleanup = pre_cleanup_cnots
         else:
             self._routing_time = time.time() - routing_start
             self._cleanup_time = 0.0
@@ -3524,7 +3396,7 @@ def _heuristic_search_layout_only(
                         else:
                             F.append(child)
 
-        return pi, total_cost    
+        return pi, total_cost
     # ------------------------------------------------------------------------
     # Circuit Construction
     # ------------------------------------------------------------------------
@@ -3534,7 +3406,7 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
         final_parameters = []
         perm_count = 0
         partition_count = 0
-        
+
         for part in partition_order:
             if isinstance(part, Circuit):
                 final_circuit.add_Circuit(part)
@@ -3548,15 +3420,15 @@ def Construct_circuit_from_HS(self, partition_order, optimized_partitions,N):
                 final_circuit.add_Circuit(part_circ)
                 final_parameters.append(part_parameters)
                 partition_count += 1
-        
+
         if final_parameters:
             final_parameters = np.concatenate([np.atleast_1d(p).ravel() for p in final_parameters], axis=0)
         else:
             final_parameters = np.array([])
         if not check_circuit_compatibility(final_circuit,self.topology):
-            print("ERROR: Final circuit is not compatible with device topology!")
+            logging.error("Final circuit is not compatible with device topology")
         return final_circuit, final_parameters
-    
+
     # ------------------------------------------------------------------------
     # Scoring
     # ------------------------------------------------------------------------
@@ -3803,7 +3675,7 @@ def obtain_partition_candidates(
     # ------------------------------------------------------------------------
     # Graph Construction
     # ------------------------------------------------------------------------
-        
+
     def get_initial_layer(self, IDAG, N, optimized_partitions):
         del N, optimized_partitions
         return [idx for idx in range(len(IDAG)) if not IDAG[idx]]
@@ -3812,7 +3684,7 @@ def get_initial_layer(self, IDAG, N, optimized_partitions):
     def get_final_layer(self, DAG, N, optimized_partitions):
         del N, optimized_partitions
         return [idx for idx in range(len(DAG) - 1, -1, -1) if not DAG[idx]]
-                
+
     def construct_DAG_and_IDAG(self, optimized_partitions):
         DAG = []
         IDAG = []
@@ -3844,7 +3716,7 @@ def construct_DAG_and_IDAG(self, optimized_partitions):
             DAG.append(children)
             IDAG.append(parents)
         return DAG, IDAG
-    
+
     # ------------------------------------------------------------------------
     # Distance & Layout
     # ------------------------------------------------------------------------
@@ -3852,19 +3724,19 @@ def construct_DAG_and_IDAG(self, optimized_partitions):
     def compute_distances_bfs(self, N):
         """BFS distance computation - faster than Floyd-Warshall."""
         D = np.ones((N, N)) * np.inf
-        
+
         # Build adjacency list
         adj = defaultdict(list)
         for u, v in self.config['topology']:
             adj[u].append(v)
             adj[v].append(u)
-        
+
         # BFS from each vertex
         for start in range(N):
             D[start][start] = 0
             queue = deque([(start, 0)])
             visited = {start}
-            
+
             while queue:
                 node, dist = queue.popleft()
                 for neighbor in adj[node]:
@@ -3872,7 +3744,7 @@ def compute_distances_bfs(self, N):
                         visited.add(neighbor)
                         D[start][neighbor] = dist + 1
                         queue.append((neighbor, dist + 1))
-        
+
         # Store adjacency list for reuse by A* routing
         self._adj = [list(adj[i]) for i in range(N)]
 
@@ -3893,9 +3765,6 @@ def _compute_seeded_layout(self, optimized_partitions, D, N, circ):
            retry VF2 — handles "almost perfect" embeddings.
         3. Fallback: greedy weighted-distance placement from partition weights.
         """
-        from collections import defaultdict
-        from squander.synthesis.PartAM_utils import PartitionSynthesisResult, SingleQubitPartitionResult
-
         if not self.topology:
             return np.arange(N)
 
@@ -3983,9 +3852,6 @@ def _try_vf2_layout(self, G_int, G_hw, N):
 
     def _greedy_seeded_layout(self, optimized_partitions, D, N):
         """Greedy weighted-distance placement (fallback when VF2 fails)."""
-        from collections import defaultdict
-        from squander.synthesis.PartAM_utils import PartitionSynthesisResult, SingleQubitPartitionResult
-
         # Build interaction weights from partitions
         interaction_weight = defaultdict(float)
         for partition in optimized_partitions:
@@ -4067,63 +3933,3 @@ def _score(q):
                 placed_physical.add(best_physical)
 
         return pi
-
-
-    def generate_DAG_levels(self, circuit):
-        """
-        Generate DAG levels - groups gates by their topological level.
-        
-        Args:
-            circuit: The quantum circuit to analyze
-            
-        Returns:
-            List of lists, where each inner list contains gate indices at the same DAG level.
-            Level 0 contains gates with no parents, level 1 contains gates whose parents
-            are all at level 0, etc.
-        """ 
-        gates = circuit.get_Gates()
-        num_gates = len(gates)
-        
-        # Build parent count for each gate
-        parent_counts = [0] * num_gates
-        children_map = [[] for _ in range(num_gates)]
-        
-        for gate_idx in range(num_gates):
-            gate = gates[gate_idx]
-            parents = circuit.get_Parents(gate)
-            parent_counts[gate_idx] = len(parents)
-            
-            # Build children map
-            children = circuit.get_Children(gate)
-            for child_idx in children:
-                children_map[gate_idx].append(child_idx)
-        
-        # Initialize level 0 with gates that have no parents
-        levels = []
-        current_level = []
-        processed = [False] * num_gates
-        
-        # Find gates with no parents (level 0)
-        for gate_idx in range(num_gates):
-            if parent_counts[gate_idx] == 0:
-                current_level.append(gate_idx)
-                processed[gate_idx] = True
-        
-        # Process levels using BFS
-        while current_level:
-            levels.append(current_level)
-            next_level = []
-            
-            # Process all gates in current level
-            for gate_idx in current_level:
-                # Decrement parent counts for children
-                for child_idx in children_map[gate_idx]:
-                    parent_counts[child_idx] -= 1
-                    # If all parents are processed, add to next level
-                    if parent_counts[child_idx] == 0 and not processed[child_idx]:
-                        next_level.append(child_idx)
-                        processed[child_idx] = True
-            
-            current_level = next_level
-        
-        return levels
diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py
index 727d19061..69c2c9732 100644
--- a/squander/synthesis/PartAM_utils.py
+++ b/squander/synthesis/PartAM_utils.py
@@ -182,10 +182,9 @@ def neighbor_heuristic(n_pos):
                 new_positions = tuple(new_positions)
 
                 new_g = g + 1
-                # Bug B fix: update neighbor positions for BOTH sides of the swap.
-                # A neighbor qubit at nb gets displaced to p, AND a neighbor qubit
-                # at p (if it's also tracked, e.g. overlaps with a partition qubit)
-                # moves to nb.
+                # When a partition qubit swaps into nb, a tracked neighbor at nb
+                # is displaced to p AND a tracked neighbor at p (if it overlaps
+                # with a partition qubit) moves to nb. Update both sides.
                 if use_neighbor:
                     new_n_pos = list(n_pos)
                     if nb in n_phys_to_idx:
@@ -466,7 +465,7 @@ def add_result(self, permutations_pair, synthesised_circuit, synthesised_paramet
         self.synthesised_parameters[topology_idx].append(synthesised_parameters)
         self.cnot_counts[topology_idx].append(flat_circuit.get_Gate_Nums().get('CNOT', 0))
         self.circuit_structures[topology_idx].append(self.extract_circuit_structure(flat_circuit))
-    
+
     def extract_circuit_structure(self, circuit):
         circuit_structure = []
         for gate in circuit.get_Gates():
@@ -535,7 +534,7 @@ def get_topology_candidates(self, topology_idx):
 
 
 class PartitionCandidate:
-    
+
     def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits, cnot_count=0):
         #Which partition does this belong to
         self.partition_idx = partition_idx
@@ -545,7 +544,7 @@ def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structu
         self.permutation_idx = permutation_idx
         # the structure of the circuit in Q*
         self.circuit_structure = circuit_structure
-        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc 
+        # P_i in q*->Q* permutation pattern: [q*1 q*0 q*2] where q*1 goes to Q* qubit 0 and etc
         self.P_i = P_i
         # P_o in Q*->q* permutation pattern [Q*1 Q*0 Q*2] This means that the current output of Q*1 is equal to q*0
         self.P_o = P_o
@@ -608,7 +607,7 @@ def transform_pi(self, pi, D, swap_cache=None, reverse=False, adj=None, neighbor
                 k = qbit_map_inverse[q_star]
                 pi_output[k] = self.node_mapping[P_exit[q_star]]
         return swaps, pi_output
-    
+
     def estimate_swap_count(self, pi, D, reverse=False) -> int:
         """O(n) lower-bound on the number of SWAPs needed to route this
         partition's virtual qubits to their target physical positions.

From 37b863e6ae777700991df72ad76e7764f7edf464 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 4 Jun 2026 13:33:52 +0200
Subject: [PATCH 229/232] Rework example

---
 examples/decomposition/PartAM_example.py | 184 +++++++++++++----------
 1 file changed, 107 insertions(+), 77 deletions(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index b3e1482a0..cd676053f 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -18,23 +18,33 @@
 @author: Peter Rakyta, Ph.D.
 """
 ## \file PartAM_example.py
-## \brief Example demonstrating Partition Aware Mapping
+## \brief Simple example python code demonstrating Partition Aware Mapping
+
+import time
+import numpy as np
 
 from squander import Partition_Aware_Mapping
 from squander import utils
 from squander import Circuit
-import numpy as np
-import time
+from squander.decomposition.qgd_Wide_Circuit_Optimization import (
+    qgd_Wide_Circuit_Optimization,
+)
+
+
+def make_linear_topology(n_qubits):
+    return [(i, i + 1) for i in range(n_qubits - 1)]
 
 
 def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm):
-    """Validate decomposition by applying both circuits to a random state."""
+    """Apply both circuits to a random state and return ``1 - |<psi|phi>|``."""
     num_qubits = circ.get_Qbit_Num()
     matrix_size = 1 << num_qubits
-    initial_state_real = np.random.uniform(-1.0, 1.0, (matrix_size,))
-    initial_state_imag = np.random.uniform(-1.0, 1.0, (matrix_size,))
-    initial_state = initial_state_real + initial_state_imag * 1j
-    initial_state = initial_state / np.linalg.norm(initial_state)
+    rng = np.random.RandomState(0)
+    initial_state = (
+        rng.uniform(-1, 1, (matrix_size,))
+        + 1j * rng.uniform(-1, 1, (matrix_size,))
+    )
+    initial_state /= np.linalg.norm(initial_state)
 
     original_state = initial_state.copy()
     circ_orig.apply_to(parameters_orig, original_state)
@@ -43,90 +53,110 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
     output_perm_T = [0] * num_qubits
     for i, j in enumerate(output_perm):
         output_perm_T[j] = i
-    input_perm_list = [int(x) for x in input_perm]
-    circ_Final.add_Permutation(input_perm_list)
+    circ_Final.add_Permutation([int(x) for x in input_perm])
     circ_Final.add_Circuit(circ)
     circ_Final.add_Permutation(output_perm_T)
 
-    PartAM_state = initial_state.copy()
-    circ_Final.apply_to(params, PartAM_state)
-    state_error = 1 - abs(np.vdot(PartAM_state, original_state))
-    return state_error, circ_Final
-
-
-def run_and_report(label, config, circ_orig, parameters_orig):
-    """Run PartAM with the given config and print results."""
-    print(f"\n{'='*70}")
-    print(label)
-    print(f"{'='*70}")
-
-    start_time = time.time()
-    pam = Partition_Aware_Mapping(config)
-    circ, params, input_perm, output_perm = pam.Partition_Aware_Mapping(circ_orig, parameters_orig)
-    elapsed = time.time() - start_time
-
-    error, circ_final = validate_result(
-        circ_orig, parameters_orig, circ, params, input_perm, output_perm
-    )
-    print(f"Decomposition error: {error:.10f}")
-    print(f"Gate counts: {circ_final.get_Gate_Nums()}")
-    print(f"Time: {elapsed:.2f}s")
-    return error, elapsed
+    state = initial_state.copy()
+    circ_Final.apply_to(params, state)
+    return 1 - abs(np.vdot(state, original_state))
 
 
 if __name__ == '__main__':
 
     filename = "bv_n14.qasm"
+
+    # load the circuit from a file
     circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename)
-    topology = [
-        (0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6),
-        (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (12, 13),
-    ]
+    N = circ_orig.get_Qbit_Num()
+    topology = make_linear_topology(N)
 
-    results = {}
+    initial_cnot = circ_orig.get_Gate_Nums().get('CNOT', 0)
+    print(f"Qubits: {N}, initial CNOTs: {initial_cnot}")
 
-    # ================================================================
-    # Default: single forward pass (sabre_iterations=0)
-    # ================================================================
-    results['default'] = run_and_report("Default (single forward pass)", {
-        'strategy': "TreeSearch",
-        'max_partition_size': 3,
-        'progressbar': True,
-        'topology': topology,
-        'sabre_iterations': 0,
-    }, circ_orig, parameters_orig)
+    start_time = time.time()
 
-    # ================================================================
-    # SABRE-style layout refinement (sabre_iterations=3)
-    # ================================================================
-    results['sabre'] = run_and_report("SABRE iterations=3", {
-        'strategy': "TreeSearch",
+    # one-shot WCO pass before PartAM (topology=None, max_partition_size=3,
+    # part_size_end=4) to fuse trivially-mergeable blocks
+    pre_partam_cleanup_config = {
+        'strategy': 'TreeSearch',
+        'pre-opt-strategy': 'TreeSearch',
+        'partition_strategy': 'ilp',
+        'test_subcircuits': False,
+        'test_final_circuit': False,
         'max_partition_size': 3,
-        'progressbar': True,
-        'topology': topology,
-        'sabre_iterations': 3,
-    }, circ_orig, parameters_orig)
+        'topology': None,
+        'verbosity': 0,
+        'tolerance': 1e-8,
+        'parallel': 0,
+        'part_size_end': 4,
+    }
+    wco = qgd_Wide_Circuit_Optimization(pre_partam_cleanup_config)
+    pre_partam_circ, pre_partam_params = wco.OptimizeWideCircuit(
+        circ_orig.get_Flat_Circuit(),
+        parameters_orig,
+    )
+    pre_partam_cleanup_cnot = pre_partam_circ.get_Gate_Nums().get('CNOT', 0)
+    print(f"PartAM input CNOTs after pre-cleanup: {pre_partam_cleanup_cnot}")
 
-    # ================================================================
-    # Multiple layout trials with SABRE iterations
-    # ================================================================
-    results['trials'] = run_and_report("SABRE iterations=3, layout trials=5", {
+    # PartAM config
+    config = {
         'strategy': "TreeSearch",
+        'test_subcircuits': False,
+        'test_final_circuit': False,
         'max_partition_size': 3,
-        'progressbar': True,
+        'progressbar': False,
         'topology': topology,
-        'sabre_iterations': 3,
-        'n_layout_trials': 5,
+        'verbosity': 0,
+        'cleanup': True,
+        'sabre_iterations': 20,
+        'n_layout_trials': 128,
         'random_seed': 42,
-    }, circ_orig, parameters_orig)
-
-    # ================================================================
-    # Summary
-    # ================================================================
-    print(f"\n{'='*70}")
-    print("Summary")
-    print(f"{'='*70}")
-    print(f"{'Mode':<40} {'Error':<20} {'Time':<10}")
-    for label, (error, elapsed) in results.items():
-        print(f"{label:<40} {error:<20.10f} {elapsed:<10.2f}s")
-    print(f"{'='*70}\n")
+        # Cheap candidate prefilter before full A* scoring.
+        'prefilter_top_k': 400,
+        'prefilter_min_per_partition': 2,
+        'prefilter_min_3q': 12,
+        # Rank every layout trial by actual constructed routing, not only by
+        # the heuristic trial cost.  QFT is sensitive to this cap.
+        'actual_routing_rank_top_k': None,
+        # Boundary-state beam routing runs in the C++ SABRE router.
+        'use_cpp_router': True,
+        'layout_boundary_beam_width': 4,
+        'layout_boundary_beam_depth': 3,
+        'boundary_beam_width': 4,
+        'boundary_beam_depth': 3,
+        'cnot_cost': 0.5 / 3.0,
+        'cleanup_top_k': 3,
+        'parallel_layout_trials': True,
+        'layout_trial_workers': 0,
+        'max_E_size': 40,
+        'max_lookahead': 6,
+        'E_weight': 0.3,
+        'E_alpha': 1.0,  # LightSABRE-style uniform lookahead (no per-depth decay)
+        'decay_delta': 0.001,
+        'swap_burst_budget': 0,
+        'path_tiebreak_weight': 0.2,
+        'three_qubit_exit_weight': 1.5,
+        'partition_weight_model': 'window_turnover',
+        'pack_credit_weight': 1.0,
+        'partition_chain_penalty_weight': 3.0,
+    }
+
+    # instantiate the object for Partition Aware Mapping
+    pam = Partition_Aware_Mapping(config)
+
+    # run Partition Aware Mapping
+    circ, params, input_perm, output_perm = pam.Partition_Aware_Mapping(
+        pre_partam_circ.get_Flat_Circuit(), pre_partam_params
+    )
+
+    elapsed = time.time() - start_time
+
+    error = validate_result(
+        circ_orig, parameters_orig, circ, params, input_perm, output_perm
+    )
+
+    print(f"CNOTs pre-cleanup: {pam._cnot_pre_cleanup}")
+    print(f"CNOTs post-cleanup: {circ.get_Gate_Nums().get('CNOT', 0)}")
+    print(f"Decomposition error: {error:.10f}")
+    print("--- %s seconds elapsed during optimization ---" % elapsed)

From ad57c0a74d11e207eef055586975d827cec27103 Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 4 Jun 2026 13:53:39 +0200
Subject: [PATCH 230/232] add missing pip install from windows actions

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 67b7e4b97..c4bb75133 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,7 +64,7 @@ jobs:
     - name: Install Windows System Dependencies
       shell: pwsh
       run: |
-        conda install -c conda-forge cmake tbb-devel openblas lapack -y
+        conda install -c conda-forge pip cmake tbb-devel openblas lapack -y
 
         # setup-miniconda exposes the activated env via CONDA_PREFIX; fall back to conda info if needed.
         $envPath = $env:CONDA_PREFIX

From 72e00508dc2989e31510bfa401a95e9aae2ca29c Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 4 Jun 2026 13:58:57 +0200
Subject: [PATCH 231/232] update chain penalty

---
 examples/decomposition/PartAM_example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py
index cd676053f..d64fe596b 100644
--- a/examples/decomposition/PartAM_example.py
+++ b/examples/decomposition/PartAM_example.py
@@ -139,7 +139,7 @@ def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output
         'three_qubit_exit_weight': 1.5,
         'partition_weight_model': 'window_turnover',
         'pack_credit_weight': 1.0,
-        'partition_chain_penalty_weight': 3.0,
+        'partition_chain_penalty_weight': 2.5,
     }
 
     # instantiate the object for Partition Aware Mapping

From 35815244189056ccaf502594c040ac3db31031ed Mon Sep 17 00:00:00 2001
From: JNadori <degututaj@gmail.com>
Date: Thu, 4 Jun 2026 14:11:40 +0200
Subject: [PATCH 232/232] Fix code issues to work with MSVC as well

---
 squander/synthesis/bindings.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/squander/synthesis/bindings.cpp b/squander/synthesis/bindings.cpp
index 76d5a3452..6a930d095 100644
--- a/squander/synthesis/bindings.cpp
+++ b/squander/synthesis/bindings.cpp
@@ -8,7 +8,11 @@ pybind11 bindings for the SABRE routing engine.
 #include <pybind11/numpy.h>
 #include <pybind11/stl.h>
 
+#include <algorithm>
+#include <stdexcept>
+#include <unordered_map>
 #include <utility>
+#include <vector>
 
 #include "sabre_router.hpp"
 
@@ -70,7 +74,7 @@ static std::vector<int> extract_int_array(py::handle obj) {
     }
     auto acc = arr.unchecked<1>();
     result.resize(acc.shape(0));
-    for (ssize_t i = 0; i < acc.shape(0); i++) {
+    for (py::ssize_t i = 0; i < acc.shape(0); i++) {
         result[i] = acc(i);
     }
     return result;