diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c4bb75133..09eea5193 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: - name: Install Python Dependencies run: | python -m pip install --upgrade pip - python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest + python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm python -m pip install -e . -q - name: Build @@ -113,7 +113,7 @@ jobs: - name: Install Python Dependencies run: | python -m pip install --upgrade pip - python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest + python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm python -m pip install -e . -q - name: Build @@ -159,7 +159,7 @@ jobs: - name: Install Python Dependencies run: | python -m pip install --upgrade pip - python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest + python -m pip install -q numpy scipy scikit-build pybind11 qiskit qiskit-aer pytest tqdm python -m pip install -e . 
-q - name: Build diff --git a/CMakeLists.txt b/CMakeLists.txt index bde0b9107..7ae762e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -806,6 +806,7 @@ list(APPEND qgd_files ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/CR.cpp ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/Adaptive.cpp ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/R.cpp + ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/Permutation.cpp ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/kernels/apply_kernel_to_input.cpp ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/kernels/apply_kernel_to_state_vector_input.cpp ${PROJECT_SOURCE_DIR}/squander/src-cpp/gates/kernels/apply_large_kernel_to_input.cpp @@ -1040,6 +1041,12 @@ add_subdirectory (squander/VQA) add_subdirectory (squander/src-cpp/density_matrix) +# =================================================================== +# SABRE Router Module +# =================================================================== + +add_subdirectory (squander/src-cpp/sabre_router) + if(DEFINED ENV{QGD_CTEST}) # adding CMAKE files for executables add_subdirectory (test_standalone) diff --git a/conda_env_example.yaml b/conda_env_example.yaml index 12bde7316..b09ebd8f5 100644 --- a/conda_env_example.yaml +++ b/conda_env_example.yaml @@ -16,6 +16,7 @@ dependencies: - numpy - scipy - tbb-devel + - pybind11 - pip: - gurobipy - matplotlib diff --git a/examples/decomposition/PartAM_example.py b/examples/decomposition/PartAM_example.py new file mode 100644 index 000000000..d64fe596b --- /dev/null +++ b/examples/decomposition/PartAM_example.py @@ -0,0 +1,162 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 26 14:42:56 2020 +Copyright 2020 Peter Rakyta, Ph.D. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author: Peter Rakyta, Ph.D. +""" +## \file PartAM_example.py +## \brief Simple example python code demonstrating Partition Aware Mapping + +import time +import numpy as np + +from squander import Partition_Aware_Mapping +from squander import utils +from squander import Circuit +from squander.decomposition.qgd_Wide_Circuit_Optimization import ( + qgd_Wide_Circuit_Optimization, +) + + +def make_linear_topology(n_qubits): + return [(i, i + 1) for i in range(n_qubits - 1)] + + +def validate_result(circ_orig, parameters_orig, circ, params, input_perm, output_perm): + """Apply both circuits to a random state and return ``1 - |vdot(state, original_state)|``.""" + num_qubits = circ.get_Qbit_Num() + matrix_size = 1 << num_qubits + rng = np.random.RandomState(0) + initial_state = ( + rng.uniform(-1, 1, (matrix_size,)) + + 1j * rng.uniform(-1, 1, (matrix_size,)) + ) + initial_state /= np.linalg.norm(initial_state) + + original_state = initial_state.copy() + circ_orig.apply_to(parameters_orig, original_state) + + circ_Final = Circuit(num_qubits) + output_perm_T = [0] * num_qubits + for i, j in enumerate(output_perm): + output_perm_T[j] = i + circ_Final.add_Permutation([int(x) for x in input_perm]) + circ_Final.add_Circuit(circ) + circ_Final.add_Permutation(output_perm_T) + + state = initial_state.copy() + circ_Final.apply_to(params, state) + return 1 - abs(np.vdot(state, original_state)) + + +if __name__ == '__main__': + + filename = "bv_n14.qasm" + + # load the circuit from a file + circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename) + N = circ_orig.get_Qbit_Num() + topology = 
make_linear_topology(N) + + initial_cnot = circ_orig.get_Gate_Nums().get('CNOT', 0) + print(f"Qubits: {N}, initial CNOTs: {initial_cnot}") + + start_time = time.time() + + # one-shot WCO pass before PartAM (topology=None, max_partition_size=3, + # part_size_end=4) to fuse trivially-mergeable blocks + pre_partam_cleanup_config = { + 'strategy': 'TreeSearch', + 'pre-opt-strategy': 'TreeSearch', + 'partition_strategy': 'ilp', + 'test_subcircuits': False, + 'test_final_circuit': False, + 'max_partition_size': 3, + 'topology': None, + 'verbosity': 0, + 'tolerance': 1e-8, + 'parallel': 0, + 'part_size_end': 4, + } + wco = qgd_Wide_Circuit_Optimization(pre_partam_cleanup_config) + pre_partam_circ, pre_partam_params = wco.OptimizeWideCircuit( + circ_orig.get_Flat_Circuit(), + parameters_orig, + ) + pre_partam_cleanup_cnot = pre_partam_circ.get_Gate_Nums().get('CNOT', 0) + print(f"PartAM input CNOTs after pre-cleanup: {pre_partam_cleanup_cnot}") + + # PartAM config + config = { + 'strategy': "TreeSearch", + 'test_subcircuits': False, + 'test_final_circuit': False, + 'max_partition_size': 3, + 'progressbar': False, + 'topology': topology, + 'verbosity': 0, + 'cleanup': True, + 'sabre_iterations': 20, + 'n_layout_trials': 128, + 'random_seed': 42, + # Cheap candidate prefilter before full A* scoring. + 'prefilter_top_k': 400, + 'prefilter_min_per_partition': 2, + 'prefilter_min_3q': 12, + # Rank every layout trial by actual constructed routing, not only by + # the heuristic trial cost. QFT is sensitive to this cap. + 'actual_routing_rank_top_k': None, + # Boundary-state beam routing runs in the C++ SABRE router. 
+ 'use_cpp_router': True, + 'layout_boundary_beam_width': 4, + 'layout_boundary_beam_depth': 3, + 'boundary_beam_width': 4, + 'boundary_beam_depth': 3, + 'cnot_cost': 0.5 / 3.0, + 'cleanup_top_k': 3, + 'parallel_layout_trials': True, + 'layout_trial_workers': 0, + 'max_E_size': 40, + 'max_lookahead': 6, + 'E_weight': 0.3, + 'E_alpha': 1.0, # LightSABRE-style uniform lookahead (no per-depth decay) + 'decay_delta': 0.001, + 'swap_burst_budget': 0, + 'path_tiebreak_weight': 0.2, + 'three_qubit_exit_weight': 1.5, + 'partition_weight_model': 'window_turnover', + 'pack_credit_weight': 1.0, + 'partition_chain_penalty_weight': 2.5, + } + + # instantiate the object for Partition Aware Mapping + pam = Partition_Aware_Mapping(config) + + # run Partition Aware Mapping + circ, params, input_perm, output_perm = pam.Partition_Aware_Mapping( + pre_partam_circ.get_Flat_Circuit(), pre_partam_params + ) + + elapsed = time.time() - start_time + + error = validate_result( + circ_orig, parameters_orig, circ, params, input_perm, output_perm + ) + + print(f"CNOTs pre-cleanup: {pam._cnot_pre_cleanup}") + print(f"CNOTs post-cleanup: {circ.get_Gate_Nums().get('CNOT', 0)}") + print(f"Decomposition error: {error:.10f}") + print("--- %s seconds elapsed during optimization ---" % elapsed) diff --git a/examples/decomposition/example_SABRE.py b/examples/decomposition/example_SABRE.py index 9b990e17e..68a235259 100644 --- a/examples/decomposition/example_SABRE.py +++ b/examples/decomposition/example_SABRE.py @@ -1,14 +1,10 @@ from squander import SABRE from squander import Qiskit_IO from squander import utils +from squander import Circuit from qiskit import transpile from qiskit import QuantumCircuit -from qiskit.circuit import CircuitInstruction -from qiskit.circuit.library import PermutationGate -from qiskit_aer import AerSimulator -from qiskit.quantum_info import Operator -from qiskit import QuantumRegister, ClassicalRegister import numpy as np parameters = np.array([]) @@ -39,13 +35,14 @@ 
print("INITIAL CIRCUIT:") #print( circuit_qiskit ) print("mapping (q -> Q):", pi) -print("Final mapping:", final_pi) qubits = list(range(N)) -Qiskit_circuit = QuantumCircuit(N) -pi_map = list(np.array(sabre.get_inverse_pi(pi))) -Qiskit_circuit.append(CircuitInstruction( PermutationGate(pi_map),qubits)) -Qiskit_circuit &= Qiskit_IO.get_Qiskit_Circuit( Squander_remapped_circuit, parameters_remapped_circuit ) -Qiskit_circuit.append(CircuitInstruction( PermutationGate(list(final_pi)),qubits)) +pi_map = list(np.array(sabre.get_inverse_pi(final_pi))) +print("Final mapping:", final_pi) +final_circuit = Circuit(N) +final_circuit.add_Permutation(list(pi)) +final_circuit.add_Circuit(Squander_remapped_circuit) +final_circuit.add_Permutation(list(pi_map)) +Qiskit_circuit = Qiskit_IO.get_Qiskit_Circuit( final_circuit.get_Flat_Circuit(), parameters_remapped_circuit ) print("CIRCUIT MAPPED WITH SABRE:") #print( Qiskit_circuit ) print("SABRE SWAP COUNT:", swap_count) @@ -61,27 +58,14 @@ print("CIRCUIT MAPPED WITH QISKIT:") #print( Qiskit_circuit_mapped ) print("QISKIT SWAP COUNT:", dict(Qiskit_circuit_mapped.count_ops())['swap']) - -# test the generated squander circuits -#matrix_size = 1 << Squander_initial_circuit.get_Qbit_Num() -#unitary_squander_initial = utils.get_unitary_from_qiskit_circuit_operator(circuit_qiskit) - -#unitary_squander_remapped_circuit = np.eye( 1 << Squander_initial_circuit.get_Qbit_Num(), dtype=np.complex128 ) -#Squander_remapped_circuit.apply_to( parameters_remapped_circuit, unitary_squander_remapped_circuit) -""" -unitary_squander_remapped_circuit = utils.get_unitary_from_qiskit_circuit_operator(Qiskit_circuit) - - -product_matrix = np.dot(unitary_squander_initial.conj().T, unitary_squander_remapped_circuit) -phase = np.angle(product_matrix[0,0]) -product_matrix = product_matrix*np.exp(-1j*phase) - - -product_matrix = np.eye(matrix_size)*2 - product_matrix - product_matrix.conj().T - -# the error of the decomposition -decomposition_error = 
(np.real(np.trace(product_matrix)))/2 - -print('The error of the decomposition is ' + str(decomposition_error)) - -""" \ No newline at end of file +num_qubits = final_circuit.get_Qbit_Num() +matrix_size = 1 << num_qubits +initial_state_real = np.random.uniform(-1.0,1.0, (matrix_size,) ) +initial_state_imag = np.random.uniform(-1.0,1.0, (matrix_size,) ) +initial_state = initial_state_real + initial_state_imag*1j +initial_state = initial_state/np.linalg.norm(initial_state) +original_state = initial_state.copy() +Squander_initial_circuit.apply_to(parameters_initial,original_state) +SABRE_state = initial_state.copy() +final_circuit.apply_to(parameters_remapped_circuit,SABRE_state) +print(f"ERROR: {1-abs(np.vdot(SABRE_state,original_state))}") \ No newline at end of file diff --git a/examples/decomposition/wide_circuit_optimization.py b/examples/decomposition/wide_circuit_optimization.py index 32d9bde66..603a4688c 100644 --- a/examples/decomposition/wide_circuit_optimization.py +++ b/examples/decomposition/wide_circuit_optimization.py @@ -21,91 +21,63 @@ ## \brief Simple example python code demonstrating a wide circuit optimization import squander.decomposition.qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization -from squander.decomposition.qgd_Wide_Circuit_Optimization import CNOTGateCount -from squander.gates.qgd_Circuit import qgd_Circuit as Circuit +from squander import Partition_Aware_Mapping from squander import utils from squander import Qiskit_IO -import time, requests, os, zipfile, tempfile -from pathlib import Path +import time +from squander import Circuit +import numpy as np +from qiskit import transpile +def generate_star_topology(num_qubits): + return [(0, i) for i in range(1, num_qubits)] +def extract_two_qubit_gate_count(gate_nums_dict): + # List of two-qubit gate names + two_qubit_gates = ['CNOT', 'CZ', 'CU', 'CH', 'SYC', 'CRY', 'CRZ', 'CRX', 'CP', 'SWAP', 'CSWAP'] + + total_two_qubit = 0 + for gate_name in two_qubit_gates: + total_two_qubit += 
gate_nums_dict.get(gate_name, 0) + return total_two_qubit +if __name__ == '__main__': -if __name__ == "__main__": - - config = { - "strategy": "TreeSearch", # possible values: "TreeSearch", "qiskit", "bqskit", "TabuSearch" - "test_subcircuits": False, - "test_final_circuit": False, - "max_partition_size": 3, - "beam": None, - "use_osr": True, - "use_graph_search": True, - "pre-opt-strategy": "TreeSearch", # possible values: "TreeSearch", "qiskit", "bqskit", "TabuSearch" - "routing-strategy": "seqpam-ilp", # possible values: "sabre", "light-sabre", "bqskit-sabre", "seqpam-quick", "seqpam-ilp" - "tolerance": 1e-10, - # **{'use_basin_hopping': True, 'bh_T': 1.1822334624366124, 'bh_stepsize': 0.9020671823381502, 'bh_interval': 165, 'bh_target_accept_rate': 0.7037812116166546, 'bh_stepwise_factor': 0.8254028860713254} + use_qiskit_sabre = False + config = { + 'strategy': "TreeSearch", + 'test_subcircuits': False, + 'test_final_circuit': True, + 'max_partition_size': 3, + 'beam': 16, + "use_gl": True, + 'tolerance': 1e-10, } - import os - - files = [os.path.join(Path(__file__).resolve().parent, "bv_n14.qasm")] + filename = "benchmarks/qfast/5q/vqe.qasm" + start_time = time.time() - results = {} - for filename in files: - print(f"executing optimization of circuit: {filename}") + # load the circuit from a file + circ_orig, parameters_orig = utils.qasm_to_squander_circuit(filename) + N = circ_orig.get_Qbit_Num() + # instantiate the object for optimizing wide circuits + wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config ) - # load the circuit from a file - circ, parameters, _ = utils.qasm_to_squander_circuit(filename) - config["topology"] = ( - Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization.linear_topology( - circ.get_Qbit_Num() - ) - ) + # run circuit optimization + circ_flat, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ_orig, parameters_orig, True ) - # run circuit optimization - wide_circuit_optimizer = ( - 
Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization({**config}) - ) - start_time = time.time() - optcirc, optparameters = wide_circuit_optimizer.OptimizeWideCircuit( - circ, parameters - ) - elapsed = time.time() - start_time - init_cnot_count = CNOTGateCount(circ, 0) - cnot_count, opt_time = CNOTGateCount( - optcirc, 0 - ), wide_circuit_optimizer.config.get("optimization_time", None) - a2a_cnot_count, routed_cnot_count = None, None - a2a_time, routing_time = 0.0, 0.0 + config['topology'] = generate_star_topology(N) + circo = Qiskit_IO.get_Qiskit_Circuit(circ_flat.get_Flat_Circuit(),parameters) + if use_qiskit_sabre: + coupling_map = [[i,j] for i,j in config['topology']] + circuit_qiskit_sabre = transpile(circo, coupling_map=coupling_map) + circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_sabre) + config['routed']= True + wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config ) + else: + wide_circuit_optimizer = Wide_Circuit_Optimization.qgd_Wide_Circuit_Optimization( config ) + # run circuit optimization + circ, parameters = Qiskit_IO.convert_Qiskit_to_Squander(circo) + circ, parameters = wide_circuit_optimizer.OptimizeWideCircuit( circ, parameters, True ) + print(f"Two qubit gate count: {extract_two_qubit_gate_count(circ.get_Gate_Nums())}") + print("--- %s seconds elapsed during optimization ---" % (time.time() - start_time)) - if wide_circuit_optimizer.config.get("routed_circuit", None) is not None: - init_map, final_map = ( - wide_circuit_optimizer.config["initial_mapping"], - wide_circuit_optimizer.config["final_mapping"], - ) - a2acirc, a2aparams = ( - wide_circuit_optimizer.config["all_to_all_circuit"], - wide_circuit_optimizer.config["all_to_all_parameters"], - ) - routedcirc, routedparams = ( - wide_circuit_optimizer.config["routed_circuit"], - wide_circuit_optimizer.config["routed_parameters"], - ) - a2a_cnot_count = CNOTGateCount(a2acirc, 0) - routed_cnot_count = CNOTGateCount(routedcirc, 0) - 
a2a_time = wide_circuit_optimizer.config.get( - "all_to_all_optimization_time", None - ) - routing_time = wide_circuit_optimizer.config.get("routing_time", None) - results[os.path.basename(filename)] = ( - (init_cnot_count, a2a_cnot_count, routed_cnot_count, cnot_count), - (a2a_time, routing_time, opt_time, elapsed), - ) - wide_circuit_optimizer.check_compare_circuits( - circ, optparameters, optcirc, optparameters, routing=True - ) - with open("results.txt", "a") as f: - f.write( - f"{os.path.basename(filename)}: {config['pre-opt-strategy']}, {config['routing-strategy']}, {config['strategy']} CNOT count = {init_cnot_count, a2a_cnot_count, routed_cnot_count, cnot_count}, elapsed time = {a2a_time:.2f} + {routing_time:.2f} + {opt_time:.2f} = {elapsed:.2f} seconds\n" - ) - print("--- %s seconds elapsed during optimization ---" % elapsed) diff --git a/pyproject.toml b/pyproject.toml index 3ec14e77b..1721b5129 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,8 @@ requires = [ "tbb-devel; platform_machine == 'x86' or platform_machine == 'x86_64'", "cmake>=3.10.2", "networkx", - "qiskit" + "qiskit", + "tqdm" ] build-backend = "setuptools.build_meta" diff --git a/squander/IO_interfaces/Qiskit_IO.py b/squander/IO_interfaces/Qiskit_IO.py index fa5e3122e..f4b5a4873 100644 --- a/squander/IO_interfaces/Qiskit_IO.py +++ b/squander/IO_interfaces/Qiskit_IO.py @@ -62,7 +62,8 @@ CCX, RXX, RYY, - RZZ ) + RZZ, + Permutation ) @@ -79,7 +80,6 @@ def scalar(param): def get_Qiskit_Circuit( Squander_circuit, parameters ): from qiskit import QuantumCircuit - # creating Qiskit quantum circuit circuit = QuantumCircuit(Squander_circuit.get_Qbit_Num() ) @@ -218,6 +218,13 @@ def get_Qiskit_Circuit( Squander_circuit, parameters ): #CCX gate target_qbits = gate.get_Target_Qbits() circuit.swap(target_qbits[0], target_qbits[1]) + elif isinstance(gate, Permutation): + #Permutation gate + from qiskit.circuit.library import PermutationGate + pattern = gate.get_Pattern() + qubits = 
list(range(len(pattern))) + circuit.append( PermutationGate(pattern),qubits) + elif isinstance( gate, RXX ): # RXX gate @@ -594,6 +601,11 @@ def convert_Qiskit_to_Squander( qc_in ): Circuit_Squander.add_RZZ( [qubit0, qubit1] ) + elif name[:11] == 'permutation': + #Permutation gate + pattern = gate.operation.pattern + Circuit_Squander.add_Permutation( pattern ) + else: print(f"convert_Qiskit_to_Squander: Unimplemented gate: {name}") diff --git a/squander/__init__.py b/squander/__init__.py index 46e41ba35..87755ccc5 100644 --- a/squander/__init__.py +++ b/squander/__init__.py @@ -14,7 +14,7 @@ # optimization of wide circuits (optimize wide circuits) from squander.decomposition.qgd_Wide_Circuit_Optimization import qgd_Wide_Circuit_Optimization as Wide_Circuit_Optimization - +from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping as Partition_Aware_Mapping # variational quantum solver from squander.VQA.qgd_Variational_Quantum_Eigensolver_Base import qgd_Variational_Quantum_Eigensolver_Base as Variational_Quantum_Eigensolver from squander.VQA.qgd_Generative_Quantum_Machine_Learning_Base import qgd_Generative_Quantum_Machine_Learning_Base as Generative_Quantum_Machine_Learning @@ -52,7 +52,8 @@ RXX, RYY, RZZ, - SXdg + SXdg, + Permutation ) diff --git a/squander/decomposition/qgd_Wide_Circuit_Optimization.py b/squander/decomposition/qgd_Wide_Circuit_Optimization.py index 4841d66cf..423038a3b 100644 --- a/squander/decomposition/qgd_Wide_Circuit_Optimization.py +++ b/squander/decomposition/qgd_Wide_Circuit_Optimization.py @@ -27,6 +27,20 @@ from squander.synthesis.qgd_SABRE import qgd_SABRE as SABRE +def _affinity_num_workers(): + """Return CPU count visible to this process via sched affinity, falling back to cpu_count. + + Use this to size BQSKit ``Compiler(num_workers=...)`` so it does not oversubscribe + when the job is bound (taskset/cgroup) to a subset of the machine's CPUs. 
+ """ + if hasattr(os, "sched_getaffinity"): + try: + return max(1, len(os.sched_getaffinity(0))) + except OSError: + pass + return max(1, mp.cpu_count()) + + def extract_subtopology(involved_qbits, qbit_map, config): """Return topology edges restricted to ``involved_qbits``, with indices remapped via ``qbit_map``. @@ -97,1197 +111,176 @@ def CNOTGateCount(circ: Circuit, max_gates: int = 0) -> int: return num_cnots -class N_Qubit_Decomposition_Guided_Tree(N_Qubit_Decomposition_custom): - """Tree-guided multi-qubit decomposition using operator Schmidt rank (OSR) style costs.""" - - def __init__( - self, Umtx, config, accelerator_num, topology, paramspace=None, paramscale=None - ): - """Initialize guided tree search over a unitary (or list of unitaries) and hardware topology. - - Args: - Umtx: Complex unitary matrix, or list of such matrices (already conjugate-transposed per caller). - config: Decomposition / search configuration dict. - accelerator_num: Number of accelerators for the base decomposer. - topology: List of undirected coupler pairs ``(i, j)``; default is all-to-all. - paramspace: Optional per-parameter affine scaling space for ``params_to_mat``. - paramscale: Optional scaling denominators paired with ``paramspace``. 
- """ - super().__init__( - Umtx[0] if isinstance(Umtx, list) else Umtx, - config=config, - accelerator_num=accelerator_num, - ) - self.Umtx = ( - Umtx if isinstance(Umtx, list) else [Umtx] - ) # already conjugate transposed - self.qbit_num = self.Umtx[0].shape[0].bit_length() - 1 - self.config = config - self.accelerator_num = accelerator_num - self.paramspace = paramspace - self.paramscale = () if paramscale is None else paramscale - # self.set_Cost_Function_Variant( 0 ) #0 is Frobenius, 3 is HS, 10 is OSR - if topology is None: - topology = [ - (i, j) - for i in range(self.qbit_num) - for j in range(i + 1, self.qbit_num) - ] - self.topology = topology - - @staticmethod - def enumerate_unordered_cnot_BFS(n: int, topology=None, use_gl=True): - """Yield successive BFS levels of CNOT-reachable GL(n,2) states (see ``enumerate_unordered_cnot_BFS_level``). - - Args: - n: Number of qubits. - topology: Allowed unordered CNOT pairs; default all pairs. - use_gl: If True, use GL-style column updates; else restricted enumeration. - - Yields: - Each level's list of ``(state_key, seq_pairs, seq_directed)`` discoveries. - """ - # Precompute unordered pairs - topology = ( - [(i, j) for i in range(n) for j in range(i + 1, n)] - if topology is None - else topology - ) - prior_level_info: Union[tuple[Any, Any, Any, Any], None] = None - while True: - visited, seq_pairs_of, seq_dir_of, res = ( - N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS_level( - n, topology, prior_level_info, use_gl=use_gl - ) - ) - if not res: - break - yield res - prior_level_info = ( - visited, - seq_pairs_of, - seq_dir_of, - list(x[0] for x in reversed(res)), - ) - - @staticmethod - def canonical_prefix_ok(seq): - """Check whether a sequence of unordered pair steps has a canonical topological order. +def generate_squander_seqpam(squander_config, block_size): + """Build a bqskit SeqPAM workflow using Squander as the inner synthesis engine with ILP partitioning. 
- Returns: - ``-1`` if the prefix is OK; otherwise the first index where canonical order fails. - """ - m = len(seq) - if m <= 1: - return -1 - succ = {} - indeg = {} - last_on = {} - for k in range(m): - for q in seq[k]: - if q in last_on: - p = last_on[q] - succ.setdefault(p, []).append(k) - indeg[k] = indeg.get(k, 0) + 1 - last_on[q] = k - import heapq - - pq = [(seq[x], x) for x in range(m) if indeg.get(x, 0) == 0] - heapq.heapify(pq) - for pos in range(m): - # Kahn's algorithm - if len(pq) == 0: - return pos # malformed (shouldn't happen) - u = heapq.heappop(pq) - if u[1] != pos: - return pos # deviation: not canonical - for v in succ.get(u[1], ()): - indeg[v] -= 1 - if indeg[v] == 0: - heapq.heappush(pq, (seq[v], v)) - return -1 - - @staticmethod - def enumerate_unordered_cnot_BFS_level( - n: int, - topology: Optional[List[Tuple[int, int]]] = None, - prior_level_info: Optional[ - Tuple[ - Set[Tuple[int, ...]], - Dict[Tuple[int, ...], List[Tuple[int, int]]], - Dict[Tuple[int, ...], List[Tuple[int, int]]], - List[ - Tuple[Tuple[int, ...], List[Tuple[int, int]], List[Tuple[int, int]]] - ], - ] - ] = None, - use_gl=True, - ): - """Enumerate GL(n,2) states at the next BFS depth from ``prior_level_info``. - - Moves are *recorded* as unordered pairs (structure view); each expansion - may try both CNOT directions internally when ``use_gl`` is True. - - Returns: - Tuple ``(visited, seq_pairs_of, seq_dir_of, res)`` where ``res`` is a - list of ``(A, seq_pairs, seq_directed)`` for newly discovered states - ``A``: ``seq_pairs`` is the unordered-pair history; ``seq_directed`` is - a consistent directed realization. On the first call, pass - ``prior_level_info=None`` to obtain the root state only. 
- """ - if prior_level_info is None: - # Initial state - start_key = tuple(1 << i for i in range(n)) - - # Visited: we only need to mark states once (minimal depth) - visited = {start_key} - - # We also keep *one* representative sequence per state (unordered + directed) - seq_pairs_of = {start_key: []} - seq_dir_of = {start_key: []} - - # Yield the root - return visited, seq_pairs_of, seq_dir_of, [(start_key, [], [])] - else: - visited, seq_pairs_of, seq_dir_of, q = prior_level_info - res = [] - new_seq_pairs_of = {} - new_seq_dir_of = {} - - while q: - A = q.pop() - last_pairs = seq_pairs_of[A] - last_dirs = seq_dir_of[A] - assert topology is not None - for p in topology: - if not use_gl: - if len(last_pairs) >= 3 and all(p == x for x in last_pairs[-3:]): - continue # avoid more than 3 repeated CNOTs - if ( - N_Qubit_Decomposition_Guided_Tree.canonical_prefix_ok( - last_pairs + [p] - ) - >= 0 - ): - continue # not canonical prefix - # Try both directions, but record the *same* unordered step 'p' - for mv in (p, (p[1], p[0])) if use_gl else (p,): - # CNOT left - if use_gl: - if mv[0] == mv[1]: - B = A - else: - B = list(A) - B[mv[1]] ^= B[mv[0]] - B = tuple(B) - - if B in visited: - continue # already discovered at minimal depth - else: - B = tuple(last_dirs + [p]) - - visited.add(B) - new_seq_pairs_of[B] = last_pairs + [p] - new_seq_dir_of[B] = last_dirs + [mv] - - # Emit as soon as we discover the state (BFS → minimal depth) - res.append((B, new_seq_pairs_of[B], new_seq_dir_of[B])) - return visited, new_seq_pairs_of, new_seq_dir_of, res - - @staticmethod - def build_sequence(stop: int = 5, ordered: bool = True, use_gl: bool = True): - """Debug helper: print distribution of minimal CNOT sequence lengths by qubit count (up to ``stop``). - - See OEIS A002884 for related enumeration context. Not used in production optimization paths. 
- """ - # https://oeis.org/A002884 - # unordered sequence: 1, 1, 4, 88, 9556, 4526605 - # unordered at 5 qubits: {0: 1, 1: 10, 2: 85, 3: 650, 4: 4475, 5: 27375, 6: 142499, 7: 580482, 8: 1501297, 9: 1738232, 10: 517884, 11: 13591, 12: 24} - for i in range(2, stop + 1): - d = {} - for z in N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS( - i, use_gl=use_gl - ): - for x in (list if ordered else set)(tuple(x[1]) for x in z): - d[len(x)] = d.get(len(x), 0) + 1 - if not use_gl and len(d) > 5: - break - print({x: d[x] for x in sorted(d)}, sum(d.values())) + Args: + squander_config: Config dict passed to SquanderSynthesisPass (bqskit-squander keys: + ``strategy`` ("Tree_search"/"Tabu_search"), ``verbosity``, + ``optimization_tolerance``, ``optimizer_engine``, etc.). + block_size: Maximum block size for ILP partitioning and SubtopologySelectionPass. - @staticmethod - def extract_bits(x, pos): - """Pack bits of integer ``x`` at positions ``pos`` into a smaller integer (LSB-first order).""" - return sum(((x >> p) & 1) << i for i, p in enumerate(pos)) + Returns: + bqskit Workflow implementing the two-stage permutation-aware mapping. + """ + from bqskit.passes import ( + SquanderSynthesisPass, + ForEachBlockPass, + EmbedAllPermutationsPass, + PAMRoutingPass, + PAMLayoutPass, + PAMVerificationSequence, + SubtopologySelectionPass, + ApplyPlacement, + UnfoldPass, + ExtractModelConnectivityPass, + RestoreModelConnectivityPass, + LogPass, + ) + from bqskit.passes.control import IfThenElsePass + from bqskit.passes.control.predicates import NotPredicate, WidthPredicate + from bqskit.compiler import Workflow, BasePass - @staticmethod - def build_osr_matrix(U, n, A): - """Reshape unitary ``U`` (size ``2^n``) into the OSR matrix for bipartition ``A`` vs complement. + class SquanderILPPartitioner(BasePass): + """Partition a bqskit circuit using Squander's ILP partitioner.""" - Args: - U: Flattened ``2^n x 2^n`` unitary (row-major). - n: Qubit count. 
- A: Tuple of qubit indices on subsystem A. + def __init__(self, block_size): + self.block_size = block_size - Returns: - Matrix of shape ``(2^{|A|})^2 x (2^{|B|})^2`` for Schmidt analysis. - """ - A = list(reversed(A)) - B = list(sorted(set(range(n)) - set(A), reverse=True)) - A, B = [n - 1 - q for q in A], [n - 1 - q for q in B] - dA = 1 << len(A) - dB = 1 << len(B) - return ( - U.reshape([2] * (2 * n)) - .transpose( - tuple(A) + tuple(t + n for t in A) + tuple(B) + tuple(t + n for t in B) + async def run(self, circuit, data): + from bqskit.ir import Circuit as BQCircuit + from bqskit.ir.lang.qasm2 import OPENQASM2Language + from qiskit import qasm2 + from qiskit import QuantumCircuit as QkCircuit + from squander import Qiskit_IO + from squander.partitioning.partition import PartitionCircuit + + # Unfold any CircuitGate blocks (e.g. from a prior SubtopologySelectionPass) + # so that bqskit op indices align 1:1 with squander gate indices after the + # QASM roundtrip. unfold_all() is a no-op on already-flat circuits. 
+ flat_circuit = circuit.copy() + flat_circuit.unfold_all() + + qasm_str = OPENQASM2Language().encode(flat_circuit) + qk_circ = QkCircuit.from_qasm_str(qasm_str) + sqdr_circ, sqdr_parameters = Qiskit_IO.convert_Qiskit_to_Squander( + qk_circ ) - .reshape(dA * dA, dB * dB) - ) - - @staticmethod - def accumulate_grad_for_cut(U, G, Umat, VTmat, n, A): # qubits on A - """Accumulate gradient ``G * Umat @ VTmat`` from an SVD triplet back into full ``U`` layout for cut ``A``.""" - A = list(reversed(A)) - B = list(sorted(set(range(n)) - set(A), reverse=True)) - A, B = [n - 1 - q for q in A], [n - 1 - q for q in B] - mat = np.array(G) * Umat @ VTmat # reconstruct U from its dyadic decomposition - revmap = [None] * (2 * n) - for i, x in enumerate( - tuple(A) + tuple(t + n for t in A) + tuple(B) + tuple(t + n for t in B) - ): - revmap[x] = i - U += mat.reshape([2] * (2 * n)).transpose(tuple(revmap)).reshape(*U.shape) - return U - - @staticmethod - def trace_out_qubits(U, n, A): - """Trace out complement of subsystem ``A`` and return a unitary polar factor on ``A`` (2^{|A|} x 2^{|A|}).""" - M = N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, n, A) - M = np.linalg.svd(M, compute_uv=True, full_matrices=False)[0][:, 0].reshape( - 1 << len(A), 1 << len(A) - ) - return N_Qubit_Decomposition_Guided_Tree._polar_unitary(M) - - @staticmethod - def numerical_rank_osr(M, Fnorm, tol=1e-10): - """Count singular values of ``M/Fnorm`` above ``tol`` relative to the largest; returns ``(rank, s)``.""" - s = np.linalg.svd(M, full_matrices=False, compute_uv=False) / Fnorm - # print(s) - return int(np.sum(s >= s[0] * tol)), s - - @staticmethod - def operator_schmidt_rank(U, n, A, Fnorm, tol=1e-10): - """Operator Schmidt rank of ``U`` across cut ``A`` (via OSR matrix), using ``numerical_rank_osr``.""" - return N_Qubit_Decomposition_Guided_Tree.numerical_rank_osr( - N_Qubit_Decomposition_Guided_Tree.build_osr_matrix(U, n, A), Fnorm, tol - ) - - @staticmethod - def unique_cuts(n): - """Yield all 
nontrivial unordered bipartitions of ``n`` qubits (each complement pair once).""" - import itertools - - qubits = tuple(range(n)) - for r in range(1, n // 2 + 1): # only up to half - for S in itertools.combinations(qubits, r): - if r < n - r: - yield S - else: # r == n-r (only possible when n even): tie-break - comp = tuple(q for q in qubits if q not in S) - if S < comp: # lexicographically smaller tuple wins - yield S - - def get_circuit_from_pairs(self, pairs, finalizing=True): - """Build a layer of U3–U3–CNOT per pair, optionally followed by trailing U3 on every qubit.""" - circ = Circuit(self.qbit_num) - for pair in pairs: - circ.add_U3(pair[0]) - circ.add_U3(pair[1]) - circ.add_CNOT(pair[0], pair[1]) - if finalizing: - for qbit in range(self.qbit_num): - circ.add_U3(qbit) - return circ - - @staticmethod - def ceil_log2(x): - """Ceiling of log2 for nonnegative integer ``x``; ``0`` maps to ``0``.""" - return 0 if x == 0 else (x - 1).bit_length() - - @staticmethod - def logsumexp_smoothmax(Lc, tau=1e-2): - """Smooth maximum of list ``Lc``: ``tau * log(sum exp(v/tau)) + max``, stable implementation.""" - if not Lc: - return 0.0 - if tau <= 0.0: - raise RuntimeError("tau must be > 0") - m = max(Lc) - acc = 0.0 - for v in Lc: - acc += np.exp((v - m) / tau) - return tau * np.log(acc) + m - - @staticmethod - def dyadic_loss(S, max_dyadic, rho=0.9, tol=1e-4): - """Weighted loss on dyadic singular-value indices (powers of two) of normalized spectrum ``S``.""" - tot_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(len(S)) - w = 1.0 - acc = 0.0 - for k in range(max_dyadic - 1, -1, -1): - if k < tot_dyadic: - val = S[1 << k] - S[0] * tol - acc += w * val * val - w *= rho - return acc - - @staticmethod - def avg_loss(cuts_S, rho=0.9): - """Average ``dyadic_loss`` over a list of singular-value spectra ``cuts_S``.""" - max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2( - max(len(S) for S in cuts_S) - ) - total_loss = 0.0 - for S in cuts_S: - total_loss += 
N_Qubit_Decomposition_Guided_Tree.dyadic_loss( - S, max_dyadic, rho + partitioned_circuit, parameters, _ = PartitionCircuit( + sqdr_circ, + sqdr_parameters, + self.block_size, + strategy="ilp", ) - return total_loss / len(cuts_S) - - # Aggregated cost over cuts: softmax (log-sum-exp) of per-cut dyadic losses - @staticmethod - def cuts_softmax_dyadic_cost(cuts_S, rho=0.1, tau=1e-2): - """Log-sum-exp aggregate of per-cut dyadic losses (temperature ``tau``).""" - if tau <= 0.0: - raise RuntimeError("tau must be > 0") - Lc = [] - max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2( - max(len(S) for S in cuts_S) - ) - for S in cuts_S: - Lc.append(N_Qubit_Decomposition_Guided_Tree.dyadic_loss(S, max_dyadic, rho)) - return N_Qubit_Decomposition_Guided_Tree.logsumexp_smoothmax(Lc, tau) - - # Gradient w.r.t. the singular values (diagonal of dL/dΣ): - @staticmethod - def dyadic_loss_grad_diag(S, max_dyadic, Fnorm, rho=0.1, tol=1e-4): - """Diagonal gradient of ``dyadic_loss`` w.r.t. singular values (dyadic indices only).""" - n = len(S) - # c_k = rho^k / Mk for k=1..n-1, then prefix sum C_j = sum_{k=1}^j c_k - tot_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2(n) - grad = [0.0] * tot_dyadic - w = 1.0 - for k in range(max_dyadic - 1, -1, -1): - if k < tot_dyadic: - idx = 1 << k - grad[k] = ( - 2.0 * w * S[idx] * (1.0 - tol) / Fnorm - ) # 1-tol not needed if using stop-grad - w *= rho # w = rho^k - return grad - @staticmethod - def cuts_avg_dyadic_grad(cuts_S, Fnorm, rho=0.1): - """Per-cut gradients for the average dyadic loss (list parallel to ``cuts_S``).""" - C = len(cuts_S) - max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2( - max(len(S) for S in cuts_S) - ) - Lc = [] - for c in range(C): - Lc.append( - N_Qubit_Decomposition_Guided_Tree.dyadic_loss_grad_diag( - cuts_S[c], max_dyadic, Fnorm * C, rho + partitioned = BQCircuit(circuit.num_qudits, circuit.radixes) + qasm = OPENQASM2Language() + + for subcircuit in partitioned_circuit.get_Gates(): + 
global_qudits = list(subcircuit.get_Qbits()) + if not global_qudits: + continue + + start = subcircuit.get_Parameter_Start_Index() + stop = start + subcircuit.get_Parameter_Num() + sub_parameters = parameters[start:stop] + local_map = {q: i for i, q in enumerate(global_qudits)} + local_subcircuit = subcircuit.Remap_Qbits( + local_map, + len(global_qudits), ) - ) - return Lc - - # Gradient w.r.t. singular values (same length as S). - # Only dyadic positions (1,2,4,...) get nonzero entries; others are 0. - @staticmethod - def cuts_softmax_tail_grad(cuts_S, Fnorm, rho=0.1, tau=1e-2): - """Gradient of softmax-of-dyadic-losses w.r.t. each cut's singular values.""" - C = len(cuts_S) - if C == 0: - return [] - max_dyadic = N_Qubit_Decomposition_Guided_Tree.ceil_log2( - max(len(S) for S in cuts_S) - ) - # 1) per-cut losses - Lc = [ - N_Qubit_Decomposition_Guided_Tree.dyadic_loss(cuts_S[c], max_dyadic, rho) - for c in range(C) - ] - - # 2) softmax weights w_c = exp((Lc - m)/tau) / Z - m = max(Lc) - w = [np.exp((Lc[c] - m) / tau) for c in range(C)] - Z = np.sum(w) - for c in range(C): - w[c] /= Z if Z > 0.0 else 1.0 - - # 3) dL/dS^{(c)} = w_c * dL_c/dS^{(c)} - return [ - [ - v * w[c] - for v in N_Qubit_Decomposition_Guided_Tree.dyadic_loss_grad_diag( - cuts_S[c], max_dyadic, Fnorm, rho + local_qiskit = Qiskit_IO.get_Qiskit_Circuit( + local_subcircuit, + sub_parameters, ) - ] - for c in range(C) - ] - - @staticmethod - def loss_for_rank(S, rank): - """Sum of squares of singular values from index ``2**rank`` onward (tail beyond target rank).""" - start = 1 << rank - if start >= len(S): - return 0.0 - return sum(x * x for x in S[start:]) - - @staticmethod - def avg_loss_for_rank(cuts_S, rank): - """Average ``loss_for_rank`` over cuts.""" - if not cuts_S: - return 0.0 - total_loss = 0.0 - for S in cuts_S: - total_loss += N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank) - return total_loss / len(cuts_S) - - # Aggregated cost over cuts: softmax (log-sum-exp) of per-cut 
dyadic losses - @staticmethod - def cuts_softmax_rank_cost(cuts_S, rank, tau=1e-2): - """Softmax aggregate of per-cut ``loss_for_rank`` (temperature ``tau``).""" - Lc = [] - for S in cuts_S: - Lc.append(N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank)) - return N_Qubit_Decomposition_Guided_Tree.logsumexp_smoothmax(Lc, tau) - - # Gradient w.r.t. the singular values (diagonal of dL/dΣ): - @staticmethod - def loss_for_rank_grad_diag(S, rank, Fnorm): - """ - Gradient of a single-cut tail loss with respect to the RAW singular values, - assuming S is already normalized and Fnorm is treated as constant. - - If S = sigma / Fnorm, then d/dsigma_i sum_{j>=r} S_j^2 = 2*S_i/Fnorm on tail. - """ - n = len(S) - start = 1 << rank - grad = [0.0] * n - if start >= n: - return grad - invF = 1.0 / Fnorm - for i in range(start, n): - grad[i] = 2.0 * S[i] * invF - return grad - - @staticmethod - def cuts_avg_rank_grad(cuts_S, rank, Fnorm): - """ - Gradient of average tail loss across cuts. - Returns one gradient vector per cut, same length as that cut's S. - """ - C = len(cuts_S) - if C == 0: - return [] - scale = 1.0 / C - out = [] - for S in cuts_S: - g = N_Qubit_Decomposition_Guided_Tree.loss_for_rank_grad_diag( - S, rank, Fnorm - ) - out.append([scale * v for v in g]) - return out - - # Gradient w.r.t. singular values (same length as S). 
- @staticmethod - def cuts_softmax_rank_grad(cuts_S, rank, Fnorm, tau=1e-2): - """ - Gradient of smooth-max across cuts: - L = tau * log(sum_c exp(L_c / tau)) - so - dL = sum_c softmax_c * dL_c - """ - C = len(cuts_S) - if C == 0: - return [] - if tau <= 0.0: - raise RuntimeError("tau must be > 0") - - Lc = [N_Qubit_Decomposition_Guided_Tree.loss_for_rank(S, rank) for S in cuts_S] - - m = max(Lc) - w = [np.exp((v - m) / tau) for v in Lc] - Z = np.sum(w) - if Z <= 0.0: - Z = 1.0 - w = [x / Z for x in w] - - out = [] - for c, S in enumerate(cuts_S): - g = N_Qubit_Decomposition_Guided_Tree.loss_for_rank_grad_diag( - S, rank, Fnorm - ) - out.append([w[c] * v for v in g]) - return out - - # Build M with build_osr_matrix, then SVD (econ) and grab top triplet. - @staticmethod - def top_k_triplet_for_cut( - U, # (N x N), row-major, N = 1<, i|Φ->, i|Ψ+>, |Ψ->) up to harmless phases - return (1 / np.sqrt(2)) * np.array( - [[1, 0, 0, 1j], [0, 1j, 1, 0], [0, 1j, -1, 0], [1j, 0, 0, -1]], - dtype=complex, - ) - - @staticmethod - def _project_to_SO4(O): - """Nearest proper SO(4) rotation to real matrix ``O`` (SVD with det fix).""" - # nearest real orthogonal with det=+1 - O = np.real_if_close(O, tol=1e5) - U, _, Vt = np.linalg.svd(O) - O = U @ Vt - if np.linalg.det(O) < 0: - O[:, 0] *= -1 - return O - - @staticmethod - def _clean_col_phases(W): - """Remove column-wise global phases from matrix ``W`` (largest-magnitude entry per column).""" - Wc = W.copy() - for j in range(Wc.shape[1]): - col = Wc[:, j] - k = np.argmax(np.abs(col)) - if np.abs(col[k]) > 1e-14: - Wc[:, j] *= np.exp(-1j * np.angle(col[k])) - return Wc - - @staticmethod - def closest_local_product(W4): - """Best product of single-qubit unitaries approximating 4x4 ``W4`` (via ``factor_local``).""" - A, B = N_Qubit_Decomposition_Guided_Tree.factor_local(W4) - return N_Qubit_Decomposition_Guided_Tree._global_phase_fix( - A - ), N_Qubit_Decomposition_Guided_Tree._global_phase_fix(B) - - @staticmethod - def 
kak_u3s_around_cx(U, n, c, t, iters=3): - """KAK-style two-qubit block on control ``c`` and target ``t``: Weyl angles and U3 params (debug helper).""" - U4 = N_Qubit_Decomposition_Guided_Tree.trace_out_qubits(U, n, (c, t)) - U4 = N_Qubit_Decomposition_Guided_Tree._global_phase_fix(U4) - from qiskit.synthesis import TwoQubitWeylDecomposition - - twd = TwoQubitWeylDecomposition(U4) - c1, c2, c3 = twd.a, twd.b, twd.c - K1A, K1B, K2A, K2B = twd.K1l, twd.K1r, twd.K2l, twd.K2r - A = N_Qubit_Decomposition_Guided_Tree._A_from_c(c1, c2, c3) - U_rec = np.kron(K1A, K1B) @ A @ np.kron(K2A, K2B) - z = np.trace(U_rec.conj().T @ U4) - U_rec *= np.exp(1j * np.angle(z)) - print("Frob err:", np.linalg.norm(U_rec - U4), c1, c2, c3) - thA_pre, phA_pre, laA_pre = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz( - K2A.conj().T - ) - thB_pre, phB_pre, laB_pre = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz( - K2B.conj().T - ) - thA_post, phA_post, laA_post = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz( - K1A.conj().T - ) # left-apply ⇒ take dagger on outputs - thB_post, phB_post, laB_post = N_Qubit_Decomposition_Guided_Tree.su2_to_u3_zyz( - K1B.conj().T - ) - return { - "c": (c1, c2, c3), - "pre": { - "A": (thA_pre / 2, phA_pre, laA_pre), - "B": (thB_pre / 2, phB_pre, laB_pre), - }, - "post": { - "A": (thA_post / 2, phA_post, laA_post), - "B": (thB_post / 2, phB_post, laB_post), - }, - } - - def params_to_mat(self, params): - """Apply current gate structure to each target unitary with (optional) affine parameter scaling.""" - allU = [] - for U, pspace in zip( - self.Umtx, [None] if self.paramspace is None else self.paramspace - ): - U = U.copy() - scaled_params = ( - np.sum( - params.reshape(-1, 1 + len(pspace)) * np.array((1.0,) + pspace), - axis=1, + local_bqskit = qasm.decode(qasm2.dumps(local_qiskit)) + partitioned.append_circuit( + local_bqskit, + global_qudits, + as_circuit_gate=True, ) - if pspace is not None - else params - ) - self.get_Circuit().apply_to( - scaled_params 
if pspace is not None else params, U - ) - allU.append(U) - return allU - def OSR_with_local_alignment( - self, pairs, cuts, Fnorm, tol, rank, use_softmax, method="dual_annealing" - ): - """Optimize gate parameters to reduce OSR-based entanglement across ``cuts`` (optionally softmax-aggregated). + circuit.become(partitioned, False) - Uses cost variant 10 during optimization, then restores variant 3. Returns list of - ``(ceil_log2(rank), singular_spectrum)``-style entries per unitary and cut. - """ - if len(pairs) != 0: - self.set_Cost_Function_Variant(10) - # self.Run_Decomposition(pairs, False) - self.set_Gate_Structure(self.get_circuit_from_pairs(pairs, False)) - import scipy - - param_bound = np.array( - ([2 * np.pi] + [1 / x for x in self.paramscale]) - * self.get_Parameter_Num() - ) + class SetPAMInitialPlacementPass(BasePass): + """Set the placement used as the starting point for the final PAM layout.""" - def cost(x): - allU = self.params_to_mat(x) - S = [ - N_Qubit_Decomposition_Guided_Tree.operator_schmidt_rank( - U, self.qbit_num, cut, Fnorm, tol - )[1] - for U in allU - for cut in cuts - ] - if use_softmax: - return N_Qubit_Decomposition_Guided_Tree.cuts_softmax_rank_cost( - S, rank - ) - else: - return N_Qubit_Decomposition_Guided_Tree.avg_loss_for_rank(S, rank) - - def jacobian(x): - allU = self.params_to_mat(x) - grad = np.zeros(len(x), dtype=float) - for Ubase, U, pspace in zip( - self.Umtx, - allU, - [None] if self.paramspace is None else self.paramspace, - ): - dL = N_Qubit_Decomposition_Guided_Tree.get_deriv_osr_entanglement( - U, cuts, rank, use_softmax - ) - basevec = np.array((1.0,) if pspace is None else (1.0,) + pspace) - scaled_params = ( - np.sum(x.reshape(-1, 1 + len(pspace)) * basevec, axis=1) - if pspace is not None - else x - ) - derivs = N_Qubit_Decomposition_Guided_Tree.param_derivs( - self.get_Circuit(), Ubase, scaled_params - ) - newgrad = np.array( - [ - N_Qubit_Decomposition_Guided_Tree.real_trace_conj_dot( - dL, deriv - ) - for 
deriv in derivs - ] - ) - if pspace is not None: - newgrad = (np.array(newgrad)[:, np.newaxis] * basevec).reshape( - -1 - ) - grad += newgrad - return grad / len(self.Umtx) + def __init__(self, placement): + self.placement = None if placement is None else list(placement) - if method == "differential_evolution": - best = scipy.optimize.differential_evolution( - cost, [(0, x) for x in param_bound], maxiter=100, polish=False - ) - best = scipy.optimize.minimize( - cost, best.x, method="BFGS", jac=jacobian, options={"maxiter": 200} - ) - elif method == "dual_annealing": - best = None - for seed in range(20): - res = scipy.optimize.dual_annealing( - cost, [(0, x) for x in param_bound], maxiter=100 - ) # , minimizer_kwargs={'jac': jacobian}) - if best is None or res.fun < best.fun: - best = res - elif method == "basinhopping": - best = scipy.optimize.basinhopping( - cost, - np.random.rand(len(param_bound)) * param_bound, - niter=50, - stepsize=np.pi / 2, - minimizer_kwargs={"jac": jacobian}, - ) - else: - best = min( - [ - scipy.optimize.minimize( - cost, - np.random.rand(len(param_bound)) * param_bound, - method="BFGS", - jac=jacobian, - options={"maxiter": 200}, - ) - for _ in range(20) - ], - key=lambda r: r.fun, + async def run(self, circuit, data): + if self.placement is None: + return + if len(self.placement) != circuit.num_qudits: + raise ValueError( + "PAM initial placement length must match circuit width." 
) - # print(best) - self.set_Cost_Function_Variant(3) - assert best is not None - allU = self.params_to_mat(best.x) - else: - allU = self.Umtx - return [ - (N_Qubit_Decomposition_Guided_Tree.ceil_log2(rank), s) - for U in allU - for cut in cuts - for rank, s in ( - N_Qubit_Decomposition_Guided_Tree.operator_schmidt_rank( - U, self.qbit_num, cut, Fnorm, tol - ), - ) - ] - - def Run_Decomposition(self, pairs, finalizing=True): - """Run BFGS decomposition for CNOT structure ``pairs``; set ``self.err`` and return success vs tolerance.""" - circ = self.get_circuit_from_pairs(pairs, finalizing) - self.set_Gate_Structure(circ) - self.set_Optimized_Parameters( - np.random.rand(self.get_Parameter_Num()) * (2 * np.pi) - ) - super().Start_Decomposition() - if finalizing: - params = self.get_Optimized_Parameters() - self.err = self.Optimization_Problem(params) - return self.err < self.config.get("tolerance", 1e-8) - - @staticmethod - def generate_insertions(curpath, topology, num_cnot): - """Yield CNOT insertion patterns: insert ``num_cnot`` topology pairs into sequence ``curpath``.""" - import itertools + data.placement = list(self.placement) - n = len(curpath) - nslots = n + 1 - for places in itertools.combinations_with_replacement(range(nslots), num_cnot): - for pairs in itertools.product(topology, repeat=num_cnot): - out = [] - j = 0 # index into inserted pairs - for slot in range(nslots): - while j < num_cnot and places[j] == slot: - out.append(pairs[j]) - j += 1 - if slot < n: - out.append(curpath[slot]) - yield tuple(out) - - def Start_Decomposition(self): - """Beam-style search over CNOT prefixes guided by OSR stats; collects solutions in ``self.all_solutions``.""" - import heapq, itertools - - self.all_solutions = [] - self.err = 1.0 - stop_first_solution = self.config.get("stop_first_solution", True) - cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(self.qbit_num)) - # because we have U already conjugate transposed, must use prefix order - B = 
self.config.get("beam", None) # 8*len(self.topology)) - max_depth = self.config.get("tree_level_max", 14) - tol = 1e-3 - Fnorm = np.sqrt(1 << self.qbit_num) - best = [] - visited = set() - all_ranks = list(range(min(2, self.qbit_num - 1))) - - def get_osr_stats(path, rank, use_softmax): - """Return ``(min_cnots, rank_kappa_metric, raw_osr_list)`` for prefix ``path``.""" - h = self.OSR_with_local_alignment( - path, - cuts, - Fnorm, - tol=tol, - rank=rank, - use_softmax=use_softmax, - method="basin_hopping", - ) - min_cnots = max((x[0] for x in h), default=0) - ranktot = sum(x[0] for x in h) - kappa = sum(sum(y * y for y in x[1][1:]) for x in h) - return min_cnots, ranktot + kappa, h - - def add_to_heap(path, parent_stats): - """Push ``path`` onto search heap if within depth and OSR bounds improve on ``parent_stats``.""" - if len(path) > max_depth: - return False - if path in visited: - return False - visited.add(path) - if self.qbit_num > 1: - min_cnots, rankkappa = min( - get_osr_stats(path, rank, use_sm)[:2] - for (rank, use_sm) in itertools.product(all_ranks, (False,)) - ) # (False, True) - else: - min_cnots, rankkappa = 0, 0.0 - if parent_stats is not None and (min_cnots, rankkappa) >= parent_stats: - return False - heapq.heappush(best, (min_cnots, rankkappa, path)) - return True - - add_to_heap((), None) - while best: - # print(best[0]) - min_cnots, rankkappa, curpath = heapq.heappop(best) - if min_cnots == 0: - # print(path) - for i in range(10): - if self.Run_Decomposition(curpath): - self.all_solutions.append( - (self.get_Circuit(), self.get_Optimized_Parameters()) - ) - if stop_first_solution: - return - break - # print("Looping", h) - num_cnot = 1 - while True: - any_added = False - for newpath in N_Qubit_Decomposition_Guided_Tree.generate_insertions( - curpath, self.topology, num_cnot - ): - if add_to_heap(newpath, (min_cnots, rankkappa)): - any_added = True - if any_added: - break - num_cnot += 1 - if len(curpath) + num_cnot > max_depth: - break - 
self.set_Gate_Structure(Circuit(self.qbit_num)) - self.set_Optimized_Parameters(np.array([])) - # print("No decomposition found within the given CNOT limit.") - - """ - def Start_Decomposition(self): - self.all_solutions = [] - self.err = 1.0 - stop_first_solution = self.config.get("stop_first_solution", True) - cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(self.qbit_num)) - if self.topology is None: - self.topology = [(i, j) for i in range(self.qbit_num) for j in range(i+1, self.qbit_num)] - pair_affects = { - pair: [i for i,A in enumerate(cuts) if (pair[0] in A) ^ (pair[1] in A)] - for pair in self.topology - } - #because we have U already conjugate transposed, must use prefix order - B = self.config.get('beam', None)#8*len(self.topology)) - max_depth = self.config.get('tree_level_max', 14) - tol = 1e-3 - Fnorm = np.sqrt(1< remaining: continue - if not curh is None: - #print(path, [(h[i], curh[i]) for i in check_cuts]) - #if any(h[i][0] > curh[i][0] for i in check_cuts): continue - if max((x[0] for x in curh), default=0) < min_cnots: continue - nextprefixes.append((path, h)) - nextprefixes.sort(key=lambda t: (max((x[0] for x in t[1]), default=0), sum(x[0] for x in t[1]), N_Qubit_Decomposition_Guided_Tree.avg_loss([x[1] for x in t[1]]))) - prefixes = {x[0]: x[1] for x in nextprefixes[:B]} - prior_level_info = (visited, seq_pairs_of, seq_dir_of, list(x[0] for x in reversed(res) if tuple(x[1]) in prefixes)) - self.set_Gate_Structure(Circuit(self.qbit_num)) - self.set_Optimized_Parameters(np.array([])) - #print("No decomposition found within the given CNOT limit.") - """ + from bqskit.passes import QuickPartitioner + squander = SquanderSynthesisPass(squander_config=squander_config) + partitioner = SquanderILPPartitioner(block_size) + enable_pam_verification = bool(squander_config.get("enable_pam_verification", False)) + num_layout_passes = int(squander_config.get("num_layout_passes", 3)) + pam_initial_placement = 
squander_config.get("pam_initial_placement", None) - def get_Decomposition_Error(self): - """Last decomposition error (Frobenius / cost) from guided search or ``Run_Decomposition``.""" - return self.err - - @staticmethod - def compositions(total, parts): - """ - All nonnegative integer tuples of length `parts` summing to `total`. - """ - if parts == 1: - yield (total,) - return - for x in range(total + 1): - for rest in N_Qubit_Decomposition_Guided_Tree.compositions( - total - x, parts - 1 - ): - yield (x,) + rest - - @staticmethod - def solve_best_min_cnots(num_qubits, cuts, rank_kappa, topology, use_surplus=True): - """Minimize total CNOT count subject to per-cut edge coverage vs ``rank_kappa`` bounds; return best kappa.""" - m = len(topology) - cut_to_edges = [ - [i for i, z in enumerate(topology) if (z[0] in cut) != (z[1] in cut)] - for cut in cuts - ] - total = 0 - best_kappa = None - while True: - for edge_counts in N_Qubit_Decomposition_Guided_Tree.compositions(total, m): - if all( - sum(edge_counts[j] for j in cut_to_edge) >= cut_bound[0] - for cut_to_edge, cut_bound in zip(cut_to_edges, rank_kappa) - ): - new_kappa = 0.0 - for cut_to_edge, cut_bound in zip(cut_to_edges, rank_kappa): - coverage = sum(edge_counts[j] for j in cut_to_edge) - if use_surplus: - new_kappa += cut_bound[1] * (coverage - cut_bound[0]) - else: - new_kappa += cut_bound[1] * coverage - best_kappa = ( - new_kappa if best_kappa is None else max(best_kappa, new_kappa) - ) - if best_kappa is not None: - break - total += 1 - return total, best_kappa - - @staticmethod - def solve_min_cnots(num_qubits, cuts, cut_bounds, topology): - """Smallest total CNOT budget such that each cut's crossing edges meet ``cut_bounds``.""" - m = len(topology) - cut_to_edges = [ - [i for i, z in enumerate(topology) if (z[0] in cut) != (z[1] in cut)] - for cut in cuts - ] - total = 0 - while True: - for edge_counts in N_Qubit_Decomposition_Guided_Tree.compositions(total, m): - if all( - sum(edge_counts[j] for j 
in cut_to_edge) >= cut_bound - for cut_to_edge, cut_bound in zip(cut_to_edges, cut_bounds) - ): - return total - total += 1 - - @staticmethod - def gen_all_min_cnots( - num_qbits, topology=None - ): # OSR tells min CNOTs at most for 3 qubits 3, 4 qubits 6, 5 qubits 7 - """Debug: print min CNOT solutions for all combinations of per-cut bounds (see ``solve_min_cnots``).""" - import itertools + pam_verify_passes = ( + [PAMVerificationSequence(block_size)] if enable_pam_verification else [] + ) - cuts = list(N_Qubit_Decomposition_Guided_Tree.unique_cuts(num_qbits)) - min_cnot_bounds = [ - 2 * min(cut_size, num_qbits - cut_size) - for cut_size in (len(cut) for cut in cuts) - ] - if topology is None: - topology = [ - (i, j) for i in range(num_qbits) for j in range(i + 1, num_qbits) - ] - for cnot_bounds in itertools.product( - *(range(bound + 1) for bound in min_cnot_bounds) - ): - # if tuple(sorted(cnot_bounds)) != cnot_bounds: continue - print( - cnot_bounds, - N_Qubit_Decomposition_Guided_Tree.solve_min_cnots( - num_qbits, cuts, cnot_bounds, topology - ), - ) + inner_passes = [ + LogPass("Caching permutation-aware synthesis results."), + ExtractModelConnectivityPass(), + partitioner, + ForEachBlockPass( + EmbedAllPermutationsPass( + inner_synthesis=squander, + input_perm=True, + output_perm=False, + vary_topology=False, + ), + ), + LogPass("Preoptimizing with permutation-aware mapping."), + PAMRoutingPass(), + *pam_verify_passes, + UnfoldPass(), + RestoreModelConnectivityPass(), + LogPass("Recaching permutation-aware synthesis results."), + SubtopologySelectionPass(block_size), + QuickPartitioner(block_size), + ForEachBlockPass( + EmbedAllPermutationsPass( + inner_synthesis=squander, + input_perm=False, + output_perm=True, + vary_topology=True, + ), + ), + LogPass("Performing permutation-aware mapping."), + ApplyPlacement(), + SetPAMInitialPlacementPass(pam_initial_placement), + PAMLayoutPass(num_layout_passes), + PAMRoutingPass(0.1), + *pam_verify_passes, + 
ApplyPlacement(), + UnfoldPass(), + ] + + return Workflow( + IfThenElsePass( + NotPredicate(WidthPredicate(2)), + inner_passes, + ), + name="SeqPAM Mapping", + ) -# N_Qubit_Decomposition_Guided_Tree.gen_all_min_cnots(3); assert False -# N_Qubit_Decomposition_Guided_Tree.build_sequence(); assert False -# print(len(list(N_Qubit_Decomposition_Guided_Tree.enumerate_unordered_cnot_BFS(3, [(0,1),(1,2),])))); assert False class qgd_Wide_Circuit_Optimization: """Optimize wide (many-qubit) circuits via partitioning and subcircuit decomposition. Supports multiple decomposition strategies, optional global recombination (ILP), and routing when the circuit does not match the target topology. + """ def __init__(self, config): @@ -1312,6 +305,7 @@ def __init__(self, config): "TreeGuided", "qiskit", "bqskit", + "seqpam_PartAM", ] if not strategy in allowed_startegies: raise Exception( @@ -1390,7 +384,8 @@ def ConstructCircuitFromPartitions( def DecomposePartition( Umtx: np.ndarray, config: dict, mini_topology=None, structure=None ) -> list[tuple[Circuit, np.ndarray]]: - """Decompose a unitary ``Umtx`` (e.g. from a partition) using ``config['strategy']``. + """ + Decompose a unitary ``Umtx`` (e.g. from a partition) using ``config['strategy']``. Args: Umtx: Complex unitary matrix. @@ -1399,11 +394,7 @@ def DecomposePartition( structure: Required gate structure when ``strategy == "Custom"``. Returns: - Normally ``[(circuit, parameters)]`` on success, or ``[]`` if the - decomposition error exceeds ``tolerance``. If - ``config.get('stop_first_solution')`` is false, returns - ``cDecompose.all_solutions`` from the underlying decomposer instead of - a single best pair. + List of ``(squander_circuit, parameters)`` on success, or ``[]`` if error exceeds tolerance. 
""" strategy = config["strategy"] if strategy == "TreeSearch": @@ -1421,10 +412,6 @@ def DecomposePartition( level_limit_min=1, topology=mini_topology, ) - elif strategy == "TreeGuided": - cDecompose = N_Qubit_Decomposition_Guided_Tree( - Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology - ) elif strategy == "Custom": cDecompose = N_Qubit_Decomposition_custom( Umtx.conj().T, config=config, accelerator_num=0 @@ -1469,7 +456,7 @@ def DecomposePartition( parameters = cDecompose.get_Optimized_Parameters() err = cDecompose.Optimization_Problem(parameters) it += 1 - if err > tolerance or it != 0: + if (err > tolerance or it != 0) and config.get("verbosity", 0) >= 1: print("Decomposition error: ", err, it) else: err = cDecompose.get_Decomposition_Error() @@ -1486,15 +473,25 @@ def CompareAndPickCircuits( parameter_arrs: List[np.ndarray], metric: Callable[[Circuit], int] = CNOTGateCount, ) -> tuple[Circuit, np.ndarray]: - """Select the circuit with the lowest ``metric`` value. + """ + Call to pick the most optimal circuit corresponding a specific metric. Looks for the circuit + with the minimal metric value. + Args: - circs: Candidate Squander circuits (same length as ``parameter_arrs``). - parameter_arrs: Parameter vectors aligned with ``circs``. - metric: Scalar cost functional; lower is better. Defaults to ``CNOTGateCount``. - Returns: - ``(best_circuit, best_parameters)`` for the minimizing index. + circs ( List[Circuit] ) A list of Squander circuits to be compared + + parameter_arrs ( List[np.ndarray] ) A list of parameter arrays associated with the sqaunder circuits + + metric (optional) The metric function to decide which input circuit is better. 
+ + + Return: + + Returns with the chosen circuit and the corresponding parameter array + + + """ if not isinstance(circs, list): @@ -1521,10 +518,8 @@ def PartitionDecompositionProcess( config: dict, structure=None, ) -> Tuple[Circuit, np.ndarray]: - """Decompose one partition subcircuit (multiprocessing-safe entry point). - - For ``TreeGuided`` on large registers, may recursively partition and - enumerate combinations before returning remapped results. + """ + Worker-friendly entry: decompose a single partition subcircuit via ``DecomposePartition``. Args: subcircuit: Subcircuit acting on a subset of the wide register. @@ -1533,8 +528,7 @@ def PartitionDecompositionProcess( structure: Optional fixed gate structure when ``strategy == "Custom"``. Returns: - Tuple of ``(decomposed_circuit, decomposed_parameters)`` pairs, each - remapped back to the original qubit indices of ``subcircuit``. + List of ``(Circuit, parameters)`` pairs (or empty list on failure), remapped to the original register. 
""" qbit_num_orig_circuit = subcircuit.get_Qbit_Num() @@ -1553,116 +547,16 @@ def PartitionDecompositionProcess( # remap the subcircuit to a smaller qubit register remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num) - if ( - qbit_num > 3 - and structure is None - and config.get("strategy", "") == "TreeGuided" - ): - circo = Circuit(qbit_num) - for gate in remapped_subcircuit.get_Gates(): - circo.add_Gate(gate) - remapped_subcircuit = circo - partitioned_circuit, params, recombine_info, _ = ( - qgd_Wide_Circuit_Optimization.make_all_partition_circuit( - remapped_subcircuit, subcircuit_parameters, 3 - ) - ) - optimized_circuits = [] - subcircs = partitioned_circuit.get_Gates() - # first find the optimal CNOT decomposition - for innercirc in subcircs: - start_idx = innercirc.get_Parameter_Start_Index() - innercirc_parameters = params[ - start_idx : start_idx + innercirc.get_Parameter_Num() - ] - callback_fnc = ( - lambda x: qgd_Wide_Circuit_Optimization.CompareAndPickCircuits( - [innercirc, *(z[0] for z in x)], - [innercirc_parameters, *(z[1] for z in x)], - ) - ) - optimized_circuits.append( - callback_fnc( - qgd_Wide_Circuit_Optimization.PartitionDecompositionProcess( - innercirc, - innercirc_parameters, - { - **config, - "stop_first_solution": True, - "tree_level_max": max( - 0, CNOTGateCount(subcircuit, 0) - 1 - ), - }, - structure=None, - ) - ) - ) - parts, struct_idxs = ( - qgd_Wide_Circuit_Optimization.recombine_all_partition_circuit( - remapped_subcircuit, - [x[0] for x in optimized_circuits], - params, - recombine_info, - ) - ) - # enumerate all solutions for each subcircuit in the optimal - all_sol_for_idx = [] - for idx in struct_idxs: - innercirc = subcircs[idx] - start_idx = innercirc.get_Parameter_Start_Index() - innercirc_parameters = params[ - start_idx : start_idx + innercirc.get_Parameter_Num() - ] - callback_fnc = lambda x: x + [(innercirc, innercirc_parameters)] - all_sol_for_idx.append( - callback_fnc( - 
qgd_Wide_Circuit_Optimization.PartitionDecompositionProcess( - innercirc, - innercirc_parameters, - { - **config, - "stop_first_solution": False, - "tree_level_max": max(0, CNOTGateCount(subcircuit, 0)), - }, - structure=None, - ) - ) - ) - all_decomposed = [] - import itertools + if not structure is None: + structure = structure.Remap_Qbits(qbit_map, qbit_num) - opt = qgd_Wide_Circuit_Optimization({**config, "max_partition_size": 3}) - if np.prod([len(x) for x in all_sol_for_idx]) > 32: - import random + # get the unitary representing the circuit + unitary = remapped_subcircuit.get_Matrix(subcircuit_parameters) - trycombs = [ - [random.choice(x) for x in all_sol_for_idx] for _ in range(32) - ] - else: - trycombs = itertools.product(*all_sol_for_idx) - for combination in trycombs: - structures = [ - qgd_Wide_Circuit_Optimization.copy_circuit_structure(x[0]) - for x in combination - ] - optcirc, optparams = opt._OptimizeWideCircuit( - remapped_subcircuit, subcircuit_parameters, False, parts, structures - ) - reoptcirc, reoptparams = opt._OptimizeWideCircuit( - optcirc.get_Flat_Circuit(), optparams - ) - all_decomposed.append((reoptcirc.get_Flat_Circuit(), reoptparams)) - else: - if not structure is None: - structure = structure.Remap_Qbits(qbit_map, qbit_num) - - # get the unitary representing the circuit - unitary = remapped_subcircuit.get_Matrix(subcircuit_parameters) - - # decompose a small unitary into a new circuit - all_decomposed = qgd_Wide_Circuit_Optimization.DecomposePartition( - unitary, config, mini_topology, structure=structure - ) + # decompose a small unitary into a new circuit + all_decomposed = qgd_Wide_Circuit_Optimization.DecomposePartition( + unitary, config, mini_topology, structure=structure + ) # create inverse qbit map: inverse_qbit_map = {} for key, value in qbit_map.items(): @@ -1690,15 +584,7 @@ def PartitionDecompositionProcess( @staticmethod def build_partition_topo_deps(allparts): - """Order partition gate-sets by dependencies and build 
a reverse-dependency map. - - Args: - allparts: List of sets of gate indices, one per partition. - - Returns: - ``(ordered_parts, rg_new)`` where ``ordered_parts`` lists partitions in - topological order and ``rg_new`` maps each new index to predecessors. - """ + """Topological sort of partition gate-sets; returns ordered partitions and reverse-dependency map.""" gate_to_parts = {} for i, part in enumerate(allparts): for gate in part: @@ -1812,15 +698,7 @@ def make_all_partition_circuit(circ, orig_parameters, max_partition_size): @staticmethod def strip_single_qubit_head_tails(circ, params): - """Drop single-qubit gates that sit only at the head or tail of the dependency DAG. - - Args: - circ: Input circuit. - params: Flat parameter array for ``circ``. - - Returns: - ``(new_circuit, new_params)`` with head/tail single-qubit gates removed. - """ + """Remove single-qubit gates that are purely at the head/tail of the dependency graph.""" gate_dict, g, rg, gate_to_qubit, _ = build_dependency(circ) newcirc = Circuit(circ.get_Qbit_Num()) new_params = [] @@ -1839,15 +717,7 @@ def strip_single_qubit_head_tails(circ, params): @staticmethod def get_fingerprint(circ, params): - """Hashable signature of gate layout and parameters (for decomposition caching). - - Args: - circ: Squander circuit. - params: Parameter array associated with ``circ``. - - Returns: - Tuple usable as a dict key for memoizing decompositions. - """ + """Hashable signature of gate types, qubits, and parameters (for decomposition caching).""" return tuple( (gate.get_Name(), tuple(gate.get_Involved_Qbits())) for gate in circ.get_Gates() @@ -1857,16 +727,10 @@ def get_fingerprint(circ, params): def recombine_all_partition_circuit( circ, optimized_subcircuits, optimized_parameter_list, recombine_info ): - """Reorder optimized partitions to respect global gate dependencies. - - Args: - circ: Original flat circuit (for topological ordering context). 
- optimized_subcircuits: One optimized subcircuit per partition slot. - optimized_parameter_list: Parameter lists aligned with ``optimized_subcircuits``. - recombine_info: Tuple from ``make_all_partition_circuit`` (ILP metadata). + """Reorder partition results to satisfy global dependencies. - Returns: - ``(reordered_circuits, reordered_parameter_lists)`` in execution order. + Uses ILP-based ordering and a final topological sort, then returns + reordered subcircuits and parameter arrays aligned by structure index. """ from squander.partitioning.ilp import ( topo_sort_partitions, @@ -1918,13 +782,15 @@ def OptimizeWideCircuit( circ, self.config["topology"] ): - print("fixing topology in the circuit") + if self.config["verbosity"] >= 1: + print("fixing topology in the circuit") topo = self.config["topology"] self.config["topology"] = None strat = self.config["strategy"] self.config["strategy"] = self.config["pre-opt-strategy"] - print("Optimizing circuit with all-to-all (a2a) connectivity") + if self.config["verbosity"] >= 1: + print("Optimizing circuit with all-to-all (a2a) connectivity") circ, parameters = self.OptimizeWideCircuit(circ, parameters) self.config["all_to_all_optimization_time"] = self.config[ "optimization_time" @@ -1935,17 +801,20 @@ def OptimizeWideCircuit( self.config["topology"] = topo start_time = time.time() - print("Routing circuit to fix the topology") + if self.config["verbosity"] >= 1: + print("Routing circuit to fix the topology") circ, parameters = self.route_circuit(circ, parameters) self.config["routing_time"] = time.time() - start_time self.config["routed_circuit"] = circ self.config["routed_parameters"] = parameters else: - print("No additional routing is needed on the circuit") + if self.config["verbosity"] >= 1: + print("No additional routing is needed on the circuit") start_time = time.time() if self.config["strategy"] == "bqskit": - print("Optimizing circuit with BQSkit") + if self.config["verbosity"] >= 1: + print("Optimizing 
circuit with BQSkit") from squander import Qiskit_IO from bqskit import compile @@ -1989,7 +858,7 @@ def OptimizeWideCircuit( LogErrorPass(), ] - with Compiler() as compiler: + with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler: routed_bqskit_circ, pass_data = compiler.compile( bqskit_circ, compilation_workflow, True ) @@ -2009,12 +878,61 @@ def OptimizeWideCircuit( qgd_Wide_Circuit_Optimization.check_valid_routing( newcirc, self.config["topology"] ) - print("OptimizeWideCircuit::check_compare_circuits") + if self.config["verbosity"] >= 2: + print("OptimizeWideCircuit::check_compare_circuits") + self.check_compare_circuits(circ, parameters, newcirc, newparameters) + circ, parameters = newcirc, newparameters + + elif self.config["strategy"] == "seqpam_PartAM": + if self.config["verbosity"] >= 1: + print("Optimizing circuit with BQSKit SeqPAM + Squander (PartAM ILP weights)") + from squander import Qiskit_IO + from bqskit.compiler import Compiler + from bqskit.compiler.machine import MachineModel + from bqskit.ir.lang.qasm2 import OPENQASM2Language + from bqskit.passes import SetModelPass + from qiskit import qasm2, QuantumCircuit + + strategy_map = {"TreeSearch": "Tree_search", "TabuSearch": "Tabu_search"} + squander_config = { + "strategy": strategy_map.get(self.config.get("strategy", "TreeSearch"), "Tree_search"), + "optimization_tolerance": self.config.get("tolerance", 1e-8), + "verbosity": self.config.get("verbosity", 0), + "optimizer_engine": self.config.get("optimizer_engine", "BFGS"), + "Cost_Function_Variant": self.config.get("Cost_Function_Variant", 3), + "size_density_weight": True, + "sparse_penalty": self.config.get("sparse_penalty", 3.0), + "max_partition_size": self.max_partition_size, + } + block_size = self.max_partition_size + + model = MachineModel(circ.get_Qbit_Num(), self.config["topology"]) + circo = Qiskit_IO.get_Qiskit_Circuit(circ, parameters) + bqskit_circ = 
OPENQASM2Language().decode(qasm2.dumps(circo)) + + workflow = generate_squander_seqpam(squander_config, block_size) + + with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler: + routed_bqskit_circ = compiler.compile( + bqskit_circ, [SetModelPass(model), workflow] + ) + + circuit_qiskit = QuantumCircuit.from_qasm_str( + OPENQASM2Language().encode(routed_bqskit_circ) + ) + newcirc, newparameters = Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit) + + qgd_Wide_Circuit_Optimization.check_valid_routing( + newcirc, self.config["topology"] + ) + if self.config["verbosity"] >= 2: + print("OptimizeWideCircuit::check_compare_circuits") self.check_compare_circuits(circ, parameters, newcirc, newparameters) circ, parameters = newcirc, newparameters elif self.config["strategy"] == "qiskit": - print("Optimizing circuit with Qiskit") + if self.config["verbosity"] >= 1: + print("Optimizing circuit with Qiskit") from squander import Qiskit_IO from qiskit import transpile from qiskit.transpiler import CouplingMap @@ -2045,18 +963,16 @@ def OptimizeWideCircuit( qgd_Wide_Circuit_Optimization.check_valid_routing( newcirc, self.config["topology"] ) - print("OptimizeWideCircuit::check_compare_circuits") + if self.config["verbosity"] >= 2: + print("OptimizeWideCircuit::check_compare_circuits") self.check_compare_circuits(circ, parameters, newcirc, newparameters) circ, parameters = newcirc, newparameters else: - print("Optimizing circuit with Squander") + if self.config["verbosity"] >= 1: + print("Optimizing circuit with Squander") part_size_start = self.max_partition_size - part_size_end = self.max_partition_size - if self.config.get("use_osr", False) or self.config.get( - "use_graph_search", False - ): - part_size_end = min(4, circ.get_Qbit_Num()) + part_size_end = self.config.get("part_size_end",self.max_partition_size) count = CNOTGateCount(circ, 0) fingerprint_dict = {} for max_part_size in range(part_size_start, part_size_end + 1): @@ 
-2126,7 +1042,7 @@ def InnerOptimizeWideCircuit( in_parent = parent_process() is not None - if not in_parent: + if not in_parent and self.config["verbosity"] >= 1: print(len(subcircuits), "partitions found to optimize") # the list of optimized subcircuits @@ -2147,7 +1063,7 @@ def process_result(partition_idx): if optimized_subcircuits[partition_idx] is not None: return subcircuit = subcircuits[partition_idx] - # callback on the master process to compare the decomposed and original subcircuit + # callback function done on the master process to compare the new decomposed and the original suncircuit start_idx = subcircuit.get_Parameter_Start_Index() subcircuit_parameters = parameters[ start_idx : start_idx + subcircuit.get_Parameter_Num() @@ -2173,7 +1089,7 @@ def process_result(partition_idx): else async_results[partition_idx].get(timeout=None) ) - if subcircuit != new_subcircuit: + if subcircuit != new_subcircuit and self.config["verbosity"] >= 2: print( "original subcircuit: ", subcircuit.get_Gate_Nums(), @@ -2197,14 +1113,16 @@ def process_result(partition_idx): trim_subcirc, trim_parameters ) ] = (trim_subcirc, trim_parameters) - if total_opt[0] % 100 == 99: + if total_opt[0] % 100 == 99 and self.config["verbosity"] >= 1: print(total_opt[0] + 1, "partitions optimized") total_opt[0] += 1 optimized_subcircuits[partition_idx] = new_subcircuit optimized_parameter_list[partition_idx] = new_parameters with ( - contextlib.nullcontext() if in_parent else Pool(processes=mp.cpu_count()) + contextlib.nullcontext() + if in_parent + else Pool(processes=len(os.sched_getaffinity(0)) if hasattr(os, 'sched_getaffinity') else mp.cpu_count()) ) as pool: remaining = list(range(len(subcircuits))) while remaining: @@ -2272,9 +1190,10 @@ def process_result(partition_idx): (subcircuit, subcircuit_parameters, config, None), ) # print("Dispatching", subcircuit.get_Involved_Qubits(), "qubits with", CNOGateCount(subcircuit, 0), "CNOT gates, partition ", partition_idx) - assert pool is not 
None async_results[partition_idx] = ( - fargs if in_parent else pool.apply_async(*fargs) + fargs + if in_parent + else pool.apply_async(*fargs) ) if len(remaining) == len(still_remaining): time.sleep(0.1) @@ -2283,7 +1202,7 @@ def process_result(partition_idx): for partition_idx in range(len(subcircuits)): process_result(partition_idx) - # construct the wide circuit from the optimized subcircuits + # construct the wide circuit from the optimized suncircuits if global_min: optimized_subcircuits, optimized_parameter_list = ( qgd_Wide_Circuit_Optimization.recombine_all_partition_circuit( @@ -2305,14 +1224,15 @@ def process_result(partition_idx): cast(List[List[np.ndarray]], optimized_parameter_list), ) - if not in_parent: + if not in_parent and self.config["verbosity"] >= 1: print("original circuit: ", circ.get_Gate_Nums()) print("reoptimized circuit: ", wide_circuit.get_Gate_Nums()) qgd_Wide_Circuit_Optimization.check_valid_routing( wide_circuit, self.config["topology"] ) - print("InnerOptimizeWideCircuit: check_compare_circuits") + if self.config["verbosity"] >= 2: + print("InnerOptimizeWideCircuit: check_compare_circuits") self.check_compare_circuits( circ, orig_parameters, wide_circuit, wide_parameters ) @@ -2354,16 +1274,15 @@ def lattice_topology(x_qbits, y_qbits): @staticmethod def heavy_hexagonal_topology(rows, cols): - """Build a finite heavy-hex coupling list (honeycomb with subdivided edges). + """ + Finite heavy-hex patch. - Args: - rows: Number of rows in the brick-wall honeycomb patch. - cols: Number of columns in the patch. + rows, cols describe the underlying honeycomb 'brick-wall' patch. + The first rows*cols qubits are the original honeycomb vertices. + Every original edge gets one inserted degree-2 qubit. Returns: - List of undirected edges ``(u, v)``. The first ``rows * cols`` qubit - indices are honeycomb vertices; each original edge introduces one - additional degree-2 qubit on the subdivided link. 
+ list[(u, v)] undirected couplers """ def vid(r, c): @@ -2446,26 +1365,9 @@ def check_valid_routing(wide_circuit, topo): ), "Final circuit contains gates that do not respect the routing constraints." def check_compare_circuits( - self, - circ, - orig_parameters, - wide_circuit, - wide_parameters, - routing=False, - forced_test=False, + self, circ, orig_parameters, wide_circuit, wide_parameters, routing=False, forced_test=False, ): - """Optionally verify equivalence of ``circ`` and ``wide_circuit`` via ``CompareCircuits``. - - Args: - circ: Original circuit. - orig_parameters: Parameters for ``circ``. - wide_circuit: Optimized or routed circuit. - wide_parameters: Parameters for ``wide_circuit``. - routing: If true and initial/final mappings exist in ``self.config``, - pass them to ``CompareCircuits`` for layout-aware comparison. - forced_test: If true, run the comparison even when ``test_final_circuit`` - is false in config. - """ + """If ``test_final_circuit``, numerically compare unitaries (optional initial/final layout for routing).""" if self.config["test_final_circuit"] or forced_test: if ( routing @@ -2485,23 +1387,59 @@ def check_compare_circuits( CompareCircuits(circ, orig_parameters, wide_circuit, wide_parameters) def route_circuit(self, circ: Circuit, orig_parameters: np.ndarray): - """Map ``circ`` onto ``self.config['topology']`` using the configured router. 
+ """Map ``circ`` onto ``self.config['topology']`` using BQSKit SeQPAM, Qiskit SABRE, or Squander SABRE.""" + strategy = self.config.get("routing-strategy", "seqpam-ilp") + + if strategy == "seqpam-ilp": + from squander import Qiskit_IO + from squander.decomposition.qgd_Wide_Circuit_Optimization import generate_squander_seqpam + from bqskit.compiler import Compiler + from bqskit.compiler.machine import MachineModel + from bqskit.ir.lang.qasm2 import OPENQASM2Language + from bqskit.passes import SetModelPass + from qiskit import qasm2, QuantumCircuit - The strategy is ``self.config['routing-strategy']``, e.g. ``seqpam-ilp``, - ``seqpam-quick``, ``bqskit-sabre``, ``light-sabre`` (Qiskit), or ``sabre`` - (Squander). Writes ``initial_mapping`` and ``final_mapping`` into - ``self.config`` when the backend provides them. + model = MachineModel(circ.get_Qbit_Num(), self.config["topology"]) + circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters) + bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo)) - Args: - circ: Circuit before routing. - orig_parameters: Parameter vector for ``circ``. + strategy_map = {"TreeSearch": "Tree_search", "TabuSearch": "Tabu_search"} + squander_config = { + "strategy": strategy_map.get(self.config.get("strategy", "TreeSearch"), "Tree_search"), + "optimization_tolerance": self.config.get("tolerance", 1e-8), + "verbosity": self.config.get("verbosity", 0), + "optimizer_engine": self.config.get("optimizer_engine", "BFGS"), + "Cost_Function_Variant": self.config.get("Cost_Function_Variant", 3), + "size_density_weight": True, + "sparse_penalty": self.config.get("sparse_penalty", 3.0), + "max_partition_size": self.max_partition_size, + } + block_size = self.max_partition_size - Returns: - ``(routed_circuit, routed_parameters)`` laid out for ``self.config['topology']``. 
- """ - strategy = self.config.get("routing-strategy", "seqpam-ilp") + workflow = generate_squander_seqpam(squander_config, block_size) + + with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler: + routed_bqskit_circ, pass_data = compiler.compile( + bqskit_circ, [SetModelPass(model), workflow], True + ) - if strategy in ("seqpam-ilp", "seqpam-quick", "bqskit-sabre"): + circuit_qiskit_routed = QuantumCircuit.from_qasm_str( + OPENQASM2Language().encode(routed_bqskit_circ) + ) + Squander_remapped_circuit, parameters_remapped_circuit = ( + Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_routed) + ) + Squander_remapped_circuit = Squander_remapped_circuit.Remap_Qbits( + {i: j for i, j in enumerate(pass_data.placement)} + ) + self.config["initial_mapping"] = list( + pass_data.placement[x] for x in pass_data.initial_mapping + ) + self.config["final_mapping"] = list( + pass_data.placement[x] for x in pass_data.final_mapping + ) + + elif strategy in ("seqpam-quick", "bqskit-sabre"): from squander import Qiskit_IO from bqskit import Circuit as BQSKitCircuit, compile from bqskit.compiler import Compiler @@ -2564,14 +1502,6 @@ async def run(self, circuit: BQSKitCircuit, data=None): mainflow = build_seqpam_mapping_optimization_workflow( block_size=self.config["max_partition_size"] ) - if strategy == "seqpam-ilp": - for curpass in mainflow._passes: - if isinstance(curpass, IfThenElsePass): - for i in range(len(curpass.on_true._passes)): - if isinstance(curpass.on_true._passes[i], QuickPartitioner): - curpass.on_true._passes[i] = SquanderPartitioner( - self.config["max_partition_size"] - ) routing_workflow = [ SetModelPass(model), # attach hardware model to circuit @@ -2585,7 +1515,7 @@ async def run(self, circuit: BQSKitCircuit, data=None): ), # SABRE-style routing ] - with Compiler() as compiler: + with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler: routed_bqskit_circ, pass_data = 
compiler.compile( bqskit_circ, routing_workflow, True ) @@ -2607,6 +1537,53 @@ async def run(self, circuit: BQSKitCircuit, data=None): pass_data.placement[x] for x in pass_data.final_mapping ) + elif strategy == "seqpam_partam": + from squander import Qiskit_IO + from squander.decomposition.qgd_Wide_Circuit_Optimization import generate_squander_seqpam + from bqskit.compiler import Compiler + from bqskit.compiler.machine import MachineModel + from bqskit.ir.lang.qasm2 import OPENQASM2Language + from bqskit.passes import SetModelPass + from qiskit import qasm2, QuantumCircuit + + model = MachineModel(circ.get_Qbit_Num(), self.config["topology"]) + circo = Qiskit_IO.get_Qiskit_Circuit(circ, orig_parameters) + bqskit_circ = OPENQASM2Language().decode(qasm2.dumps(circo)) + + squander_config = { + 'strategy': 'Tree_search', + 'optimization_tolerance': self.config.get('tolerance', 1e-8), + 'verbosity': self.config.get('verbosity', 0), + 'optimizer_engine': self.config.get('optimizer_engine', 'BFGS'), + 'size_density_weight': True, + 'sparse_penalty': self.config.get('sparse_penalty', 3.0), + 'max_partition_size': self.max_partition_size, + 'use_osr':0, + 'use_graph_search':0, + } + workflow = generate_squander_seqpam(squander_config, self.max_partition_size) + + with Compiler(num_workers=int(self.config.get("num_workers", _affinity_num_workers()))) as compiler: + routed_bqskit_circ, pass_data = compiler.compile( + bqskit_circ, [SetModelPass(model), workflow], True + ) + + circuit_qiskit_routed = QuantumCircuit.from_qasm_str( + OPENQASM2Language().encode(routed_bqskit_circ) + ) + Squander_remapped_circuit, parameters_remapped_circuit = ( + Qiskit_IO.convert_Qiskit_to_Squander(circuit_qiskit_routed) + ) + Squander_remapped_circuit = Squander_remapped_circuit.Remap_Qbits( + {i: j for i, j in enumerate(pass_data.placement)} + ) + self.config["initial_mapping"] = list( + pass_data.placement[x] for x in pass_data.initial_mapping + ) + self.config["final_mapping"] = list( + 
pass_data.placement[x] for x in pass_data.final_mapping + ) + elif strategy == "light-sabre": from squander import Qiskit_IO from qiskit import transpile @@ -2674,7 +1651,8 @@ async def run(self, circuit: BQSKitCircuit, data=None): Squander_remapped_circuit, self.config["topology"] ) - print("cheking circuit after routing") + if self.config["verbosity"] >= 2: + print("cheking circuit after routing") self.check_compare_circuits( circ, orig_parameters, diff --git a/squander/gates/gates_Wrapper.cpp b/squander/gates/gates_Wrapper.cpp index debfa4a57..58b2a4bb7 100644 --- a/squander/gates/gates_Wrapper.cpp +++ b/squander/gates/gates_Wrapper.cpp @@ -61,6 +61,7 @@ along with this program. If not, see http://www.gnu.org/licenses/. #include "SWAP.h" #include "CSWAP.h" #include "numpy_interface.h" +#include "Permutation.h" #include "RXX.h" #include "RYY.h" #include "RZZ.h" @@ -79,7 +80,6 @@ typedef struct { - template Gate* create_gate( int qbit_num, int target_qbit ) { GateT* gate = new GateT( qbit_num, target_qbit ); @@ -126,6 +126,11 @@ Gate* create_multi_target_controlled_gate( int qbit_num, const std::vector& } +Gate* create_permutation_gate( int qbit_num, const std::vector& pattern ) { + Permutation* gate = new Permutation( qbit_num, pattern ); + return static_cast( gate ); +} + /** @brief Method called when a python instance of the class Gate_Wrapper is destroyed @@ -143,6 +148,7 @@ static void } + /** @brief Method called when a python instance of the class qgd_CH_Wrapper is allocated @param type A pointer pointing to a structure describing the type of the class qgd_CH_Wrapper. 
@@ -487,6 +493,112 @@ static PyObject * } +template +static PyObject * + permutation_gate_Wrapper_new(PyTypeObject *type, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = {(char*)"qbit_num", (char*)"pattern", NULL}; + int qbit_num = -1; + PyObject* pattern_py = NULL; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|iO", kwlist, &qbit_num, &pattern_py)) { + std::string err("Unable to parse arguments"); + PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } + + if (qbit_num == -1 || pattern_py == NULL) { + PyErr_SetString(PyExc_ValueError, "qbit_num and pattern must be provided!"); + return NULL; + } + + // Convert tuple to list if necessary, or check if it's a list + PyObject* pattern_list = NULL; + bool created_list = false; + if (PyTuple_Check(pattern_py)) { + pattern_list = PySequence_List(pattern_py); + if (pattern_list == NULL) { + PyErr_SetString(PyExc_TypeError, "Failed to convert tuple to list"); + return NULL; + } + created_list = true; // We created it, so we need to DECREF + } else if (PyList_Check(pattern_py)) { + pattern_list = pattern_py; + // We're borrowing the reference, no need to INCREF/DECREF + } else { + PyErr_SetString(PyExc_TypeError, "pattern must be a list or tuple!"); + return NULL; + } + + std::vector pattern; + Py_ssize_t pattern_size = PyList_Size(pattern_list); + + // Check pattern size matches qbit_num + if (pattern_size != qbit_num) { + if (created_list) { + Py_DECREF(pattern_list); + } + std::string err = "Pattern size " + std::to_string(pattern_size) + + " does not match qubit number " + std::to_string(qbit_num); + PyErr_SetString(PyExc_ValueError, err.c_str()); + return NULL; + } + + // Track which values we've seen to validate it's a permutation + std::vector seen(qbit_num, false); + + for (Py_ssize_t i = 0; i < pattern_size; i++) { + PyObject* item = PyList_GetItem(pattern_list, i); + if (!PyLong_Check(item)) { + if (created_list) { + Py_DECREF(pattern_list); + } + PyErr_SetString(PyExc_TypeError, 
"pattern must contain integers!"); + return NULL; + } + int qbit = PyLong_AsLong(item); + if (qbit < 0 || qbit >= qbit_num) { + if (created_list) { + Py_DECREF(pattern_list); + } + std::string err = "Pattern qubit index " + std::to_string(qbit) + + " out of range [0, " + std::to_string(qbit_num - 1) + "]"; + PyErr_SetString(PyExc_ValueError, err.c_str()); + return NULL; + } + if (seen[qbit]) { + if (created_list) { + Py_DECREF(pattern_list); + } + std::string err = "Pattern contains duplicate value " + std::to_string(qbit); + PyErr_SetString(PyExc_ValueError, err.c_str()); + return NULL; + } + seen[qbit] = true; + pattern.push_back(qbit); + } + + // Release the pattern_list reference (only if we created it from a tuple) + if (created_list) { + Py_DECREF(pattern_list); + } + + Gate_Wrapper *self; + self = (Gate_Wrapper *) type->tp_alloc(type, 0); + if (self != NULL) { + try { + self->gate = create_permutation_gate(qbit_num, pattern); + } catch (const std::string& e) { + PyErr_SetString(PyExc_ValueError, e.c_str()); + return NULL; + } catch (const std::exception& e) { + PyErr_SetString(PyExc_ValueError, e.what()); + return NULL; + } + } + + return (PyObject *) self; +} /** @brief Method called when a python instance of a non-controlled gate class is initialized @param self A pointer pointing to an instance of the class Gate_Wrapper. 
@@ -672,7 +784,6 @@ Gate_Wrapper_get_Matrix( Gate_Wrapper *self, PyObject *args, PyObject *kwds ) { } } - /** @brief Call to apply the gate operation from the right side on an input state or matrix */ @@ -2428,8 +2539,139 @@ Gate_Wrapper_getstate( Gate_Wrapper *self ) { } +static PyObject * Gate_Wrapper_get_Pattern( Gate_Wrapper *self ) { + std::vector pattern; + try { + // Cast to Permutation* to access pattern methods + Permutation* perm_gate = dynamic_cast(self->gate); + if (perm_gate == nullptr) { + PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate"); + return NULL; + } + pattern = perm_gate->get_pattern(); + } + catch (std::string err) { + PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } + catch(...) { + std::string err( "Invalid pointer to gate class"); + PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } + PyObject* pattern_py = PyList_New(pattern.size()); + for (size_t i = 0; i < pattern.size(); i++) { + PyList_SetItem(pattern_py, i, Py_BuildValue("i", pattern[i])); + } + return pattern_py; +} +static PyObject * Gate_Wrapper_set_Pattern( Gate_Wrapper *self, PyObject *args ) { + PyObject* pattern_py = NULL; + if (!PyArg_ParseTuple(args, "O", &pattern_py)) { + std::string err("Unable to parse arguments"); + PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } + // Convert tuple to list if necessary, or check if it's a list + PyObject* pattern_list = NULL; + bool created_list = false; + if (PyTuple_Check(pattern_py)) { + pattern_list = PySequence_List(pattern_py); + if (pattern_list == NULL) { + PyErr_SetString(PyExc_TypeError, "Failed to convert tuple to list"); + return NULL; + } + created_list = true; // We created it, so we need to DECREF + } else if (PyList_Check(pattern_py)) { + pattern_list = pattern_py; + // We're borrowing the reference, no need to INCREF/DECREF + } else { + std::string err("Pattern must be a list or tuple!"); + PyErr_SetString(PyExc_TypeError, err.c_str()); + return NULL; + 
} + + // Cast to Permutation* to access pattern methods and get qbit_num + Permutation* perm_gate = dynamic_cast(self->gate); + if (perm_gate == nullptr) { + if (created_list) { + Py_DECREF(pattern_list); + } + PyErr_SetString(PyExc_TypeError, "Gate is not a Permutation gate"); + return NULL; + } + + int qbit_num = perm_gate->get_qbit_num(); + std::vector pattern; + Py_ssize_t pattern_size = PyList_Size(pattern_list); + + // Check pattern size matches qbit_num + if (pattern_size != qbit_num) { + if (created_list) { + Py_DECREF(pattern_list); + } + std::string err = "Pattern size " + std::to_string(pattern_size) + + " does not match qubit number " + std::to_string(qbit_num); + PyErr_SetString(PyExc_ValueError, err.c_str()); + return NULL; + } + + // Track which values we've seen to validate it's a permutation + std::vector seen(qbit_num, false); + + for (Py_ssize_t i = 0; i < pattern_size; i++) { + PyObject* item = PyList_GetItem(pattern_list, i); + if (!PyLong_Check(item)) { + if (created_list) { + Py_DECREF(pattern_list); + } + std::string err("Pattern must contain integers!"); + PyErr_SetString(PyExc_TypeError, err.c_str()); + return NULL; + } + int qbit = PyLong_AsLong(item); + if (qbit < 0 || qbit >= qbit_num) { + if (created_list) { + Py_DECREF(pattern_list); + } + std::string err = "Pattern qubit index " + std::to_string(qbit) + + " out of range [0, " + std::to_string(qbit_num - 1) + "]"; + PyErr_SetString(PyExc_ValueError, err.c_str()); + return NULL; + } + if (seen[qbit]) { + if (created_list) { + Py_DECREF(pattern_list); + } + std::string err = "Pattern contains duplicate value " + std::to_string(qbit); + PyErr_SetString(PyExc_ValueError, err.c_str()); + return NULL; + } + seen[qbit] = true; + pattern.push_back(qbit); + } + + // Release the pattern_list reference (only if we created it from a tuple) + if (created_list) { + Py_DECREF(pattern_list); + } + + try { + perm_gate->set_pattern(pattern); + } + catch (std::string err) { + 
PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } + catch(...) { + std::string err( "Invalid pointer to gate class"); + PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } + return Py_BuildValue("i", 0); +} /** @brief Call to set the state of quantum gate from a human-readable data serialized and pickle-able format @@ -2860,6 +3102,12 @@ extern "C" }, \ {"get_Name", (PyCFunction) Gate_Wrapper_get_Name, METH_NOARGS, \ "Method to get the name label of the gate" \ + }, \ + {"get_Pattern", (PyCFunction) Gate_Wrapper_get_Pattern, METH_NOARGS, \ + "Method to get the pattern of the permutation gate." \ + }, \ + {"set_Pattern", (PyCFunction) Gate_Wrapper_set_Pattern, METH_VARARGS, \ + "Method to set the pattern of the permutation gate." \ } static PyMethodDef Gate_Wrapper_methods[] = { @@ -2875,6 +3123,7 @@ static PyMethodDef Gate_Wrapper_methods[] = { }; + /** @brief Structure containing metadata about the members of class qgd_CH_Wrapper. */ @@ -2883,6 +3132,7 @@ static PyMemberDef Gate_Wrapper_members[] = { }; + struct Gate_Wrapper_Type_tmp : PyTypeObject { @@ -3044,6 +3294,8 @@ gate_wrapper_type_template(Tdg, Gate_Wrapper_new); gate_wrapper_type_template(R, Gate_Wrapper_new); +gate_wrapper_type_template(Permutation, permutation_gate_Wrapper_new); + @@ -3120,7 +3372,8 @@ PyInit_gates_Wrapper(void) PyType_Ready(&CCX_Wrapper_Type_ins) < 0 || PyType_Ready(&SWAP_Wrapper_Type_ins) < 0 || PyType_Ready(&CSWAP_Wrapper_Type_ins) < 0 || - PyType_Ready(&R_Wrapper_Type_ins) < 0 ) { + PyType_Ready(&R_Wrapper_Type_ins) < 0 || + PyType_Ready(&Permutation_Wrapper_Type_ins) < 0 ) { Py_DECREF(m); return NULL; @@ -3218,6 +3471,8 @@ PyInit_gates_Wrapper(void) Py_INCREF_template(CSWAP); + Py_INCREF_template(Permutation); + return m; } diff --git a/squander/gates/qgd_Circuit.py b/squander/gates/qgd_Circuit.py index eb259b4d0..6626f1549 100644 --- a/squander/gates/qgd_Circuit.py +++ b/squander/gates/qgd_Circuit.py @@ -80,6 +80,14 @@ def __init__(self, qbit_num): # 
call the constructor of the wrapper class super().__init__(qbit_num) + def copy(self): + """ + Create a deep copy of the circuit. + @return A new qgd_Circuit instance with all gates copied. + """ + # Call the C wrapper function that uses the clone() method + return super().copy() + def add_U1(self, target_qbit): """Add a U1 gate to the front of the gate structure. @@ -382,6 +390,18 @@ def add_CP(self, target_qbit, control_qbit): # call the C wrapper function super(qgd_Circuit, self).add_CP(target_qbit, control_qbit) +#@brief Call to add a Permutation gate to the front of the gate structure. +#@param self A pointer pointing to an instance of the class qgd_Circuit. +#@param Input arguments: pattern (list of int) - permutation pattern. + + def add_Permutation( self, pattern): + + # call the C wrapper function + super(qgd_Circuit, self).add_Permutation(pattern) + +#@brief Call to add a SWAP gate to the front of the gate structure. +#@param self A pointer pointing to an instance of the class qgd_Circuit. +#@param Input arguments: target_qbits (list of int) - list of target qubits (at least 2). def add_SWAP(self, target_qbits, target_qbit2=-1): """Add a SWAP gate to the front of the gate structure. @@ -674,12 +694,12 @@ def get_Qbits(self): return super().get_Qbits() - def set_min_fusion(self, min_fusion): - """Set the minimum fusion parameter in the circuit. + def get_Involved_Qbits(self): - Args: - min_fusion: Minimum fusion value (int) - """ + return super().get_Qbits() +#@brief Call to set hte min fusion in the circuit +#@param Input arguments: min_fusion + def set_min_fusion( self, min_fusion): super().set_min_fusion(min_fusion) diff --git a/squander/gates/qgd_Circuit_Wrapper.cpp b/squander/gates/qgd_Circuit_Wrapper.cpp index 22e500b87..17f79d545 100644 --- a/squander/gates/qgd_Circuit_Wrapper.cpp +++ b/squander/gates/qgd_Circuit_Wrapper.cpp @@ -55,6 +55,7 @@ along with this program. If not, see http://www.gnu.org/licenses/. 
#include "SXdg.h" #include "SYC.h" #include "Adaptive.h" +#include "Permutation.h" #include "RXX.h" #include "RYY.h" #include "RZZ.h" @@ -471,6 +472,49 @@ qgd_Circuit_Wrapper_add_CSWAP(qgd_Circuit_Wrapper *self, PyObject *args, PyObjec } +/** +@brief Wrapper function to add a Permutation gate to the front of the gate structure. +@param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper. +@param args A tuple of the input arguments: pattern (list of ints) +@param kwds A tuple of keywords +*/ +static PyObject * +qgd_Circuit_Wrapper_add_Permutation(qgd_Circuit_Wrapper *self, PyObject *args, PyObject *kwds) +{ + static char *kwlist[] = {(char*)"pattern", NULL}; + PyObject* pattern_py = NULL; + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwlist, &pattern_py)) + return Py_BuildValue("i", -1); + + if (pattern_py != NULL && PyList_Check(pattern_py)) { + std::vector pattern; + Py_ssize_t list_size = PyList_Size(pattern_py); + for (Py_ssize_t i = 0; i < list_size; i++) { + PyObject* item = PyList_GetItem(pattern_py, i); + pattern.push_back(PyLong_AsLong(item)); + } + if (pattern.size() == self->circuit->get_qbit_num()) { + try { + self->circuit->add_permutation(pattern); + } catch (const std::string& e) { + PyErr_SetString(PyExc_ValueError, e.c_str()); + return Py_BuildValue("i", -1); + } catch (const std::exception& e) { + PyErr_SetString(PyExc_ValueError, e.what()); + return Py_BuildValue("i", -1); + } catch (...) { + PyErr_SetString(PyExc_ValueError, "Unknown error occurred in add_permutation"); + return Py_BuildValue("i", -1); + } + } else { + std::string err = "Pattern size " + std::to_string(pattern.size()) + + " does not match circuit qubit number " + std::to_string(self->circuit->get_qbit_num()); + PyErr_SetString(PyExc_ValueError, err.c_str()); + return Py_BuildValue("i", -1); + } + } + return Py_BuildValue("i", 0); +} /** @brief Wrapper function to add a block of operations to the front of the gate structure. 
@param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper. @@ -824,7 +868,22 @@ qgd_Circuit_Wrapper_get_Matrix( qgd_Circuit_Wrapper *self, PyObject *args, PyObj // get the C++ wrapper around the data Matrix_real parameters_mtx = numpy2matrix_real(parameters_arr); - Matrix mtx = self->circuit->get_matrix(parameters_mtx); + Matrix mtx; + try { + mtx = self->circuit->get_matrix( parameters_mtx ); + } + catch (std::string err) { + Py_DECREF(parameters_arr); + PyErr_SetString(PyExc_Exception, err.c_str()); + std::cout << err << std::endl; + return NULL; + } + catch(...) { + Py_DECREF(parameters_arr); + std::string err( "Invalid pointer to circuit class or error in get_matrix"); + PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } // convert to numpy array mtx.set_owner(false); @@ -2423,6 +2482,33 @@ get_gate( Gates_block* circuit, int &idx ) { Py_DECREF( circuit_input ); } + else if (gate->get_type() == PERMUTATION_OPERATION) { + // Handle Permutation gate + PyObject* qgd_gate_Dict = PyModule_GetDict( qgd_gate ); + PyObject* py_gate_class = PyDict_GetItemString( qgd_gate_Dict, "Permutation"); + + // Get the pattern from the Permutation gate + Permutation* perm_gate = static_cast(gate); + std::vector pattern = perm_gate->get_pattern(); + + // Convert pattern to Python list + PyObject* pattern_list = PyList_New(pattern.size()); + for (size_t i = 0; i < pattern.size(); i++) { + PyList_SetItem(pattern_list, i, Py_BuildValue("i", pattern[i])); + } + + PyObject* gate_input = Py_BuildValue("(OO)", qbit_num, pattern_list); + py_gate = PyObject_CallObject(py_gate_class, gate_input); + + // replace dummy data with real gate data + qgd_Gate* py_gate_C = reinterpret_cast( py_gate ); + delete( py_gate_C->gate ); + py_gate_C->gate = static_cast( gate->clone() ); + + Py_DECREF( qgd_gate ); + Py_DECREF( gate_input ); + Py_DECREF( pattern_list ); + } else { Py_DECREF( qgd_gate ); @@ -2806,6 +2892,62 @@ qgd_Circuit_Wrapper_get_Flat_Circuit( 
qgd_Circuit_Wrapper *self ) { +/** +@brief Wrapper function to create a deep copy of the circuit. +@param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper. +@return Returns a new qgd_Circuit Python object that is a deep copy. +*/ +static PyObject * +qgd_Circuit_Wrapper_copy( qgd_Circuit_Wrapper *self ) { + + Gates_block* copied_circuit = NULL; + + try { + copied_circuit = self->circuit->clone(); + } + catch (std::string err) { + PyErr_SetString(PyExc_Exception, err.c_str()); + std::cout << err << std::endl; + return NULL; + } + catch(...) { + std::string err( "Invalid pointer to circuit class"); + PyErr_SetString(PyExc_Exception, err.c_str()); + return NULL; + } + + int qbit_num = copied_circuit->get_qbit_num(); + + // import gate operation modules + PyObject* qgd_circuit = PyImport_ImportModule("squander.gates.qgd_Circuit"); + + if ( qgd_circuit == NULL ) { + PyErr_SetString(PyExc_Exception, "Module import error: squander.gates.qgd_Circuit" ); + delete copied_circuit; + return NULL; + } + + PyObject* qgd_circuit_Dict = PyModule_GetDict( qgd_circuit ); + + // PyDict_GetItemString creates a borrowed reference to the item in the dict. 
Reference counting is not increased on this element, dont need to decrease the reference counting at the end + PyObject* py_circuit_class = PyDict_GetItemString( qgd_circuit_Dict, "qgd_Circuit"); + + PyObject* circuit_input = Py_BuildValue("(O)", Py_BuildValue("i", qbit_num) ); + PyObject* py_circuit = PyObject_CallObject(py_circuit_class, circuit_input); + + // replace dummy data with real gate data + qgd_Circuit_Wrapper* py_circuit_C = reinterpret_cast( py_circuit ); + + delete( py_circuit_C->circuit ); + py_circuit_C->circuit = copied_circuit; + + Py_DECREF( qgd_circuit ); + Py_DECREF( circuit_input ); + + return py_circuit; +} + + /** @brief Method to extract the stored quantum circuit in a human-readable data serialized and pickle-able format @param self A pointer pointing to an instance of the class qgd_Circuit_Wrapper @@ -3173,14 +3315,17 @@ static PyMethodDef qgd_Circuit_Wrapper_Methods[] = { {"add_CRY", (PyCFunction) qgd_Circuit_Wrapper_add_CRY, METH_VARARGS | METH_KEYWORDS, "Call to add a CRY gate to the front of the gate structure" }, + {"add_Permutation", (PyCFunction) qgd_Circuit_Wrapper_add_Permutation, METH_VARARGS | METH_KEYWORDS, + "Call to add a Permutation gate to the front of the gate structure" + }, {"add_CRX", (PyCFunction) qgd_Circuit_Wrapper_add_CRX, METH_VARARGS | METH_KEYWORDS, - "Call to add a CRY gate to the front of the gate structure" + "Call to add a CRX gate to the front of the gate structure" }, {"add_CRZ", (PyCFunction) qgd_Circuit_Wrapper_add_CRZ, METH_VARARGS | METH_KEYWORDS, - "Call to add a CRY gate to the front of the gate structure" + "Call to add a CRZ gate to the front of the gate structure" }, {"add_CP", (PyCFunction) qgd_Circuit_Wrapper_add_CP, METH_VARARGS | METH_KEYWORDS, - "Call to add a CRY gate to the front of the gate structure" + "Call to add a CP gate to the front of the gate structure" }, {"add_CCX", (PyCFunction) qgd_Circuit_Wrapper_add_CCX, METH_VARARGS | METH_KEYWORDS, "Call to add a CCX gate to the front of 
the gate structure" @@ -3283,6 +3428,9 @@ static PyMethodDef qgd_Circuit_Wrapper_Methods[] = { {"get_Children", (PyCFunction) qgd_Circuit_Wrapper_get_children, METH_VARARGS, "Method to get the list of child gate indices. Then the children gates can be obtained from the list of gates involved in the circuit." }, + {"copy", (PyCFunction) qgd_Circuit_Wrapper_copy, METH_NOARGS, + "Method to create a deep copy of the circuit." + }, {"__getstate__", (PyCFunction) qgd_Circuit_Wrapper_getstate, METH_NOARGS, "Method to extract the stored quantum circuit in a human-readable data serialized and pickle-able format." }, diff --git a/squander/partitioning/ilp.py b/squander/partitioning/ilp.py index e3ad3e3c1..9731247e6 100644 --- a/squander/partitioning/ilp.py +++ b/squander/partitioning/ilp.py @@ -539,6 +539,48 @@ def sol_to_badsccs(g, allparts, L): _, scc = scc_tarjan_iterative(G_part) return {frozenset(v) for v in scc if len(v) > 1} +def parts_to_overlap_scores(allparts, g, gate_to_qubit): + """ + Per-part tie-breaker weights from logical-qubit overlap with DAG-downstream + candidate parts. + + For each part i, score s[i] is the mean over candidate parts j reachable + from i in the gate DAG of |support(i) ∩ support(j)|. Returned weights are + `(s_max - s[i]) * eps` (lower is better — ILP minimizes), with eps small + enough that count-minimization in `ilp_global_optimal` is strictly + preserved when these weights are passed via `weights=`. + + Args: + allparts (list[frozenset[int]]): Candidate parts (gate sets). + g (dict[int, set[int]]): Contracted gate DAG (u -> successors v) as + returned by `get_all_partitions`. + gate_to_qubit (dict[int, set[int]]): Gate -> qubits acted on. + + Returns: + list[float]: weights[i] indexed like allparts, all in + [0, 1 / (len(allparts) * len(g))). 
+ """ + N = len(allparts) + if N == 0: return [] + _, reach = nuutila_reach_scc(g) + gate_to_parts = {gate: [] for gate in g} + for i, part in enumerate(allparts): + for gate in part: gate_to_parts[gate].append(i) + supports = [set.union(*(gate_to_qubit[v] for v in part)) for part in allparts] + scores = [0.0] * N + for i, part in enumerate(allparts): + dgates = set().union(*(reach[u] for u in part)) - part + if not dgates: continue + succ_idxs = set().union(*(gate_to_parts[v] for v in dgates)) + succ_idxs.discard(i) + if not succ_idxs: continue + sup_i = supports[i] + scores[i] = sum(len(sup_i & supports[j]) for j in succ_idxs) / len(succ_idxs) + s_max = max(scores) + if s_max == 0.0: return [0.0] * N + eps = 0.9 / (N * max(len(g), 1) * (s_max + 1.0)) + return [(s_max - s) * eps for s in scores] + def ilp_global_optimal(allparts, g, weighted_info=None, gurobi_direct=False, use_order=False, weights=None): """ Select an optimal set of non-overlapping parts via ILP/MIP with cycle cuts. diff --git a/squander/src-cpp/gates/Gate.cpp b/squander/src-cpp/gates/Gate.cpp index ba24471d7..8c03ff1ef 100644 --- a/squander/src-cpp/gates/Gate.cpp +++ b/squander/src-cpp/gates/Gate.cpp @@ -158,7 +158,9 @@ Gate::Gate(int qbit_num_in) { // number of qubits spanning the matrix of the operation qbit_num = qbit_num_in; // the size of the matrix - matrix_size = Power_of_2(qbit_num); + if (qbit_num<31){ + matrix_size = Power_of_2(qbit_num); + } // A string describing the type of the operation type = GENERAL_OPERATION; // The index of the qubit on which the operation acts (target_qbit >= 0) @@ -192,7 +194,10 @@ Gate::Gate(int qbit_num_in, const std::vector& target_qbits_in, const std:: // number of qubits spanning the matrix of the operation qbit_num = qbit_num_in; // the size of the matrix - matrix_size = Power_of_2(qbit_num); + if (qbit_num<31){ + matrix_size = Power_of_2(qbit_num); + } + // A string describing the type of the operation type = GENERAL_OPERATION; // The number of 
parameters diff --git a/squander/src-cpp/gates/Gates_block.cpp b/squander/src-cpp/gates/Gates_block.cpp index c974dad77..970f37db7 100644 --- a/squander/src-cpp/gates/Gates_block.cpp +++ b/squander/src-cpp/gates/Gates_block.cpp @@ -59,6 +59,8 @@ limitations under the License. #include "RZZ.h" #include "Adaptive.h" #include "Gates_block.h" +#include "Permutation.h" + #include "qgd_math.h" #ifdef _WIN32 @@ -1414,6 +1416,43 @@ void Gates_block::add_u3_to_front(int target_qbit) { } +/** +@brief Append a Permutation gate to the list of gates +@param pattern The pattern of the permutation +*/ +void Gates_block::add_permutation(const std::vector& pattern) { + // create the operation + try { + Gate* operation = static_cast(new Permutation( qbit_num, pattern )); + add_gate( operation ); + } catch (const std::string& e) { + // Re-throw as proper exception + throw std::runtime_error(e); + } catch (const std::exception& e) { + // Re-throw as-is + throw; + } +} + + +/** +@brief Add a Permutation gate to the front of the list of gates +@param pattern The pattern of the permutation +*/ +void Gates_block::add_permutation_to_front(const std::vector& pattern) { + // create the operation + try { + Gate* operation = static_cast(new Permutation( qbit_num, pattern )); + add_gate_to_front( operation ); + } catch (const std::string& e) { + // Re-throw as proper exception + throw std::runtime_error(e); + } catch (const std::exception& e) { + // Re-throw as-is + throw; + } +} + /** @brief Append a RX gate to the list of gates @param target_qbit The identification number of the targt qubit. 
(0 <= target_qbit <= qbit_num-1) @@ -3045,6 +3084,13 @@ Gates_block::create_remapped_circuit( const std::map& qbit_map, const break; } + case PERMUTATION_OPERATION: + { + Gate* cloned_op = op->clone(); + cloned_op->set_qbit_num( qbit_num_ ); + ret->add_gate( cloned_op ); + break; + } default: std::string err("Gates_block::create_remapped_circuit: unimplemented gate"); throw err; @@ -3277,7 +3323,7 @@ int Gates_block::extract_gates( Gates_block* op_block ) { case CH_OPERATION: case SYC_OPERATION: case U1_OPERATION: case U2_OPERATION: case U3_OPERATION: case CP_OPERATION: - case RY_OPERATION: case CRY_OPERATION: + case RY_OPERATION: case CRY_OPERATION: case PERMUTATION_OPERATION: case CRX_OPERATION: case CRZ_OPERATION: case RX_OPERATION: case CR_OPERATION: case RZ_OPERATION: case X_OPERATION: diff --git a/squander/src-cpp/gates/Permutation.cpp b/squander/src-cpp/gates/Permutation.cpp new file mode 100644 index 000000000..4d9b98d49 --- /dev/null +++ b/squander/src-cpp/gates/Permutation.cpp @@ -0,0 +1,267 @@ +/* +Created on Fri Jun 26 14:13:26 2020 +Copyright 2020 Peter Rakyta, Ph.D. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author: Peter Rakyta, Ph.D. +*/ +/*! \file Permutation.cpp + \brief Class for the representation of Permutation gate. 
+*/ +#include "Permutation.h" +#include "apply_dedicated_gate_kernel_to_input.h" +#include "common.h" + +Permutation::Permutation(){ + name = "Permutation"; + type = PERMUTATION_OPERATION; + target_qbits.clear(); + control_qbits.clear(); + parameter_num = 0; + cycles_cache_valid = false; + cycles_cache_matrix_size = 0; +} + +Permutation::Permutation(int qbit_num_in, const std::vector& pattern_in) : Gate(qbit_num_in) { + if (pattern_in.size() != qbit_num_in) { + std::stringstream sstream; + sstream << "Permutation: Pattern size " << pattern_in.size() << " is not equal to the number of qubits " << qbit_num_in << std::endl; + print(sstream, 0); + throw sstream.str(); + } + name = "Permutation"; + type = PERMUTATION_OPERATION; + pattern = pattern_in; + control_qbits.clear(); + parameter_num = 0; + target_qbits.resize(qbit_num_in); + for (int idx=0; idx +static void permute_columns_from_cycles(MatrixType& input, const std::vector>& cycles){ + auto* data = input.get_data(); + for (const auto& cycle : cycles) { + for (int idx = (int)cycle.size() - 2; idx >= 0; --idx) { + int c0 = cycle[idx]; + int c1 = cycle[idx + 1]; + for (int row = 0; row < input.rows; ++row) { + auto tmp = data[row * input.stride + c0]; + data[row * input.stride + c0] = data[row * input.stride + c1]; + data[row * input.stride + c1] = tmp; + } + } + } +} + +void Permutation::apply_from_right(Matrix& input){ + if (input.cols != matrix_size) { + std::string err("Permutation::apply_from_right: Wrong input size in Permutation gate apply"); + throw err; + } + if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) { + build_cycles_cache(); + } + permute_columns_from_cycles(input, cycles_cache); +} + +void Permutation::apply_to(Matrix_float& input, int parallel){ + if (input.rows != matrix_size) { + std::string err("Permutation::apply_to: Wrong input size in Permutation gate apply"); + throw err; + } + + if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) { + build_cycles_cache(); 
+ } + if (parallel == 2) { + apply_Permutation_kernel_to_input_tbb(input, pattern, matrix_size, cycles_cache); + } + else if (parallel == 1) { + apply_Permutation_kernel_to_input_omp(input, pattern, matrix_size, cycles_cache); + } + else { + apply_Permutation_kernel_to_input(input, pattern, matrix_size, cycles_cache); + } +} + +void Permutation::apply_from_right(Matrix_float& input){ + if (input.cols != matrix_size) { + std::string err("Permutation::apply_from_right: Wrong input size in Permutation gate apply"); + throw err; + } + if (!cycles_cache_valid || cycles_cache_matrix_size != matrix_size) { + build_cycles_cache(); + } + permute_columns_from_cycles(input, cycles_cache); +} + +void Permutation::apply_to_list(std::vector& inputs, int parallel){ + int work_batch = 1; + if ( parallel == 0 ) { + work_batch = inputs.size(); + } + else { + work_batch = 1; + } + + + tbb::parallel_for( tbb::blocked_range(0,inputs.size(),work_batch), [&](tbb::blocked_range r) { + for (int idx=r.begin(); idx Permutation::get_target_qbits(){ + return target_qbits; +} + +std::vector Permutation::get_control_qbits(){ + return control_qbits; +} + +std::vector Permutation::get_pattern(){ + return pattern; +} + +void Permutation::set_pattern(const std::vector& pattern_in){ + pattern = pattern_in; + invalidate_cache(); +} + +std::vector Permutation::get_involved_qubits(bool only_target){ + std::vector involved_qubits; + for (int i = 0; i < qbit_num; i++) { + involved_qubits.push_back(i); + } + return involved_qubits; +} + +Permutation* Permutation::clone(){ + Permutation* ret = new Permutation(qbit_num, pattern); + ret->set_parameter_start_idx(get_parameter_start_idx()); + ret->set_parents(parents); + ret->set_children(children); + return ret; +} + +void Permutation::reorder_qubits(std::vector qbit_list){ + Gate::reorder_qubits(qbit_list); + std::vector new_pattern(qbit_num); + for (int idx=0; idx next_index(matrix_size); + for (int row_idx = 0; row_idx < matrix_size; ++row_idx) { + int 
new_row_idx = 0; + for (int idx = 0; idx < qbit_num; idx++) { + int bit = (row_idx >> pattern[idx]) & 1; + new_row_idx |= (bit << idx); + } + next_index[row_idx] = new_row_idx; + } + + std::vector visited(matrix_size, 0); + for (int start = 0; start < matrix_size; ++start) { + if (visited[start]) continue; + std::vector cycle; + int current = start; + while (!visited[current]) { + visited[current] = 1; + cycle.push_back(current); + current = next_index[current]; + } + if (cycle.size() > 1) { + cycles_cache.push_back(std::move(cycle)); + } + } + + cycles_cache_valid = true; +} \ No newline at end of file diff --git a/squander/src-cpp/gates/include/Gate.h b/squander/src-cpp/gates/include/Gate.h index 6b62e9d62..e5f2626f6 100644 --- a/squander/src-cpp/gates/include/Gate.h +++ b/squander/src-cpp/gates/include/Gate.h @@ -76,7 +76,8 @@ typedef enum gate_type {GENERAL_OPERATION=1, RXX_OPERATION=44, RYY_OPERATION=45, RZZ_OPERATION=46, - SXDG_OPERATION=47} gate_type; + SXDG_OPERATION=47, + PERMUTATION_OPERATION=48} gate_type; diff --git a/squander/src-cpp/gates/include/Gates_block.h b/squander/src-cpp/gates/include/Gates_block.h index 3b616a839..75ee2da8e 100644 --- a/squander/src-cpp/gates/include/Gates_block.h +++ b/squander/src-cpp/gates/include/Gates_block.h @@ -308,8 +308,17 @@ void add_ry(int target_qbit); */ void add_ry_to_front(int target_qbit); +/** +@brief Append a Permutation gate to the list of gates +@param pattern The pattern of the permutation +*/ +void add_permutation(const std::vector& pattern); - +/** +@brief Add a Permutation gate to the front of the list of gates +@param pattern The pattern of the permutation +*/ +void add_permutation_to_front(const std::vector& pattern); /** @brief Append a CRY gate to the list of gates diff --git a/squander/src-cpp/gates/include/Permutation.h b/squander/src-cpp/gates/include/Permutation.h new file mode 100644 index 000000000..529d05c81 --- /dev/null +++ b/squander/src-cpp/gates/include/Permutation.h @@ -0,0 +1,65 @@ 
+/* +Created on Fri Jun 26 14:13:26 2020 +Copyright 2020 Peter Rakyta, Ph.D. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +@author: Peter Rakyta, Ph.D. +*/ +/*! \file Permutation.h + \brief Class for the representation of Permutation gate. +*/ + +#ifndef PERMUTATION_H +#define PERMUTATION_H + +#include "Gate.h" +#include "common.h" +#include "matrix.h" +#include "logging.h" +#include "tbb/tbb.h" + +class Permutation : public Gate { + +protected: + std::vector pattern; + // Cached cycles for current pattern and matrix size + std::vector> cycles_cache; + int cycles_cache_matrix_size = 0; + bool cycles_cache_valid = false; + + void invalidate_cache(); + void build_cycles_cache(); + +public: + Permutation(); + Permutation(int qbit_num_in, const std::vector& pattern_in); + ~Permutation(); + Matrix get_matrix(); + Matrix get_matrix(int parallel); + void apply_to(Matrix& input, int parallel); + void apply_to(Matrix& input); + void apply_from_right(Matrix& input); + void apply_to(Matrix_float& input, int parallel); + void apply_from_right(Matrix_float& input); + void apply_to_list(std::vector& inputs, int parallel); + std::vector get_pattern(); + void set_pattern(const std::vector& pattern_in); + std::vector get_target_qbits(); + std::vector get_control_qbits(); + std::vector get_involved_qubits(bool only_target = false); + Permutation* clone(); + void reorder_qubits(std::vector qbit_list); +}; + +#endif //PERMUTATION_H \ No newline at end of file diff --git 
a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp index 73bac921b..d3050f5ab 100644 --- a/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp +++ b/squander/src-cpp/gates/kernels/apply_dedicated_gate_kernel_to_input.cpp @@ -25,6 +25,8 @@ limitations under the License. //#include #include "tbb/tbb.h" #include +#include +#include #include #include @@ -529,6 +531,152 @@ void apply_SWAP_kernel_to_input_impl(MatrixT& input, const std::vector& tar } } +void apply_Permutation_kernel_to_input(Matrix& input, const std::vector& pattern, const int& matrix_size){ + + int qbit_num = pattern.size(); + + auto permuted_index = [&](int row_idx) -> int { + int new_row_idx = 0; + for (int idx = 0; idx < qbit_num; idx++) { + int bit = (row_idx >> pattern[idx]) & 1; + new_row_idx |= (bit << idx); + } + return new_row_idx; + }; + + std::vector visited(matrix_size, 0); + + for (int start = 0; start < matrix_size; ++start) { + if (visited[start]) continue; + + std::vector cycle; + int current = start; + while (!visited[current]) { + visited[current] = 1; + cycle.push_back(current); + current = permuted_index(current); + } + + if (cycle.size() <= 1) continue; + + for (size_t idx = 0; idx < cycle.size() - 1; idx++) { + std::swap_ranges( + input.get_data() + cycle[idx] * input.stride, + input.get_data() + cycle[idx] * input.stride + input.cols, + input.get_data() + cycle[idx + 1] * input.stride + ); + } + } +} + +// Overload that applies permutation using precomputed cycles +void apply_Permutation_kernel_to_input(Matrix& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles){ + (void)pattern; // currently unused, kept for interface symmetry / potential validation + (void)matrix_size; // rows already validated by caller + + for (const auto& cycle : cycles) { + for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) { + std::swap_ranges( + 
input.get_data() + cycle[idx] * input.stride, + input.get_data() + cycle[idx] * input.stride + input.cols, + input.get_data() + cycle[idx + 1] * input.stride + ); + } + } +} + +void apply_Permutation_kernel_to_input_tbb(Matrix& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles){ + (void)pattern; + (void)matrix_size; + + tbb::parallel_for(tbb::blocked_range(0, cycles.size(), 64), + [&](const tbb::blocked_range& range) { + for (size_t cdx = range.begin(); cdx != range.end(); ++cdx) { + const auto& cycle = cycles[cdx]; + for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) { + std::swap_ranges( + input.get_data() + cycle[idx] * input.stride, + input.get_data() + cycle[idx] * input.stride + input.cols, + input.get_data() + cycle[idx + 1] * input.stride + ); + } + } + } + ); +} + +void apply_Permutation_kernel_to_input_omp(Matrix& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles){ + (void)pattern; + (void)matrix_size; + + #pragma omp parallel for schedule(static) + for (int cdx = 0; cdx < (int)cycles.size(); ++cdx) { + const auto& cycle = cycles[cdx]; + for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) { + std::swap_ranges( + input.get_data() + cycle[idx] * input.stride, + input.get_data() + cycle[idx] * input.stride + input.cols, + input.get_data() + cycle[idx + 1] * input.stride + ); + } + } +} + +// float32 (complex64) overloads of the precomputed-cycle Permutation kernels. +// The permutation only swaps whole rows, so the logic is identical to the +// Matrix versions; only the underlying element type differs. 
+void apply_Permutation_kernel_to_input(Matrix_float& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles){ + (void)pattern; + (void)matrix_size; + + for (const auto& cycle : cycles) { + for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) { + std::swap_ranges( + input.get_data() + cycle[idx] * input.stride, + input.get_data() + cycle[idx] * input.stride + input.cols, + input.get_data() + cycle[idx + 1] * input.stride + ); + } + } +} + +void apply_Permutation_kernel_to_input_tbb(Matrix_float& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles){ + (void)pattern; + (void)matrix_size; + + tbb::parallel_for(tbb::blocked_range(0, cycles.size(), 64), + [&](const tbb::blocked_range& range) { + for (size_t cdx = range.begin(); cdx != range.end(); ++cdx) { + const auto& cycle = cycles[cdx]; + for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) { + std::swap_ranges( + input.get_data() + cycle[idx] * input.stride, + input.get_data() + cycle[idx] * input.stride + input.cols, + input.get_data() + cycle[idx + 1] * input.stride + ); + } + } + } + ); +} + +void apply_Permutation_kernel_to_input_omp(Matrix_float& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles){ + (void)pattern; + (void)matrix_size; + + #pragma omp parallel for schedule(static) + for (int cdx = 0; cdx < (int)cycles.size(); ++cdx) { + const auto& cycle = cycles[cdx]; + for (size_t idx = 0; idx + 1 < cycle.size(); ++idx) { + std::swap_ranges( + input.get_data() + cycle[idx] * input.stride, + input.get_data() + cycle[idx] * input.stride + input.cols, + input.get_data() + cycle[idx + 1] * input.stride + ); + } + } +} + template void apply_SWAP_kernel_from_right_impl(MatrixT& input, const std::vector& target_qbits, const std::vector& control_qbits, const int& matrix_size) { diff --git a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h 
b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h index de6c501fa..03c7d4c30 100644 --- a/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h +++ b/squander/src-cpp/gates/kernels/include/apply_dedicated_gate_kernel_to_input.h @@ -83,6 +83,34 @@ void apply_SWAP_kernel_from_right(Matrix& input, const std::vector& target_ void apply_SYC_kernel_to_input(Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size); void apply_SYC_kernel_from_right(Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size); +/** + * @brief Applies the Permutation gate kernel to the input matrix. + * + * @param input The input matrix on which the transformation is applied. + * @param pattern The pattern of the permutation. + * @param matrix_size The size of the input. + */ +void apply_Permutation_kernel_to_input(Matrix& input, const std::vector& pattern, const int& matrix_size); + +/** + * @brief Applies the Permutation gate kernel using precomputed cycles. + * + * @param input The input matrix on which the transformation is applied. + * @param pattern The pattern of the permutation (used only for validation or future extensions). + * @param matrix_size The size of the input. + * @param cycles The disjoint cycles of row indices representing the permutation. 
+ */ +void apply_Permutation_kernel_to_input(Matrix& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles); + +// Parallelized versions for permutation with precomputed cycles +void apply_Permutation_kernel_to_input_tbb(Matrix& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles); +void apply_Permutation_kernel_to_input_omp(Matrix& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles); + +// float32 (complex64) overloads of the precomputed-cycle Permutation kernels +void apply_Permutation_kernel_to_input(Matrix_float& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles); +void apply_Permutation_kernel_to_input_tbb(Matrix_float& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles); +void apply_Permutation_kernel_to_input_omp(Matrix_float& input, const std::vector& pattern, const int& matrix_size, const std::vector>& cycles); + // TBB Parallelized versions void apply_X_kernel_to_input_tbb(Matrix& input, const std::vector& target_qbits, const std::vector& control_qbits, const int& matrix_size); void apply_X_kernel_from_right_tbb(Matrix& input, const std::vector& target_qbits, const std::vector& control_qbits, const int& matrix_size); diff --git a/squander/src-cpp/sabre_router/CMakeLists.txt b/squander/src-cpp/sabre_router/CMakeLists.txt new file mode 100644 index 000000000..f4cfa9604 --- /dev/null +++ b/squander/src-cpp/sabre_router/CMakeLists.txt @@ -0,0 +1,118 @@ +# =================================================================== +# SQUANDER SABRE Router Module - C++ Routing Engine + pybind11 Bindings +# =================================================================== + +message(STATUS "") +message(STATUS "=== Configuring SABRE Router Module ===") + +# =================================================================== +# Find pybind11 +# 
=================================================================== + +find_package(pybind11 CONFIG QUIET) + +if(NOT pybind11_FOUND) + message(STATUS "pybind11 not found via find_package, trying Python import...") + execute_process( + COMMAND ${PYTHON_EXECUTABLE} -c "import pybind11; print(pybind11.get_cmake_dir())" + OUTPUT_VARIABLE pybind11_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_QUIET + ) + if(pybind11_DIR) + message(STATUS "Found pybind11 via Python at: ${pybind11_DIR}") + find_package(pybind11 CONFIG PATHS ${pybind11_DIR}) + endif() +endif() + +if(NOT pybind11_FOUND) + message(WARNING "") + message(WARNING "pybind11 not found - SABRE router module will be skipped") + message(WARNING "Install with: pip install pybind11") + message(WARNING "") + return() +endif() + +message(STATUS "pybind11 version: ${pybind11_VERSION}") + +# =================================================================== +# Source Files +# =================================================================== + +set(SABRE_SOURCES + sabre_router.cpp +) + +set(SABRE_HEADERS + include/sabre_router.hpp +) + +# =================================================================== +# Static C++ library +# =================================================================== + +add_library(sabre_router_core STATIC + ${SABRE_SOURCES} + ${SABRE_HEADERS} +) + +target_include_directories(sabre_router_core + PUBLIC + $ +) + +# C++17 for this module only (does not affect global C++11) +target_compile_features(sabre_router_core PUBLIC cxx_std_17) + +target_compile_options(sabre_router_core PRIVATE + $<$,$>: + -Wall -Wextra -fPIC + $<$:-O3 -march=native> + $<$:-g -O0> + > + $<$: + $<$:/O2> + $<$:/Od /Zi> + > +) + +set_target_properties(sabre_router_core PROPERTIES + POSITION_INDEPENDENT_CODE ON +) + +# =================================================================== +# pybind11 module +# =================================================================== + +pybind11_add_module(_sabre_router MODULE + 
../../synthesis/bindings.cpp +) + +target_link_libraries(_sabre_router PRIVATE + sabre_router_core +) + +set_target_properties(_sabre_router PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${PROJECT_SOURCE_DIR}/squander/synthesis + OUTPUT_NAME "_sabre_router" +) + +# Set BUILD_RPATH to prioritize conda libraries +if(DEFINED ENV{CONDA_PREFIX}) + set_target_properties(_sabre_router PROPERTIES + BUILD_RPATH "${CONDA_PREFIX}/lib" + BUILD_RPATH_USE_ORIGIN TRUE + ) +endif() + +# =================================================================== +# Installation +# =================================================================== + +install(TARGETS _sabre_router + LIBRARY DESTINATION squander/synthesis + RUNTIME DESTINATION squander/synthesis + COMPONENT python) + +message(STATUS "=== SABRE Router Module Configured ===") +message(STATUS "") diff --git a/squander/src-cpp/sabre_router/include/sabre_router.hpp b/squander/src-cpp/sabre_router/include/sabre_router.hpp new file mode 100644 index 000000000..0e037a9f9 --- /dev/null +++ b/squander/src-cpp/sabre_router/include/sabre_router.hpp @@ -0,0 +1,450 @@ +#pragma once +/* +Copyright 2025 SQUANDER Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +C++ backend for the SABRE-style partition-aware routing engine. 
// ---------------------------------------------------------------------------
// Data structures (flattened from Python objects)
//
// Every scalar member has a default initializer so that a default-constructed
// object never carries indeterminate values. The structs stay aggregates
// (C++14 and later), so brace initialization such as NeighborEdge{u, v, w}
// keeps working unchanged.
// ---------------------------------------------------------------------------

// Two integer endpoints; -1 marks "not set".
struct Edge {
    int u = -1;
    int v = -1;
};

// One routing candidate of a partition, plus helper tables derived from it.
struct CandidateData {
    int partition_idx = -1;
    int topology_idx = -1;
    int permutation_idx = -1;
    int candidate_idx = -1;
    int cnot_count = 0;
    bool has_multi_qubit_body = true;

    // Permutations within the reduced (q*) space
    // P_i[v] = position in Q* space for input routing
    // P_o[v] = position in Q* space for output placement
    std::vector<int> P_i;
    std::vector<int> P_o;

    // node_mapping_flat[Q*_idx] = Q (physical qubit)
    // Dense array indexed by Q* index
    std::vector<int> node_mapping_flat;

    // qbit_map: original circuit qubit q -> reduced qubit q*
    // (stored as two parallel arrays: keys[i] maps to vals[i])
    std::vector<int> qbit_map_keys;
    std::vector<int> qbit_map_vals;

    // Original circuit qubits involved in this partition
    std::vector<int> involved_qbits;

    // Precomputed routing helpers.
    std::vector<int> P_i_inv;
    std::vector<int> P_o_inv;
    std::vector<int> qbit_map_keys_sorted;
    std::vector<int> qbit_map_vals_sorted;
    std::vector<int> qstar_to_q;
};

// Edge list of a partition (parallel arrays edges_u[i] -- edges_v[i]) and a
// CNOT count.
struct CanonicalEntry {
    std::vector<int> edges_u;  // virtual qubit indices
    std::vector<int> edges_v;
    int cnot = 0;
};

// Per-partition layout information.
struct LayoutPartInfo {
    bool is_single = false;
    std::vector<int> involved_qbits;
};

// Tunable parameters of the router.
struct SabreConfig {
    int prefilter_top_k = 50;
    int prefilter_min_per_partition = 2;
    int prefilter_min_3q = 12;
    int max_E_size = 20;
    int max_lookahead = 4;
    double E_weight = 0.5;
    double E_alpha = 1.0;  // LightSABRE uses no per-depth decay; set <1 for SQUANDER-style decay
    double cnot_cost = 1.0 / 3.0;  // weight on candidate.cnot_count; swap cost is fixed at 1.0 (1 SWAP = 3 CNOTs)
    int sabre_iterations = 1;
    int n_layout_trials = 1;
    int random_seed = 42;
    double decay_delta = 0.001;  // Qiskit LightSABRE DECAY_RATE
    int swap_burst_budget = 5;   // Qiskit LightSABRE DECAY_RESET_INTERVAL
    double path_tiebreak_weight = 0.2;
    double three_qubit_exit_weight = 1.0;
    int boundary_beam_width = 1;
    int boundary_beam_depth = 1;
};

// One step of a recorded route.
struct RouteStep {
    int type = 0;  // 0=swap, 1=partition, 2=single
    int partition_idx = -1;
    int candidate_idx = -1;
    int physical_qubit = -1;
    std::vector<std::pair<int, int>> swaps;
};

// Result of a forward routing pass: layout before and after, CNOT count and
// the recorded steps.
struct ForwardRouteResult {
    std::vector<int> pi_initial;
    std::vector<int> pi;
    int cnot_count = 0;
    std::vector<RouteStep> steps;
};

// Layout and total cost produced by one trial.
struct TrialResult {
    std::vector<int> pi;
    double total_cost = 0.0;
};

// Weighted edge between two entries of NeighborInfo::neighbor_vqs, referenced
// by their indices in that array.
struct NeighborEdge {
    int u_idx = -1;
    int v_idx = -1;
    double weight = 0.0;
};

// Future-context description used as a tie-breaker during swap search.
struct NeighborInfo {
    std::vector<int> neighbor_vqs;
    std::vector<int> initial_pos;
    std::vector<NeighborEdge> edges;
    double weight = 0.0;

    // The tie-breaker only applies with a positive weight and at least one edge.
    bool uses_tiebreak() const {
        return weight > 0.0 && !edges.empty();
    }
};
// ---------------------------------------------------------------------------
// Swap cache key for deduplication within a single heuristic_search call
// ---------------------------------------------------------------------------

// Identifies one constrained-swap query. Member order matters: the key is
// brace-initialized positionally by its user.
struct SwapCacheKey {
    int64_t pi_snapshot;  // packed starting positions of the partition qubits
    int64_t targets;      // packed target positions of the partition qubits
    int k;                // number of partition qubits
    // 0 when the neighbor tiebreak is inactive; otherwise a stable hash of
    // (edges, initial_pos, weight) from NeighborInfo so that two calls with
    // the same active future context share cache entries.
    uint64_t neighbor_hash;

    // Field-wise equality.
    bool operator==(const SwapCacheKey& o) const {
        if (pi_snapshot != o.pi_snapshot) return false;
        if (targets != o.targets) return false;
        if (k != o.k) return false;
        return neighbor_hash == o.neighbor_hash;
    }
};

// Hash functor for SwapCacheKey: seeds with pi_snapshot and folds the other
// three fields in with the same combine step.
struct SwapCacheKeyHash {
    size_t operator()(const SwapCacheKey& k) const {
        size_t h = static_cast<size_t>(k.pi_snapshot);
        const auto fold = [&h](size_t part) {
            h ^= part + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
        };
        fold(static_cast<size_t>(k.targets));
        fold(static_cast<size_t>(k.k));
        fold(static_cast<size_t>(k.neighbor_hash));
        return h;
    }
};

using SwapList = std::vector<std::pair<int, int>>;
using SwapCache = std::unordered_map<SwapCacheKey, SwapList, SwapCacheKeyHash>;

// ---------------------------------------------------------------------------
// A* state packing helpers
// ---------------------------------------------------------------------------

// For k <= 4 partition qubits on N <= 64 physical qubits, pack state into int64_t
// State = sum(positions[i] * N^i), fits in 64 bits when N <= 64 and k <= 4
inline int64_t pack_state(const std::vector<int>& positions, int N) {
    int64_t packed = 0;
    int64_t place_value = 1;  // N^i for the digit currently being written
    for (const int position : positions) {
        packed += static_cast<int64_t>(position) * place_value;
        place_value *= N;
    }
    return packed;
}

// Inverse of pack_state: peel off k base-N digits, least significant first.
inline std::vector<int> unpack_state(int64_t packed, int k, int N) {
    std::vector<int> positions(k);
    for (int& slot : positions) {
        slot = static_cast<int>(packed % N);
        packed /= N;
    }
    return positions;
}
std::vector> adj, + std::vector> DAG, + std::vector> IDAG, + std::vector> candidate_cache, + std::vector layout_partitions, + std::unordered_map canonical_data_fwd, + std::unordered_map canonical_data_rev + ); + + // Thread-safe: all mutable state is stack-local + ForwardRouteResult route_forward( + const std::vector& pi + ) const; + + TrialResult run_trial( + int trial_idx, + const std::vector& seeded_pi, + int n_iterations, + int n_trials + ) const; + +private: + // Distance lookup (flat row-major) + inline double dist(int phys_u, int phys_v) const { + return D_[phys_u * N_ + phys_v]; + } + + // Main heuristic search loop. + // children_graph/parents_graph are swapped for backward passes. + std::pair, double> heuristic_search( + const std::vector& F_init, + std::vector pi, + bool reverse, + std::mt19937* rng, + const std::unordered_map& canonical_data, + const std::vector>& children_graph, + const std::vector>& parents_graph, + ForwardRouteResult* route_trace = nullptr + ) const; + + // A* constrained swap search over the k-dimensional partition state space. + std::pair>, std::vector> + find_constrained_swaps( + const std::vector& pi, + const std::vector& qbit_map_keys, + const std::vector& qbit_map_vals, + const std::vector& node_mapping_flat, + const std::vector& P_route_inv, + SwapCache* swap_cache, + const NeighborInfo* neighbor_info = nullptr + ) const; + + // Lower-bound swap estimate for routing the candidate's partition qubits. + int estimate_swap_count( + const CandidateData& cand, + const std::vector& pi, + bool reverse + ) const; + + // BFS lookahead: multi-qubit partitions near the front layer. + std::vector> generate_extended_set( + const std::vector& F, + const std::vector& resolved, + const std::vector>& children_graph, + const std::vector>& parents_graph + ) const; + + // LightSABRE relative scoring (arXiv:2409.08368, eq. 1). 
+ double score_candidate( + const CandidateData& cand, + const std::vector& F_snapshot, + const std::vector& pi, + const std::vector>& E, + bool reverse, + const std::unordered_map& canonical_data, + SwapCache* swap_cache, + const std::vector* decay = nullptr, + std::vector>* out_swaps = nullptr, + std::vector* out_pi_new = nullptr, + const NeighborInfo* cached_neighbor_info = nullptr + ) const; + + // Route a candidate's partition qubits to their input positions and + // update pi for the exit positions. + std::pair>, std::vector> + transform_pi( + const CandidateData& cand, + const std::vector& pi, + bool reverse, + SwapCache* swap_cache, + const NeighborInfo* neighbor_info = nullptr + ) const; + + NeighborInfo build_neighbor_info( + int exclude_partition_idx, + const std::vector& F_snapshot, + const std::vector>& E, + const std::vector& pi, + const std::unordered_map& canonical_data + ) const; + + double decay_factor_for_swaps( + const std::vector>& swaps, + const std::vector& decay + ) const; + + double routing_objective( + double route_cost, + int cnot_count, + double cnot_weight = 1.0, + double decay_factor = 1.0 + ) const; + + double future_partition_cost( + int partition_idx, + const std::vector& pi, + bool reverse, + const std::unordered_map& canonical_data + ) const; + + void apply_decay_for_swaps( + const std::vector>& swaps, + std::vector& decay + ) const; + + void reset_decay(std::vector& decay) const; + + std::vector bfs_shortest_path(int src, int dst) const; + + std::pair>, std::vector> release_valve( + const std::vector& F, + const std::vector& pi, + const std::unordered_map& canonical_data + ) const; + + // Apply a list of SWAPs to pi + std::vector apply_swaps_to_pi( + const std::vector& pi, + const std::vector>& swaps + ) const; + + // Get initial layer (partitions with no unresolved parents) + std::vector get_initial_layer() const; + + // Get final layer (partitions with no children) + std::vector get_final_layer() const; + + // Prefilter 
candidates by cheap swap estimate + std::vector prefilter_candidates( + const std::vector& candidates, + const std::vector& pi, + int top_k, + const std::vector& F_snapshot, + const std::vector>& E, + bool reverse, + const std::unordered_map& canonical_data + ) const; + + // Select best candidate with optional stochastic tie-breaking + const CandidateData& select_best_candidate( + const std::vector& candidates, + const std::vector& scores, + std::mt19937* rng + ) const; + + std::pair, std::vector> advance_layout_frontier( + int selected_partition_idx, + const std::vector& F, + const std::vector& resolved, + const std::vector>& children_graph, + const std::vector>& parents_graph + ) const; + + size_t boundary_beam_select_index( + const std::vector& candidates, + const std::vector& scores, + const std::vector>>& cached_swaps, + const std::vector>& cached_pi, + const std::vector& F_snapshot, + const std::vector& resolved, + const std::vector>& children_graph, + const std::vector>& parents_graph, + bool reverse, + const std::unordered_map& canonical_data, + SwapCache* swap_cache + ) const; + + // Check if partition is single-qubit + inline bool partition_is_single(int partition_idx) const { + return layout_partitions_[partition_idx].is_single; + } + + // Gather all candidates for partitions in F + std::vector obtain_partition_candidates( + const std::vector& F + ) const; + + // Random permutation of [0..N-1] + std::vector random_permutation(int n, std::mt19937& rng) const; + + // Initial-layout sampling: trial 0 uses the seed, later trials are random. 
+ std::vector sample_initial_layout( + int trial_idx, + int n_trials, + const std::vector& seeded_pi, + std::mt19937& rng + ) const; + + double entry_future_cost( + const CanonicalEntry& entry, + const std::vector& pi + ) const; + + double future_context_cost( + int exclude_partition_idx, + const std::vector& pi, + const std::vector& F_snapshot, + const std::vector>& E, + bool reverse, + const std::unordered_map& canonical_data + ) const; + + std::vector estimate_candidate_output_layout( + const CandidateData& cand, + const std::vector& pi, + bool reverse + ) const; + + // Immutable data members + SabreConfig config_; + int N_; // number of physical qubits + int num_partitions_; + std::vector D_; // flat N*N distance matrix (owned copy) + std::vector> adj_; + // CSR view of adj_ for tight inner loops + std::vector adj_offsets_; + std::vector adj_flat_; + std::vector> DAG_; + std::vector> IDAG_; + std::vector> candidate_cache_; + std::vector layout_partitions_; + std::unordered_map canonical_data_fwd_; + std::unordered_map canonical_data_rev_; + std::vector alpha_weights_; + double max_finite_distance_ = 1.0; +}; + +} // namespace squander::routing diff --git a/squander/src-cpp/sabre_router/sabre_router.cpp b/squander/src-cpp/sabre_router/sabre_router.cpp new file mode 100644 index 000000000..8f1770ae5 --- /dev/null +++ b/squander/src-cpp/sabre_router/sabre_router.cpp @@ -0,0 +1,1932 @@ +/* +Copyright 2025 SQUANDER Contributors + +C++ backend for the SABRE-style partition-aware routing engine. 
+*/ + +#include "sabre_router.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace squander::routing { + +namespace { + +std::vector invert_permutation(const std::vector& P) { + std::vector inv(P.size()); + for (size_t i = 0; i < P.size(); i++) { + inv[P[i]] = static_cast(i); + } + return inv; +} + +void prepare_candidate(CandidateData& cand) { + cand.P_i_inv = invert_permutation(cand.P_i); + cand.P_o_inv = invert_permutation(cand.P_o); + + const int k = static_cast(cand.qbit_map_keys.size()); + std::vector order(k); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](int a, int b) { + return cand.qbit_map_keys[a] < cand.qbit_map_keys[b]; + }); + + cand.qbit_map_keys_sorted.resize(k); + cand.qbit_map_vals_sorted.resize(k); + int max_qstar = -1; + for (int i = 0; i < k; i++) { + const int src_idx = order[i]; + const int qstar = cand.qbit_map_vals[src_idx]; + cand.qbit_map_keys_sorted[i] = cand.qbit_map_keys[src_idx]; + cand.qbit_map_vals_sorted[i] = qstar; + if (qstar > max_qstar) max_qstar = qstar; + } + + const int dense_size = std::max( + {max_qstar + 1, + static_cast(cand.P_i.size()), + static_cast(cand.P_o.size()), + static_cast(cand.node_mapping_flat.size())} + ); + cand.qstar_to_q.assign(dense_size, -1); + for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) { + const int qstar = cand.qbit_map_vals[i]; + if (qstar >= 0) { + if (qstar >= static_cast(cand.qstar_to_q.size())) { + cand.qstar_to_q.resize(qstar + 1, -1); + } + cand.qstar_to_q[qstar] = cand.qbit_map_keys[i]; + } + } +} + +inline void unpack_state_into(int64_t packed, int k, int N, std::vector& positions) { + positions.resize(k); + for (int i = 0; i < k; i++) { + positions[i] = static_cast(packed % N); + packed /= N; + } +} + +} // namespace + +// --------------------------------------------------------------------------- +// Constructor 
+// --------------------------------------------------------------------------- + +SabreRouter::SabreRouter( + const SabreConfig& config, + int N, + std::vector D, + std::vector> adj, + std::vector> DAG, + std::vector> IDAG, + std::vector> candidate_cache, + std::vector layout_partitions, + std::unordered_map canonical_data_fwd, + std::unordered_map canonical_data_rev +) + : config_(config) + , N_(N) + , num_partitions_(static_cast(DAG.size())) + , D_(std::move(D)) + , adj_(std::move(adj)) + , DAG_(std::move(DAG)) + , IDAG_(std::move(IDAG)) + , candidate_cache_(std::move(candidate_cache)) + , layout_partitions_(std::move(layout_partitions)) + , canonical_data_fwd_(std::move(canonical_data_fwd)) + , canonical_data_rev_(std::move(canonical_data_rev)) +{ + if (static_cast(D_.size()) != N_ * N_) { + throw std::invalid_argument("Distance matrix D must be N x N"); + } + // Build CSR view of adj_ + adj_offsets_.resize(N_ + 1); + adj_offsets_[0] = 0; + for (int i = 0; i < N_; i++) { + adj_offsets_[i + 1] = adj_offsets_[i] + static_cast(adj_[i].size()); + } + adj_flat_.resize(adj_offsets_[N_]); + for (int i = 0; i < N_; i++) { + for (size_t j = 0; j < adj_[i].size(); j++) { + adj_flat_[adj_offsets_[i] + j] = adj_[i][j]; + } + } + for (auto& partition_candidates : candidate_cache_) { + for (auto& cand : partition_candidates) { + prepare_candidate(cand); + } + } + + const int max_depth = std::max(0, config_.max_lookahead); + alpha_weights_.resize(max_depth + 1); + if (!alpha_weights_.empty()) { + alpha_weights_[0] = 1.0; + for (int depth = 1; depth <= max_depth; depth++) { + alpha_weights_[depth] = alpha_weights_[depth - 1] * config_.E_alpha; + } + } + + max_finite_distance_ = 1.0; + for (double d : D_) { + if (std::isfinite(d) && d > max_finite_distance_) { + max_finite_distance_ = d; + } + } +} + +// --------------------------------------------------------------------------- +// Helper: random permutation +// 
--------------------------------------------------------------------------- + +std::vector SabreRouter::random_permutation(int n, std::mt19937& rng) const { + std::vector perm(n); + std::iota(perm.begin(), perm.end(), 0); + std::shuffle(perm.begin(), perm.end(), rng); + return perm; +} + +std::vector SabreRouter::sample_initial_layout( + int trial_idx, + int n_trials, + const std::vector& seeded_pi, + std::mt19937& rng +) const { + if (n_trials <= 1 || trial_idx == 0) { + return seeded_pi; + } + + return random_permutation(N_, rng); +} + +// --------------------------------------------------------------------------- +// apply_swaps_to_pi +// --------------------------------------------------------------------------- + +std::vector SabreRouter::apply_swaps_to_pi( + const std::vector& pi, + const std::vector>& swaps +) const { + std::vector result(pi); + thread_local std::vector p2v; + if (static_cast(p2v.size()) < N_) p2v.assign(N_, 0); + for (int q = 0; q < N_; q++) p2v[result[q]] = q; + + for (auto [P1, P2] : swaps) { + int q1 = p2v[P1]; + int q2 = p2v[P2]; + p2v[P1] = q2; + p2v[P2] = q1; + result[q1] = P2; + result[q2] = P1; + } + return result; +} + +NeighborInfo SabreRouter::build_neighbor_info( + int exclude_partition_idx, + const std::vector& F_snapshot, + const std::vector>& E, + const std::vector& pi, + const std::unordered_map& canonical_data +) const { + NeighborInfo info; + info.weight = config_.path_tiebreak_weight; + if (info.weight <= 0.0) { + return info; + } + + // Per-call scratch via thread_local, reset by tracking touched entries + thread_local std::vector q_to_idx; + thread_local std::vector q_touched; + if (static_cast(q_to_idx.size()) < N_) q_to_idx.assign(N_, -1); + q_touched.clear(); + + auto ensure_qubit = [&](int q) -> int { + int idx = q_to_idx[q]; + if (idx >= 0) return idx; + idx = static_cast(info.neighbor_vqs.size()); + q_to_idx[q] = idx; + q_touched.push_back(q); + info.neighbor_vqs.push_back(q); + info.initial_pos.push_back(pi[q]); 
+ return idx; + }; + + // edges: parallel arrays keyed by (lo, hi) — small linear scan dedup + thread_local std::vector ekey_lo; + thread_local std::vector ekey_hi; + thread_local std::vector eu_idx; + thread_local std::vector ev_idx; + thread_local std::vector ew; + ekey_lo.clear(); ekey_hi.clear(); + eu_idx.clear(); ev_idx.clear(); ew.clear(); + + auto add_edge = [&](int u, int v, double weight) { + const int u_idx = ensure_qubit(u); + const int v_idx = ensure_qubit(v); + const int lo = std::min(u, v); + const int hi = std::max(u, v); + for (size_t i = 0; i < ekey_lo.size(); i++) { + if (ekey_lo[i] == lo && ekey_hi[i] == hi) { + ew[i] += weight; + return; + } + } + ekey_lo.push_back(lo); + ekey_hi.push_back(hi); + eu_idx.push_back(u_idx); + ev_idx.push_back(v_idx); + ew.push_back(weight); + }; + + auto add_partition_edges = [&](int partition_idx, double weight) { + if (partition_idx == exclude_partition_idx || weight <= 0.0) return; + if ( + partition_idx < 0 + || partition_idx >= static_cast(layout_partitions_.size()) + ) return; + auto canonical_it = canonical_data.find(partition_idx); + if (canonical_it != canonical_data.end() + && !canonical_it->second.edges_u.empty() + ) { + const auto& entry = canonical_it->second; + for (size_t i = 0; i < entry.edges_u.size(); i++) { + add_edge(entry.edges_u[i], entry.edges_v[i], weight); + } + return; + } + + const auto& involved = layout_partitions_[partition_idx].involved_qbits; + if (involved.size() < 2) return; + for (size_t i = 0; i < involved.size(); i++) { + for (size_t j = i + 1; j < involved.size(); j++) { + add_edge(involved[i], involved[j], weight); + } + } + }; + + for (int partition_idx : F_snapshot) { + add_partition_edges(partition_idx, 1.0); + } + for (auto [partition_idx, depth] : E) { + const double alpha = + (depth >= 0 && depth < static_cast(alpha_weights_.size())) + ? 
alpha_weights_[depth] + : std::pow(config_.E_alpha, depth); + add_partition_edges(partition_idx, config_.E_weight * alpha); + } + + info.edges.reserve(ew.size()); + for (size_t i = 0; i < ew.size(); i++) { + info.edges.push_back(NeighborEdge{eu_idx[i], ev_idx[i], ew[i]}); + } + + // Reset q_to_idx via touched-list (avoids O(N) clear) + for (int q : q_touched) q_to_idx[q] = -1; + + return info; +} + +double SabreRouter::decay_factor_for_swaps( + const std::vector>& swaps, + const std::vector& decay +) const { + double factor = 1.0; + for (auto [u, v] : swaps) { + factor = std::max(factor, std::max(decay[u], decay[v])); + } + return factor; +} + +double SabreRouter::routing_objective( + double route_cost, + int cnot_count, + double cnot_weight, + double decay_factor +) const { + return decay_factor * ( + route_cost + + cnot_weight * config_.cnot_cost * static_cast(cnot_count) + ); +} + +void SabreRouter::apply_decay_for_swaps( + const std::vector>& swaps, + std::vector& decay +) const { + if (config_.decay_delta <= 0.0) { + return; + } + for (auto [u, v] : swaps) { + decay[u] += config_.decay_delta; + decay[v] += config_.decay_delta; + } +} + +void SabreRouter::reset_decay(std::vector& decay) const { + std::fill(decay.begin(), decay.end(), 1.0); +} + +std::vector SabreRouter::bfs_shortest_path(int src, int dst) const { + if (src == dst) { + return {src}; + } + + std::vector parent(N_, -1); + std::vector visited(N_, 0); + std::deque queue; + queue.push_back(src); + visited[src] = 1; + + while (!queue.empty()) { + const int node = queue.front(); + queue.pop_front(); + for (int nb : adj_[node]) { + if (visited[nb]) { + continue; + } + visited[nb] = 1; + parent[nb] = node; + if (nb == dst) { + std::vector path; + int cur = dst; + while (cur != src) { + path.push_back(cur); + cur = parent[cur]; + } + path.push_back(src); + std::reverse(path.begin(), path.end()); + return path; + } + queue.push_back(nb); + } + } + + return {}; +} + +std::pair>, std::vector> 
SabreRouter::release_valve( + const std::vector& F, + const std::vector& pi, + const std::unordered_map& canonical_data +) const { + double best_worst_dist = -std::numeric_limits::infinity(); + int best_partition_idx = -1; + int best_u = -1; + int best_v = -1; + + for (int partition_idx : F) { + auto it = canonical_data.find(partition_idx); + if (it == canonical_data.end()) continue; + const auto& entry = it->second; + if (entry.edges_u.empty()) continue; + + double worst_dist = 0.0; + int worst_u = -1; + int worst_v = -1; + for (size_t i = 0; i < entry.edges_u.size(); i++) { + const int u = entry.edges_u[i]; + const int v = entry.edges_v[i]; + const double d = dist(pi[u], pi[v]); + if (d > worst_dist) { + worst_dist = d; + worst_u = u; + worst_v = v; + } + } + + if (worst_dist <= 1.0 || worst_u < 0) continue; + + if ( + worst_dist > best_worst_dist + || (worst_dist == best_worst_dist + && (best_partition_idx < 0 || partition_idx < best_partition_idx)) + ) { + best_worst_dist = worst_dist; + best_partition_idx = partition_idx; + best_u = worst_u; + best_v = worst_v; + } + } + + if (best_u < 0) { + return {{}, pi}; + } + + const auto path = bfs_shortest_path(pi[best_u], pi[best_v]); + if (path.size() < 2) { + return {{}, pi}; + } + + const int k = static_cast(path.size()) - 1; + const int m = k / 2; + std::vector> swaps; + for (int i = 0; i < m; i++) { + swaps.push_back({path[i], path[i + 1]}); + } + for (int i = k; i > m + 1; i--) { + swaps.push_back({path[i], path[i - 1]}); + } + + auto pi_new = apply_swaps_to_pi(pi, swaps); + return {swaps, pi_new}; +} + +// --------------------------------------------------------------------------- +// get_initial_layer / get_final_layer +// --------------------------------------------------------------------------- + +std::vector SabreRouter::get_initial_layer() const { + std::vector layer; + for (int p = 0; p < num_partitions_; p++) { + if (IDAG_[p].empty()) layer.push_back(p); + } + return layer; +} + +std::vector 
SabreRouter::get_final_layer() const { + std::vector layer; + for (int p = num_partitions_ - 1; p >= 0; p--) { + if (DAG_[p].empty()) layer.push_back(p); + } + return layer; +} + +// --------------------------------------------------------------------------- +// estimate_swap_count +// --------------------------------------------------------------------------- + +int SabreRouter::estimate_swap_count( + const CandidateData& cand, + const std::vector& pi, + bool reverse +) const { + if (!cand.has_multi_qubit_body) { + return 0; + } + + const std::vector& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv; + + double total = 0.0; + for (size_t i = 0; i < cand.qbit_map_keys.size(); i++) { + int k = cand.qbit_map_keys[i]; + int v = cand.qbit_map_vals[i]; + int target_P = cand.node_mapping_flat[P_route_inv[v]]; + int current_P = pi[k]; + double d = dist(current_P, target_P); + if (d < std::numeric_limits::infinity()) { + total += d; + } + } + return static_cast(total / 2.0); +} + +// --------------------------------------------------------------------------- +// find_constrained_swaps (A* over k-dimensional state space) +// --------------------------------------------------------------------------- + +std::pair>, std::vector> +SabreRouter::find_constrained_swaps( + const std::vector& pi, + const std::vector& qbit_map_keys, + const std::vector& qbit_map_vals, + const std::vector& node_mapping_flat, + const std::vector& P_route_inv, + SwapCache* swap_cache, + const NeighborInfo* neighbor_info +) const { + const int k = static_cast(qbit_map_keys.size()); + + // ---- Setup: target/initial positions, pow_N, h0 ---- + thread_local std::vector target_positions; + thread_local std::vector initial_positions; + thread_local std::vector pow_N; + target_positions.resize(k); + initial_positions.resize(k); + pow_N.resize(k); + { + int64_t s = 1; + for (int i = 0; i < k; i++) { pow_N[i] = s; s *= N_; } + } + + bool already_there = true; + double h0_sum = 0.0; + int64_t initial_packed 
= 0; + int64_t target_packed = 0; + for (int i = 0; i < k; i++) { + const int q = qbit_map_keys[i]; + const int v = qbit_map_vals[i]; + const int t = node_mapping_flat[P_route_inv[v]]; + const int ip = pi[q]; + target_positions[i] = t; + initial_positions[i] = ip; + if (ip != t) already_there = false; + h0_sum += dist(ip, t); + initial_packed += static_cast(ip) * pow_N[i]; + target_packed += static_cast(t) * pow_N[i]; + } + if (already_there) { + return {{}, pi}; + } + + const bool use_neighbor = + neighbor_info != nullptr && neighbor_info->uses_tiebreak(); + + auto mix64 = [](uint64_t h, uint64_t v) -> uint64_t { + h ^= v + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2); + return h; + }; + + uint64_t neighbor_hash = 0; + if (use_neighbor) { + neighbor_hash = 0xcbf29ce484222325ULL; + for (const auto& edge : neighbor_info->edges) { + const int lo = std::min(edge.u_idx, edge.v_idx); + const int hi = std::max(edge.u_idx, edge.v_idx); + uint64_t w_bits; + std::memcpy(&w_bits, &edge.weight, sizeof(w_bits)); + neighbor_hash = mix64(neighbor_hash, static_cast(lo)); + neighbor_hash = mix64(neighbor_hash, static_cast(hi)); + neighbor_hash = mix64(neighbor_hash, w_bits); + } + for (int p : neighbor_info->initial_pos) { + neighbor_hash = mix64(neighbor_hash, static_cast(p)); + } + uint64_t weight_bits; + const double weight_val = neighbor_info->weight; + std::memcpy(&weight_bits, &weight_val, sizeof(weight_bits)); + neighbor_hash = mix64(neighbor_hash, weight_bits); + } + + const SwapCacheKey cache_key{initial_packed, target_packed, k, neighbor_hash}; + + if (swap_cache) { + auto it = swap_cache->find(cache_key); + if (it != swap_cache->end()) { + auto result_pi = apply_swaps_to_pi(pi, it->second); + return {it->second, result_pi}; + } + } + + // ---- Neighbor heuristic setup ---- + double total_edge_weight = 0.0; + if (use_neighbor) { + for (const auto& edge : neighbor_info->edges) { + total_edge_weight += edge.weight; + } + } + const double neighbor_norm = std::max( + 1.0, 
total_edge_weight * std::max(1.0, max_finite_distance_) + ); + const double neighbor_scale = + use_neighbor ? (neighbor_info->weight / neighbor_norm) : 0.0; + + auto compute_nb_total = [&](const std::vector& pos_nb) { + double total = 0.0; + for (const auto& edge : neighbor_info->edges) { + total += edge.weight * dist(pos_nb[edge.u_idx], pos_nb[edge.v_idx]); + } + return total; + }; + + double initial_nb_total = 0.0; + if (use_neighbor) { + initial_nb_total = compute_nb_total(neighbor_info->initial_pos); + } + + // ---- Arena + best-state table (replaces visited+parent maps) ---- + struct Node { + int64_t packed; + int parent_idx; + int g; + int sw_lo, sw_hi; + double h_sum; // sum(dist(pos[i], target[i])) — twice the admissible h + double nb_total; // sum(edge.weight * dist(...)) — pre-scale + int nb_arena_idx; // -1 if !use_neighbor; else slot in nb_pos_flat + uint64_t nb_hash; // incremental XOR hash of neighbor VQ positions + }; + thread_local std::vector arena; + // Flat storage for neighbor positions: slot s lives at + // [s * nb_stride, (s+1) * nb_stride). Slots are shared across nodes whose + // swap doesn't touch any neighbor virtual qubit. + thread_local std::vector nb_pos_flat; + thread_local std::vector> vq_edges; + thread_local std::vector nb_scratch; + arena.clear(); + nb_pos_flat.clear(); + arena.reserve(1024); + // key = mix(packed) ^ nb_hash; no heap allocation per lookup + thread_local std::unordered_map best_node; + best_node.clear(); + best_node.reserve(2048); + + const int nb_stride = use_neighbor + ? static_cast(neighbor_info->neighbor_vqs.size()) + : 0; + if (use_neighbor) { + // Per-vq edge index list: which edges touch each virtual qubit. 
+ vq_edges.assign(nb_stride, {}); + for (int e = 0; e < static_cast(neighbor_info->edges.size()); e++) { + const auto& edge = neighbor_info->edges[e]; + vq_edges[edge.u_idx].push_back(e); + if (edge.v_idx != edge.u_idx) { + vq_edges[edge.v_idx].push_back(e); + } + } + nb_pos_flat.reserve(static_cast(nb_stride) * 1024); + nb_pos_flat.insert(nb_pos_flat.end(), + neighbor_info->initial_pos.begin(), + neighbor_info->initial_pos.end()); + nb_scratch.resize(nb_stride); + } + + // Per-(vq_idx, phys) contribution to nb_hash; XOR-based so removals are + // identical to additions (self-inverse), enabling incremental updates. + auto slot_hash = [](int vq_idx, int phys) -> uint64_t { + uint64_t h = static_cast(vq_idx) * 0x9e3779b97f4a7c15ULL + ^ static_cast(phys) * 0x6c62272e07bb0142ULL; + h ^= h >> 33; h *= 0xff51afd7ed558ccdULL; h ^= h >> 33; + return h; + }; + auto make_key = [](int64_t packed, uint64_t nb_hash) -> uint64_t { + uint64_t h = static_cast(packed); + h ^= h >> 33; h *= 0xff51afd7ed558ccdULL; + h ^= h >> 33; h *= 0xc4ceb9fe1a85ec53ULL; + h ^= h >> 33; + return h ^ nb_hash; + }; + + uint64_t initial_nb_hash = 0; + if (use_neighbor) { + for (int z = 0; z < nb_stride; z++) { + initial_nb_hash ^= slot_hash(z, neighbor_info->initial_pos[z]); + } + } + + // ---- Push initial node ---- + // Slot 0 of nb_pos_flat already holds neighbor_info->initial_pos. + { + Node n; + n.packed = initial_packed; + n.parent_idx = -1; + n.g = 0; + n.sw_lo = -1; n.sw_hi = -1; + n.h_sum = h0_sum; + n.nb_total = initial_nb_total; + n.nb_arena_idx = use_neighbor ? 
0 : -1; + n.nb_hash = initial_nb_hash; + arena.push_back(n); + best_node.emplace(make_key(initial_packed, initial_nb_hash), 0); + } + + // PQ entry: (f, g, counter, arena_idx) + using PQEntry = std::tuple; + std::priority_queue, std::greater> pq; + uint64_t counter = 0; + pq.push({0.5 * h0_sum + neighbor_scale * initial_nb_total, 0, counter++, 0}); + + thread_local std::vector positions; + positions.resize(k); + + while (!pq.empty()) { + auto [f, g_e, ctr, idx] = pq.top(); + pq.pop(); + (void)f; (void)ctr; + const int g = g_e; + const int64_t packed = arena[idx].packed; + const uint64_t cur_nb_hash = arena[idx].nb_hash; + + // A state can be reinserted with a lower g-cost after this queue entry + // was pushed. When the neighbor tie-breaker is active, future-qubit + // positions are part of the state so equal-length paths with different + // bystander layouts are not collapsed. + const uint64_t cur_key = make_key(packed, cur_nb_hash); + auto cur_best = best_node.find(cur_key); + if (cur_best == best_node.end() || cur_best->second != idx) { + continue; + } + + if (packed == target_packed) { + // Reconstruct path + std::vector> path; + int cur = idx; + while (arena[cur].parent_idx != -1) { + path.push_back({arena[cur].sw_lo, arena[cur].sw_hi}); + cur = arena[cur].parent_idx; + } + std::reverse(path.begin(), path.end()); + + auto result_pi = apply_swaps_to_pi(pi, path); + if (swap_cache) { + (*swap_cache)[cache_key] = path; + } + return {path, result_pi}; + } + + // Stale entry? 
+ if (arena[idx].g < g) continue; + + // Unpack positions for this state + { + int64_t p = packed; + for (int i = 0; i < k; i++) { + positions[i] = static_cast(p % N_); + p /= N_; + } + } + const double cur_h_sum = arena[idx].h_sum; + const double cur_nb_total = arena[idx].nb_total; + const int cur_nb_arena_idx = arena[idx].nb_arena_idx; + // cur_nb_hash already read above + + // Expand: every SWAP that moves at least one partition qubit + for (int i = 0; i < k; i++) { + const int p = positions[i]; + const int t_i = target_positions[i]; + const int adj_lo = adj_offsets_[p]; + const int adj_hi = adj_offsets_[p + 1]; + for (int nb_idx = adj_lo; nb_idx < adj_hi; nb_idx++) { + const int nb = adj_flat_[nb_idx]; + // Find j such that positions[j] == nb (if any) + int j_swap = -1; + for (int j = 0; j < k; j++) { + if (positions[j] == nb) { j_swap = j; break; } + } + + // Incremental packed + int64_t new_packed = packed + static_cast(nb - p) * pow_N[i]; + if (j_swap >= 0) { + new_packed += static_cast(p - nb) * pow_N[j_swap]; + } + + // Incremental h_sum + double new_h_sum = cur_h_sum + - dist(p, t_i) + dist(nb, t_i); + if (j_swap >= 0) { + const int t_j = target_positions[j_swap]; + new_h_sum += -dist(nb, t_j) + dist(p, t_j); + } + + const int new_g = g + 1; + + // Neighbor heuristic: incremental delta. Only edges incident + // to the affected virtual qubits change; everything else + // contributes the same dist as in the parent state. 
+ double new_nb_total = cur_nb_total; + int new_nb_arena_idx = -1; + uint64_t new_nb_hash = cur_nb_hash; + if (use_neighbor) { + const size_t parent_base = + static_cast(cur_nb_arena_idx) * nb_stride; + for (int z = 0; z < nb_stride; z++) { + nb_scratch[z] = nb_pos_flat[parent_base + z]; + } + int idx_nb_vq = -1, idx_p_vq = -1; + for (int z = 0; z < nb_stride; z++) { + const int phys = nb_scratch[z]; + if (phys == nb) idx_nb_vq = z; + else if (phys == p) idx_p_vq = z; + if (idx_nb_vq >= 0 && idx_p_vq >= 0) break; + } + if (idx_nb_vq >= 0 || idx_p_vq >= 0) { + double delta = 0.0; + auto accum = [&](int vq_idx, double sign) { + if (vq_idx < 0) return; + for (int e : vq_edges[vq_idx]) { + const auto& edge = neighbor_info->edges[e]; + delta += sign * edge.weight * dist( + nb_scratch[edge.u_idx], + nb_scratch[edge.v_idx]); + } + }; + accum(idx_nb_vq, -1.0); + accum(idx_p_vq, -1.0); + if (idx_nb_vq >= 0) nb_scratch[idx_nb_vq] = p; + if (idx_p_vq >= 0) nb_scratch[idx_p_vq] = nb; + accum(idx_nb_vq, +1.0); + accum(idx_p_vq, +1.0); + new_nb_total = cur_nb_total + delta; + new_nb_arena_idx = static_cast( + nb_pos_flat.size() / nb_stride); + nb_pos_flat.insert(nb_pos_flat.end(), + nb_scratch.begin(), + nb_scratch.end()); + // Incremental hash: XOR out old slots, XOR in new ones + if (idx_nb_vq >= 0) { + new_nb_hash ^= slot_hash(idx_nb_vq, nb) + ^ slot_hash(idx_nb_vq, p); + } + if (idx_p_vq >= 0) { + new_nb_hash ^= slot_hash(idx_p_vq, p) + ^ slot_hash(idx_p_vq, nb); + } + } else { + new_nb_arena_idx = cur_nb_arena_idx; + // new_nb_hash unchanged + } + } + + const uint64_t new_key = make_key(new_packed, new_nb_hash); + auto existing = best_node.find(new_key); + if (existing != best_node.end() + && arena[existing->second].g <= new_g + ) { + continue; + } + + // Insert/update node + Node n; + n.packed = new_packed; + n.parent_idx = idx; + n.g = new_g; + const int lo = std::min(p, nb); + const int hi = std::max(p, nb); + n.sw_lo = lo; n.sw_hi = hi; + n.h_sum = new_h_sum; + 
n.nb_total = new_nb_total; + n.nb_arena_idx = new_nb_arena_idx; + n.nb_hash = new_nb_hash; + + int32_t new_idx = static_cast(arena.size()); + arena.push_back(n); + best_node[new_key] = new_idx; + + const double f_new = static_cast(new_g) + + 0.5 * new_h_sum + + neighbor_scale * new_nb_total; + pq.push({f_new, new_g, counter++, new_idx}); + } + } + } + + // Failed to route (should not happen on a connected graph) + return {{}, pi}; +} + +// --------------------------------------------------------------------------- +// transform_pi +// --------------------------------------------------------------------------- + +std::pair>, std::vector> +SabreRouter::transform_pi( + const CandidateData& cand, + const std::vector& pi, + bool reverse, + SwapCache* swap_cache, + const NeighborInfo* neighbor_info +) const { + const std::vector& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv; + const std::vector& P_exit = reverse ? cand.P_i : cand.P_o; + + if (!cand.has_multi_qubit_body) { + std::vector dynamic_node_mapping(P_route_inv.size(), -1); + for (size_t i = 0; i < cand.qbit_map_keys_sorted.size(); i++) { + const int logical_q = cand.qbit_map_keys_sorted[i]; + const int qstar = cand.qbit_map_vals_sorted[i]; + dynamic_node_mapping[P_route_inv[qstar]] = pi[logical_q]; + } + + std::vector pi_output = pi; + for (size_t q_star = 0; q_star < P_exit.size(); q_star++) { + if (q_star < cand.qstar_to_q.size()) { + const int logical_q = cand.qstar_to_q[q_star]; + if (logical_q < 0) continue; + pi_output[logical_q] = dynamic_node_mapping[P_exit[q_star]]; + } + } + return {{}, std::move(pi_output)}; + } + + // Route qubits to input positions + auto [swaps, pi_routed] = find_constrained_swaps( + pi, + cand.qbit_map_keys_sorted, + cand.qbit_map_vals_sorted, + cand.node_mapping_flat, + P_route_inv, + swap_cache, + neighbor_info + ); + + // Update output positions using P_exit + std::vector pi_output = pi_routed; + + for (size_t q_star = 0; q_star < P_exit.size(); q_star++) { + if (q_star < 
cand.qstar_to_q.size()) { + int k = cand.qstar_to_q[q_star]; + if (k < 0) continue; + pi_output[k] = cand.node_mapping_flat[P_exit[q_star]]; + } + } + + return {swaps, pi_output}; +} + +// --------------------------------------------------------------------------- +// generate_extended_set (BFS lookahead) +// --------------------------------------------------------------------------- + +std::vector> SabreRouter::generate_extended_set( + const std::vector& F, + const std::vector& resolved, + const std::vector>& children_graph, + const std::vector>& parents_graph +) const { + std::vector> E; + std::vector in_E(num_partitions_, 0); + std::vector in_F(num_partitions_, 0); + for (int p : F) in_F[p] = 1; + + struct BFSNode { + int partition; + int depth; + }; + + for (int front_idx : F) { + if (static_cast(E.size()) >= config_.max_E_size) break; + + std::deque queue; + // Push without pre-checking; eligibility is tested when popped so a + // single-qubit partition can act as a transparent transit node. + for (int child : children_graph[front_idx]) { + queue.push_back({child, 1}); + } + + while (!queue.empty() && static_cast(E.size()) < config_.max_E_size) { + auto [part, depth] = queue.front(); + queue.pop_front(); + + if (depth > config_.max_lookahead) continue; + if (in_E[part] || in_F[part] || resolved[part]) continue; + + bool parents_ok = true; + for (int par : parents_graph[part]) { + if (!resolved[par] && !in_F[par]) { + parents_ok = false; + break; + } + } + if (!parents_ok) continue; + + if (layout_partitions_[part].is_single) { + // Single-qubit partitions act as transparent transit nodes: + // forward their grandchildren at the same depth. 
+ for (int child : children_graph[part]) { + queue.push_back({child, depth}); + } + continue; + } + + E.push_back({part, depth}); + in_E[part] = 1; + + if (depth < config_.max_lookahead) { + for (int child : children_graph[part]) { + queue.push_back({child, depth + 1}); + } + } + } + } + + return E; +} + +// --------------------------------------------------------------------------- +// Routing cost helpers +// --------------------------------------------------------------------------- + +double SabreRouter::entry_future_cost( + const CanonicalEntry& entry, + const std::vector& pi +) const { + double total = 0.0; + for (size_t i = 0; i < entry.edges_u.size(); i++) { + const double d = dist(pi[entry.edges_u[i]], pi[entry.edges_v[i]]); + if (d > 1.0) total += d - 1.0; + } + return total; +} + +double SabreRouter::future_partition_cost( + int partition_idx, + const std::vector& pi, + bool reverse, + const std::unordered_map& canonical_data +) const { + if ( + partition_idx >= 0 + && partition_idx < static_cast(candidate_cache_.size()) + && !candidate_cache_[partition_idx].empty() + && candidate_cache_[partition_idx].front().involved_qbits.size() >= 3 + ) { + double best = std::numeric_limits::infinity(); + for (const auto& cand : candidate_cache_[partition_idx]) { + best = std::min( + best, + static_cast(estimate_swap_count(cand, pi, reverse)) + ); + } + return best; + } + + auto it = canonical_data.find(partition_idx); + if (it == canonical_data.end()) { + return std::numeric_limits::infinity(); + } + return entry_future_cost(it->second, pi); +} + +double SabreRouter::future_context_cost( + int exclude_partition_idx, + const std::vector& pi, + const std::vector& F_snapshot, + const std::vector>& E, + bool reverse, + const std::unordered_map& canonical_data +) const { + // Candidate-aware lower bound: for each future partition, use the best + // available candidate entry cost under this layout. 
This lets 3q line + // blocks distinguish which logical qubit should sit on the path center. + double f_sum = 0.0; + int n_other = 0; + for (int p_idx : F_snapshot) { + if (p_idx == exclude_partition_idx) continue; + const double cost = future_partition_cost( + p_idx, pi, reverse, canonical_data); + if (!std::isfinite(cost)) continue; + f_sum += cost; + n_other++; + } + + double score = n_other > 0 + ? f_sum / static_cast(n_other) + : 0.0; + + if (!E.empty()) { + double e_sum = 0.0; + int e_count = 0; + for (auto [p_idx, depth] : E) { + if (p_idx == exclude_partition_idx) continue; + const double cost = future_partition_cost( + p_idx, pi, reverse, canonical_data); + if (!std::isfinite(cost)) continue; + const double alpha = + (depth >= 0 && depth < static_cast(alpha_weights_.size())) + ? alpha_weights_[depth] + : std::pow(config_.E_alpha, depth); + e_sum += alpha * cost; + e_count++; + } + if (e_count > 0) { + score += config_.E_weight * e_sum / static_cast(e_count); + } + } + + return score; +} + +std::vector SabreRouter::estimate_candidate_output_layout( + const CandidateData& cand, + const std::vector& pi, + bool reverse +) const { + if (!cand.has_multi_qubit_body) { + const std::vector& P_route_inv = reverse ? cand.P_o_inv : cand.P_i_inv; + const std::vector& P_exit = reverse ? cand.P_i : cand.P_o; + std::vector dynamic_node_mapping(P_route_inv.size(), -1); + for (size_t i = 0; i < cand.qbit_map_keys_sorted.size(); i++) { + const int logical_q = cand.qbit_map_keys_sorted[i]; + const int qstar = cand.qbit_map_vals_sorted[i]; + dynamic_node_mapping[P_route_inv[qstar]] = pi[logical_q]; + } + + std::vector pi_output = pi; + for (size_t q_star = 0; q_star < P_exit.size(); q_star++) { + if (q_star < cand.qstar_to_q.size()) { + const int logical_q = cand.qstar_to_q[q_star]; + if (logical_q < 0) continue; + pi_output[logical_q] = dynamic_node_mapping[P_exit[q_star]]; + } + } + return pi_output; + } + + const std::vector& P_exit = reverse ? 
cand.P_i : cand.P_o; + std::vector pi_output = pi; + + for (size_t q_star = 0; q_star < P_exit.size(); q_star++) { + if (q_star < cand.qstar_to_q.size()) { + int k = cand.qstar_to_q[q_star]; + if (k < 0) continue; + pi_output[k] = cand.node_mapping_flat[P_exit[q_star]]; + } + } + + return pi_output; +} + +// --------------------------------------------------------------------------- +// score_candidate (LightSABRE scoring) +// --------------------------------------------------------------------------- + +double SabreRouter::score_candidate( + const CandidateData& cand, + const std::vector& F_snapshot, + const std::vector& pi, + const std::vector>& E, + bool reverse, + const std::unordered_map& canonical_data, + SwapCache* swap_cache, + const std::vector* decay, + std::vector>* out_swaps, + std::vector* out_pi_new, + const NeighborInfo* cached_neighbor_info +) const { + NeighborInfo local_neighbor_info; + const NeighborInfo* neighbor_ptr; + if (cached_neighbor_info) { + neighbor_ptr = cached_neighbor_info->uses_tiebreak() ? cached_neighbor_info : nullptr; + } else { + local_neighbor_info = build_neighbor_info( + cand.partition_idx, F_snapshot, E, pi, canonical_data); + neighbor_ptr = local_neighbor_info.uses_tiebreak() ? 
&local_neighbor_info : nullptr; + } + auto [swaps, output_perm] = transform_pi( + cand, + pi, + reverse, + swap_cache, + neighbor_ptr + ); + + double decay_factor = 1.0; + if (decay != nullptr && !swaps.empty()) { + decay_factor = decay_factor_for_swaps(swaps, *decay); + } + double score = routing_objective( + static_cast(swaps.size()), + cand.cnot_count, + 1.0, + decay_factor + ); + + const int cand_idx = cand.partition_idx; + double future_score = future_context_cost( + cand_idx, + output_perm, + F_snapshot, + E, + reverse, + canonical_data + ); + if (cand.involved_qbits.size() >= 3) { + future_score *= config_.three_qubit_exit_weight; + } + score += future_score; + + if (out_swaps) *out_swaps = std::move(swaps); + if (out_pi_new) *out_pi_new = std::move(output_perm); + return score; +} + +// --------------------------------------------------------------------------- +// obtain_partition_candidates +// --------------------------------------------------------------------------- + +std::vector SabreRouter::obtain_partition_candidates( + const std::vector& F +) const { + std::vector result; + for (int p_idx : F) { + if (p_idx < 0 || p_idx >= num_partitions_) continue; + for (const auto& cand : candidate_cache_[p_idx]) { + result.push_back(&cand); + } + } + return result; +} + +// --------------------------------------------------------------------------- +// prefilter_candidates +// --------------------------------------------------------------------------- + +std::vector SabreRouter::prefilter_candidates( + const std::vector& candidates, + const std::vector& pi, + int top_k, + const std::vector& F_snapshot, + const std::vector>& E, + bool reverse, + const std::unordered_map& canonical_data +) const { + if (static_cast(candidates.size()) <= top_k) return candidates; + if (top_k <= 0) return {}; + + using Pair = std::pair; + std::vector estimated; + estimated.reserve(candidates.size()); + for (const auto* cand : candidates) { + const auto approx_output = 
estimate_candidate_output_layout( + *cand, pi, reverse); + const double est = routing_objective( + static_cast(estimate_swap_count(*cand, pi, reverse)), + cand->cnot_count + ) + future_context_cost( + cand->partition_idx, approx_output, F_snapshot, E, reverse, + canonical_data); + estimated.push_back({est, cand}); + } + + std::stable_sort( + estimated.begin(), + estimated.end(), + [](const Pair& a, const Pair& b) { + if (a.first != b.first) return a.first < b.first; + if (a.second->partition_idx != b.second->partition_idx) { + return a.second->partition_idx < b.second->partition_idx; + } + return a.second->candidate_idx < b.second->candidate_idx; + } + ); + + const int min_per_partition = + std::max(0, config_.prefilter_min_per_partition); + const int min_3q = std::max(0, config_.prefilter_min_3q); + + std::vector result; + result.reserve(std::min(static_cast(candidates.size()), top_k)); + std::unordered_set selected; + + if (min_per_partition > 0 || min_3q > 0) { + std::unordered_map quota_by_partition; + for (const auto& item : estimated) { + const CandidateData* cand = item.second; + int quota = min_per_partition; + if (cand->involved_qbits.size() >= 3) { + quota = std::max(quota, min_3q); + } + if (quota <= 0) continue; + auto it = quota_by_partition.find(cand->partition_idx); + if (it == quota_by_partition.end() || quota > it->second) { + quota_by_partition[cand->partition_idx] = quota; + } + } + + std::unordered_map selected_by_partition; + for (const auto& item : estimated) { + const CandidateData* cand = item.second; + auto quota_it = quota_by_partition.find(cand->partition_idx); + if (quota_it == quota_by_partition.end()) continue; + int& count = selected_by_partition[cand->partition_idx]; + if (count >= quota_it->second) continue; + result.push_back(cand); + selected.insert(cand); + count++; + } + } + + for (const auto& item : estimated) { + if (static_cast(result.size()) >= top_k) break; + const CandidateData* cand = item.second; + if 
(selected.find(cand) != selected.end()) continue; + result.push_back(cand); + selected.insert(cand); + } + return result; +} + +// --------------------------------------------------------------------------- +// select_best_candidate +// --------------------------------------------------------------------------- + +const CandidateData& SabreRouter::select_best_candidate( + const std::vector& candidates, + const std::vector& scores, + std::mt19937* rng +) const { + (void)rng; + + // Find minimum score + double min_score = scores[0]; + size_t min_idx = 0; + for (size_t i = 1; i < scores.size(); i++) { + if (scores[i] < min_score) { + min_score = scores[i]; + min_idx = i; + } + } + + return *candidates[min_idx]; +} + +// --------------------------------------------------------------------------- +// Boundary beam search helpers +// --------------------------------------------------------------------------- + +std::pair, std::vector> +SabreRouter::advance_layout_frontier( + int selected_partition_idx, + const std::vector& F, + const std::vector& resolved, + const std::vector>& children_graph, + const std::vector>& parents_graph +) const { + std::vector F_next(F); + std::vector resolved_next(resolved); + + F_next.erase( + std::remove(F_next.begin(), F_next.end(), selected_partition_idx), + F_next.end() + ); + if ( + selected_partition_idx >= 0 + && selected_partition_idx < static_cast(resolved_next.size()) + ) { + resolved_next[selected_partition_idx] = 1; + } + + std::deque stack; + for (int child : children_graph[selected_partition_idx]) { + stack.push_back(child); + } + + while (!stack.empty()) { + const int child = stack.front(); + stack.pop_front(); + + if (resolved_next[child]) continue; + if (std::find(F_next.begin(), F_next.end(), child) != F_next.end()) { + continue; + } + + bool parents_ok = true; + for (int parent : parents_graph[child]) { + if (!resolved_next[parent]) { + parents_ok = false; + break; + } + } + if (!parents_ok) continue; + + if 
(layout_partitions_[child].is_single) { + resolved_next[child] = 1; + for (int grandchild : children_graph[child]) { + stack.push_back(grandchild); + } + } else { + F_next.push_back(child); + } + } + + return {std::move(F_next), std::move(resolved_next)}; +} + +size_t SabreRouter::boundary_beam_select_index( + const std::vector& candidates, + const std::vector& scores, + const std::vector>>& cached_swaps, + const std::vector>& cached_pi, + const std::vector& F_snapshot, + const std::vector& resolved, + const std::vector>& children_graph, + const std::vector>& parents_graph, + bool reverse, + const std::unordered_map& canonical_data, + SwapCache* swap_cache +) const { + size_t fallback_idx = 0; + for (size_t i = 1; i < scores.size(); i++) { + if (scores[i] < scores[fallback_idx]) { + fallback_idx = i; + } + } + + const int beam_width = std::max(1, config_.boundary_beam_width); + const int beam_depth = std::max(1, config_.boundary_beam_depth); + if (beam_width <= 1 || beam_depth <= 1 || candidates.size() <= 1) { + return fallback_idx; + } + + bool has_three_qubit_candidate = false; + for (const auto* cand : candidates) { + if (cand->involved_qbits.size() >= 3) { + has_three_qubit_candidate = true; + break; + } + } + if (!has_three_qubit_candidate) { + return fallback_idx; + } + + struct BeamState { + double rank_cost; + double total_cost; + std::vector pi; + std::vector F; + std::vector resolved; + size_t first_idx; + }; + + auto transition_cost = [&](const CandidateData& cand, size_t idx) { + return routing_objective( + static_cast(cached_swaps[idx].size()), + cand.cnot_count + ); + }; + + auto sort_states = [](const BeamState& a, const BeamState& b) { + if (a.rank_cost != b.rank_cost) return a.rank_cost < b.rank_cost; + return a.first_idx < b.first_idx; + }; + + std::vector states; + states.reserve(candidates.size()); + for (size_t idx = 0; idx < candidates.size(); idx++) { + if (cached_pi[idx].empty()) continue; + const auto& cand = *candidates[idx]; + auto 
[F_next, resolved_next] = advance_layout_frontier( + cand.partition_idx, + F_snapshot, + resolved, + children_graph, + parents_graph + ); + const double trans_cost = transition_cost(cand, idx); + states.push_back(BeamState{ + scores[idx], + trans_cost, + cached_pi[idx], + std::move(F_next), + std::move(resolved_next), + idx + }); + } + + if (states.empty()) { + return fallback_idx; + } + std::sort(states.begin(), states.end(), sort_states); + if (static_cast(states.size()) > beam_width) { + states.resize(beam_width); + } + + for (int depth = 1; depth < beam_depth; depth++) { + std::vector expanded; + + for (const auto& state : states) { + if (state.F.empty()) { + expanded.push_back(BeamState{ + state.total_cost, + state.total_cost, + state.pi, + state.F, + state.resolved, + state.first_idx + }); + continue; + } + + auto E = generate_extended_set( + state.F, + state.resolved, + children_graph, + parents_graph + ); + + auto rollout_candidates = obtain_partition_candidates(state.F); + if (rollout_candidates.empty()) { + expanded.push_back(BeamState{ + state.total_cost, + state.total_cost, + state.pi, + state.F, + state.resolved, + state.first_idx + }); + continue; + } + + rollout_candidates = prefilter_candidates( + rollout_candidates, + state.pi, + config_.prefilter_top_k, + state.F, + E, + reverse, + canonical_data + ); + + for (const CandidateData* cand : rollout_candidates) { + NeighborInfo neighbor_info = build_neighbor_info( + cand->partition_idx, + state.F, + E, + state.pi, + canonical_data + ); + std::vector> swaps; + std::vector output_perm; + const double score = score_candidate( + *cand, + state.F, + state.pi, + E, + reverse, + canonical_data, + swap_cache, + nullptr, + &swaps, + &output_perm, + &neighbor_info + ); + const double trans_cost = routing_objective( + static_cast(swaps.size()), + cand->cnot_count + ); + const double future_cost = score - trans_cost; + const double new_total = state.total_cost + trans_cost; + const double rank_cost = new_total + 
future_cost; + + auto [F_next, resolved_next] = advance_layout_frontier( + cand->partition_idx, + state.F, + state.resolved, + children_graph, + parents_graph + ); + expanded.push_back(BeamState{ + rank_cost, + new_total, + std::move(output_perm), + std::move(F_next), + std::move(resolved_next), + state.first_idx + }); + } + } + + if (expanded.empty()) { + break; + } + std::sort(expanded.begin(), expanded.end(), sort_states); + if (static_cast(expanded.size()) > beam_width) { + expanded.resize(beam_width); + } + states = std::move(expanded); + } + + if (states.empty()) { + return fallback_idx; + } + return std::min_element(states.begin(), states.end(), sort_states)->first_idx; +} + +// --------------------------------------------------------------------------- +// heuristic_search (main loop) +// --------------------------------------------------------------------------- + +std::pair, double> SabreRouter::heuristic_search( + const std::vector& F_init, + std::vector pi, + bool reverse, + std::mt19937* rng, + const std::unordered_map& canonical_data, + const std::vector>& cg, + const std::vector>& pg, + ForwardRouteResult* route_trace +) const { + (void)rng; + + std::vector F; + std::vector queue; + std::vector resolved(num_partitions_, 0); + std::vector in_F(num_partitions_, 0); + double total_cost = 0.0; + + // Split F_init into F (multi-qubit) and queue (single-qubit) + for (int p : F_init) { + if (layout_partitions_[p].is_single) { + queue.push_back(p); + } else { + F.push_back(p); + in_F[p] = 1; + } + } + + // Flush initial single-qubit partitions + while (!queue.empty()) { + int p = queue.back(); + queue.pop_back(); + + if (resolved[p]) continue; + resolved[p] = 1; + if (route_trace) { + RouteStep step; + step.type = 2; + step.partition_idx = p; + if (!layout_partitions_[p].involved_qbits.empty()) { + step.physical_qubit = pi[layout_partitions_[p].involved_qbits[0]]; + } + route_trace->steps.push_back(std::move(step)); + } + + for (int child : cg[p]) { + if 
(!resolved[child] && !in_F[child]) { + bool parents_ok = true; + for (int par : pg[child]) { + if (!resolved[par]) { parents_ok = false; break; } + } + if (parents_ok) { + if (layout_partitions_[child].is_single) { + queue.push_back(child); + } else { + F.push_back(child); + in_F[child] = 1; + } + } + } + } + } + + // Swap cache for this search call (thread-local, on stack) + SwapCache swap_cache; + std::vector decay(N_, 1.0); + int swap_heavy_partitions = 0; + + // Main search loop + while (!F.empty()) { + if ( + config_.swap_burst_budget > 0 + && swap_heavy_partitions >= config_.swap_burst_budget + ) { + auto [valve_swaps, pi_bridged] = release_valve( + F, + pi, + canonical_data + ); + if (!valve_swaps.empty()) { + total_cost += routing_objective( + static_cast(valve_swaps.size()), + 0, + 1.0, + decay_factor_for_swaps(valve_swaps, decay) + ); + if (route_trace) { + RouteStep step; + step.type = 0; + step.swaps = valve_swaps; + route_trace->cnot_count += static_cast(valve_swaps.size()) * 3; + route_trace->steps.push_back(std::move(step)); + } + apply_decay_for_swaps(valve_swaps, decay); + pi = std::move(pi_bridged); + swap_heavy_partitions = 0; + continue; + } + reset_decay(decay); + swap_heavy_partitions = 0; + } + + auto all_candidates = obtain_partition_candidates(F); + if (all_candidates.empty()) break; + + // Generate extended set + auto E = generate_extended_set(F, resolved, cg, pg); + + // Prefilter with a cheap estimate of the candidate's future context. 
+ auto candidates = prefilter_candidates( + all_candidates, pi, config_.prefilter_top_k, F, E, reverse, + canonical_data); + + // Group candidates by partition_idx so build_neighbor_info is shared + std::vector order(candidates.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { + return candidates[a]->partition_idx < candidates[b]->partition_idx; + }); + + // Score all candidates and cache each one's transform output + std::vector scores(candidates.size()); + std::vector>> cached_swaps(candidates.size()); + std::vector> cached_pi(candidates.size()); + int prev_partition_idx = -1; + NeighborInfo cached_ni; + for (size_t k_ord = 0; k_ord < order.size(); k_ord++) { + const size_t ci = order[k_ord]; + const int p_idx = candidates[ci]->partition_idx; + if (p_idx != prev_partition_idx) { + cached_ni = build_neighbor_info(p_idx, F, E, pi, canonical_data); + prev_partition_idx = p_idx; + } + scores[ci] = score_candidate( + *candidates[ci], + F, pi, E, reverse, canonical_data, + &swap_cache, &decay, + &cached_swaps[ci], &cached_pi[ci], + &cached_ni + ); + } + + // Select best, optionally using boundary-layout beam rollout + const size_t best_ci = boundary_beam_select_index( + candidates, + scores, + cached_swaps, + cached_pi, + F, + resolved, + cg, + pg, + reverse, + canonical_data, + &swap_cache + ); + const auto& best = *candidates[best_ci]; + + // Remove from F and mark resolved + F.erase(std::remove(F.begin(), F.end(), best.partition_idx), F.end()); + in_F[best.partition_idx] = 0; + resolved[best.partition_idx] = 1; + + // Reuse cached transform from scoring (F_snapshot \ {best} == F_after_erase + // because exclude_partition_idx == best.partition_idx in both cases) + std::vector> swaps = std::move(cached_swaps[best_ci]); + std::vector pi_new = std::move(cached_pi[best_ci]); + const double decay_factor = swaps.empty() + ? 
1.0 + : decay_factor_for_swaps(swaps, decay); + total_cost += routing_objective( + static_cast(swaps.size()), + best.cnot_count, + 1.0, + decay_factor + ); + if (route_trace) { + if (!swaps.empty()) { + RouteStep swap_step; + swap_step.type = 0; + swap_step.swaps = swaps; + route_trace->cnot_count += static_cast(swaps.size()) * 3; + route_trace->steps.push_back(std::move(swap_step)); + } + RouteStep part_step; + part_step.type = 1; + part_step.partition_idx = best.partition_idx; + part_step.candidate_idx = best.candidate_idx; + route_trace->cnot_count += best.cnot_count; + route_trace->steps.push_back(std::move(part_step)); + } + pi = std::move(pi_new); + apply_decay_for_swaps(swaps, decay); + if (swaps.empty()) { + swap_heavy_partitions = 0; + reset_decay(decay); + } else { + swap_heavy_partitions++; + } + + // Update F with newly eligible children + for (int child : cg[best.partition_idx]) { + if (!resolved[child] && !in_F[child]) { + bool parents_ok = true; + for (int par : pg[child]) { + if (!resolved[par]) { parents_ok = false; break; } + } + + if (parents_ok) { + if (layout_partitions_[child].is_single) { + resolved[child] = 1; + if (route_trace) { + RouteStep step; + step.type = 2; + step.partition_idx = child; + if (!layout_partitions_[child].involved_qbits.empty()) { + step.physical_qubit = pi[layout_partitions_[child].involved_qbits[0]]; + } + route_trace->steps.push_back(std::move(step)); + } + std::vector stack; + for (int gc : cg[child]) stack.push_back(gc); + + while (!stack.empty()) { + int gc = stack.back(); + stack.pop_back(); + + if (!resolved[gc] && !in_F[gc]) { + bool gc_parents_ok = true; + for (int p_gc : pg[gc]) { + if (!resolved[p_gc]) { gc_parents_ok = false; break; } + } + if (gc_parents_ok) { + if (layout_partitions_[gc].is_single) { + resolved[gc] = 1; + if (route_trace) { + RouteStep step; + step.type = 2; + step.partition_idx = gc; + if (!layout_partitions_[gc].involved_qbits.empty()) { + step.physical_qubit = 
pi[layout_partitions_[gc].involved_qbits[0]]; + } + route_trace->steps.push_back(std::move(step)); + } + for (int ggc : cg[gc]) stack.push_back(ggc); + } else { + F.push_back(gc); + in_F[gc] = 1; + } + } + } + } + } else { + F.push_back(child); + in_F[child] = 1; + } + } + } + } + } + + return {pi, total_cost}; +} + +ForwardRouteResult SabreRouter::route_forward( + const std::vector& pi +) const { + ForwardRouteResult result; + result.pi_initial = pi; + auto F_fwd = get_initial_layer(); + auto routed = heuristic_search( + F_fwd, + pi, + false, + nullptr, + canonical_data_fwd_, + DAG_, + IDAG_, + &result + ); + result.pi = std::move(routed.first); + return result; +} + + +// --------------------------------------------------------------------------- +// run_trial (full implementation) +// --------------------------------------------------------------------------- + +TrialResult SabreRouter::run_trial( + int trial_idx, + const std::vector& seeded_pi, + int n_iterations, + int n_trials +) const { + // RNG setup + std::mt19937 rng_gen(config_.random_seed + trial_idx); + std::mt19937* rng = (n_trials > 1) ? &rng_gen : nullptr; + + std::vector pi = sample_initial_layout( + trial_idx, n_trials, seeded_pi, rng_gen + ); + + auto F_rev = get_final_layer(); + auto F_fwd = get_initial_layer(); + + // Forward-backward-forward iterations + for (int iteration = 0; iteration < n_iterations; iteration++) { + // Backward pass: swap DAG/IDAG + auto bwd_result = heuristic_search(F_rev, pi, true, rng, canonical_data_rev_, IDAG_, DAG_); + pi = std::move(bwd_result.first); + + // Forward pass (skip on last iteration) + if (iteration < n_iterations - 1) { + auto fwd_result = heuristic_search(F_fwd, pi, false, rng, canonical_data_fwd_, DAG_, IDAG_); + pi = std::move(fwd_result.first); + } + } + + // Deterministic evaluation pass on a copy of pi to score the trial. 
+ auto eval_result = heuristic_search(F_fwd, pi, false, nullptr, canonical_data_fwd_, DAG_, IDAG_); + double cost = eval_result.second; + + // Return the layout from AFTER the backward pass, BEFORE the eval pass. + return TrialResult{std::move(pi), cost}; +} + +} // namespace squander::routing diff --git a/squander/synthesis/PartAM.py b/squander/synthesis/PartAM.py new file mode 100644 index 000000000..24156b791 --- /dev/null +++ b/squander/synthesis/PartAM.py @@ -0,0 +1,3935 @@ +""" +This is an implementation of Partition Aware Mapping. +""" +import csv +import logging +import multiprocessing as mp +import os +import time +from collections import deque, defaultdict +from itertools import combinations, permutations +from multiprocessing import Pool +from typing import List, Optional + +import numpy as np +from tqdm import tqdm + +from squander.decomposition.qgd_N_Qubit_Decompositions_Wrapper import ( + qgd_N_Qubit_Decomposition_adaptive as N_Qubit_Decomposition_adaptive, + qgd_N_Qubit_Decomposition_Tree_Search as N_Qubit_Decomposition_Tree_Search, + qgd_N_Qubit_Decomposition_Tabu_Search as N_Qubit_Decomposition_Tabu_Search, +) +from squander.gates.qgd_Circuit import qgd_Circuit as Circuit +from squander.partitioning.ilp import ( + get_all_partitions, + _get_topo_order, + topo_sort_partitions, + ilp_global_optimal, +) +# Module-level globals for pool workers (set via Pool initializer) +_worker_config = None + +def _init_decompose_worker(config): + global _worker_config + _worker_config = config + +def _decompose_one(Umtx, mini_topology): + """Pool worker function. Uses config set once by initializer instead of + pickling it per task.""" + from squander.synthesis.PartAM import qgd_Partition_Aware_Mapping + return qgd_Partition_Aware_Mapping.DecomposePartition_and_Perm( + Umtx, _worker_config, mini_topology + ) + +def _available_cpus(): + """Return the number of CPUs available to this process. + + Respects affinity masks set by taskset, cgroups, SLURM, etc. 
class _DynamicMappedPartitionCandidate:
    """Partition candidate remapped to the route-time physical layout.

    Wraps an existing candidate, mirrors its index / CNOT-count attributes and
    keeps a private copy of the qubit mapping that is applied when the final
    circuit is requested.
    """

    def __init__(self, candidate, node_mapping):
        self.candidate = candidate
        # Copy the mapping so later changes by the caller do not leak in.
        self.node_mapping = dict(node_mapping)
        self.partition_idx = candidate.partition_idx
        self.topology_idx = candidate.topology_idx
        self.permutation_idx = candidate.permutation_idx
        self.cnot_count = candidate.cnot_count

    def get_final_circuit(self, optimized_partitions, N):
        """Return ``(remapped_circuit, parameters)`` for this candidate.

        The synthesised circuit selected by (topology_idx, permutation_idx) is
        flattened and remapped through ``node_mapping`` onto ``N`` qubits; the
        matching synthesised parameters are returned unchanged.
        """
        tdx = self.topology_idx
        pdx = self.permutation_idx
        synthesis = optimized_partitions[self.partition_idx]
        parameters = synthesis.synthesised_parameters[tdx][pdx]
        flat_circuit = synthesis.synthesised_circuits[tdx][pdx].get_Flat_Circuit()
        remapped = flat_circuit.Remap_Qbits(self.node_mapping, N)
        return remapped, parameters
"D": np.asarray(state["D"]), + "candidate_cache": state["candidate_cache"], + "n_iterations": state["n_iterations"], + "n_trials": state["n_trials"], + "random_seed": state["random_seed"], + } + + +def _run_layout_trial_worker(trial_idx): + state = _routing_worker_state + mapper = state["mapper"] + + return mapper._run_single_layout_trial( + trial_idx=trial_idx, + seeded_pi=state["seeded_pi"], + DAG=state["DAG"], + IDAG=state["IDAG"], + layout_partitions=state["layout_partitions"], + scoring_partitions=state["scoring_partitions"], + D=state["D"], + candidate_cache=state["candidate_cache"], + n_iterations=state["n_iterations"], + n_trials=state["n_trials"], + random_seed=state["random_seed"], + ) +# ============================================================================ +# Main Class: qgd_Partition_Aware_Mapping +# ============================================================================ + +class qgd_Partition_Aware_Mapping: + + # ------------------------------------------------------------------------ + # Initialization & Configuration + # ------------------------------------------------------------------------ + + def __init__(self, config): + self.topology = config['topology'] + self.config = config + self.config.setdefault('strategy', 'TreeSearch') + self.config.setdefault('parallel', 0 ) + self.config.setdefault('verbosity', 0 ) + self.config.setdefault('tolerance', 1e-8 ) + self.config.setdefault('test_subcircuits', False ) + self.config.setdefault('test_final_circuit', True ) + self.config.setdefault('max_partition_size', 3 ) + self.config.setdefault('pack_credit_weight', 0.0) + self.config.setdefault('topology', None) + self.config.setdefault('routed', False) + self.config.setdefault('optimizer', 'BFGS') + self.config.setdefault('use_osr', 0) + self.config.setdefault("use_graph_search", 0) + self.config.setdefault('n_layout_trials', 1) + self.config.setdefault('random_seed', 42) + self.config.setdefault('cleanup', True) + 
self.config.setdefault('prefilter_top_k', 50) + self.config.setdefault('prefilter_min_per_partition', 2) + self.config.setdefault('prefilter_min_3q', 12) + self.config.setdefault('cleanup_top_k', 3) + self.config.setdefault('decay_delta', 0.001) # Qiskit LightSABRE DECAY_RATE + self.config.setdefault('swap_burst_budget', 5) # Qiskit LightSABRE DECAY_RESET_INTERVAL + self.config.setdefault('path_tiebreak_weight', 0.2) + # Neighbor tie-breaker is added to A* f-values normalised to [0, 1]; + # must stay < 0.5 to preserve swap-count optimality. + if self.config['path_tiebreak_weight'] >= 0.5: + logging.warning( + "path_tiebreak_weight=%.3f ≥ 0.5 may override SWAP-count " + "optimality; clamping to 0.49.", + self.config['path_tiebreak_weight'], + ) + self.config['path_tiebreak_weight'] = 0.49 + self.config.setdefault('cnot_cost', 1.0 / 3.0) # 1 SWAP = 3 CNOTs + self.config.setdefault('three_qubit_exit_weight', 1.0) + self.config.setdefault('boundary_beam_width', 1) + self.config.setdefault('boundary_beam_depth', 1) + self.config.setdefault('layout_boundary_beam_width', None) + self.config.setdefault('layout_boundary_beam_depth', None) + self.config.setdefault('routing_trace_path', None) + self.config['partition_weight_model'] = 'window_turnover' + # ILP partition-selection weights. See _parts_to_window_turnover_weights + # for the full cost formula; defaults are calibrated against the + # synthesis-capacity / width-penalty pair below so saturation rewards + # match across widths. 
+ self.config.setdefault('partition_density_weight', 1.0) + self.config.setdefault('partition_boundary_weight', 0.9) + self.config.setdefault('partition_depth_balance_weight', 0.25) + self.config.setdefault('partition_depth_balance_exponent', 2.0) + self.config.setdefault('partition_triangle_weight', 1.5) + self.config.setdefault('partition_triangle_threshold', 0.6) + self.config.setdefault('partition_triangle_window_radius', 8) + self.config.setdefault('partition_synthesis_cost_weight', 1.0) + self.config.setdefault('partition_routing_span_weight', 2.0) + self.config.setdefault('partition_turnover_weight', 0.5) + # Penalises chain-shaped width>=3 blocks; 0.0 disables it. + self.config.setdefault('partition_chain_penalty_weight', 2.0) + self.config.setdefault('partition_min_cost', 0.05) + self.config.setdefault( + 'partition_width_penalties', + {1: 0.25, 2: 1.0, 3: 4.0, 4: 16.0}, + ) + # CNOT lower-bound synthesis budgets (Vidal–Dawson w=2, + # Shende–Markov–Bullock w=3, QSD w=4). + self.config.setdefault( + 'partition_synthesis_capacity', + {1: 1, 2: 3, 3: 14, 4: 61}, + ) + strategy = self.config['strategy'] + self.config.setdefault('parallel_layout_trials', False) + self.config.setdefault('layout_trial_workers', 0) + allowed_strategies = ['TreeSearch', 'TabuSearch', 'Adaptive'] + if not strategy in allowed_strategies: + raise Exception(f"The strategy should be either of {allowed_strategies}, got {strategy}.") + allowed_partition_weight_models = ['window_turnover', 'ilp'] + if self.config['partition_weight_model'] not in allowed_partition_weight_models: + raise Exception( + f"The partition_weight_model should be either of " + f"{allowed_partition_weight_models}, got " + f"{self.config['partition_weight_model']}." 
+ ) + + # Initialize caches for performance optimization + self._topology_cache = {} # {frozenset(edges): [topology_candidates]} + self._swap_cache = {} # {(pi_tuple, qbit_map_frozen): (swaps, output_perm)} + self._adj = None # Precomputed adjacency list (built by compute_distances_bfs) + self._decomp_cache = {} # {(rounded unitary bytes, topology): synthesis result} + + # ------------------------------------------------------------------------ + # Caching Methods + # ------------------------------------------------------------------------ + + def _get_subtopologies_of_type_cached(self, mini_topology): + """ + Cached version of get_subtopologies_of_type. + Uses canonical form of mini_topology as cache key. + """ + + # Create canonical form key + target_qubits = set() + for u, v in mini_topology: + target_qubits.add(u) + target_qubits.add(v) + if not target_qubits: + return [] + + # Use canonical form as cache key + canonical_key = get_canonical_form(target_qubits, mini_topology) + + if canonical_key not in self._topology_cache: + self._topology_cache[canonical_key] = get_subtopologies_of_type(self.topology, mini_topology) + + return self._topology_cache[canonical_key] + + # ------------------------------------------------------------------------ + # Static Synthesis Helpers (extracted from SynthesizeWideCircuit) + # ------------------------------------------------------------------------ + + @staticmethod + def _part_support_and_active_pairs(part, gate_dict): + qubits_in_part = set() + active_pairs = set() + for gate_idx in part: + gate = gate_dict.get(gate_idx) + if gate is None: + continue + qbs = list(gate.get_Involved_Qbits()) + qubits_in_part.update(qbs) + if len(qbs) < 2: + continue + for a in range(len(qbs)): + for b in range(a + 1, len(qbs)): + active_pairs.add( + (min(qbs[a], qbs[b]), max(qbs[a], qbs[b])) + ) + return frozenset(qubits_in_part), frozenset(active_pairs) + + @staticmethod + def _two_qubit_gate_pair(gate): + qbs = 
@staticmethod
def _synthesis_capacity(width, capacities=None):
    """Return the CNOT budget used to normalise a width-qubit block.

    A value configured in ``capacities`` (looked up by ``width`` and then by
    ``str(width)``) wins and is floored at 1. Without a configured value the
    built-in table applies:
        width <= 1 → 1   (no 2q gates; avoids division by zero in block_density)
        width 2 → 3      (Vidal–Dawson 2004, tight)
        width 3 → 14     (Shende–Markov–Bullock 2004 counting bound)
        width 4 → 61     (QSD recursion, practical upper bound)
    Larger widths extrapolate as 61 · 4^(w−4).
    """
    table = {1: 1, 2: 3, 3: 14, 4: 61} if capacities is None else capacities

    if isinstance(table, dict):
        configured = table.get(width)
        if configured is None:
            # Fall back to string keys such as "3".
            configured = table.get(str(width))
        if configured is not None:
            return float(max(configured, 1))

    if width <= 1:
        return 1.0
    builtin = {2: 3.0, 3: 14.0, 4: 61.0}
    if width in builtin:
        return builtin[width]
    return 61.0 * (4.0 ** (width - 4))
@staticmethod
def _boundary_two_qubit_gate_set(part, support, g, rg, gate_dict,
                                 max_partition_size):
    """Return the set of adjacent 2q gates this candidate leaves over a boundary.

    A gate index is collected when it is a successor (``g``) or predecessor
    (``rg``) of a gate in ``part``, is not itself in ``part``, is a
    two-qubit gate present in ``gate_dict``, shares at least one qubit with
    ``support`` and — when ``max_partition_size`` is not None — could be
    merged without the combined support exceeding that size.
    """
    own_qubits = set(support)
    pair_of = qgd_Partition_Aware_Mapping._two_qubit_gate_pair

    def _leaks_over_boundary(other_idx):
        # Gates already inside the candidate are not boundary gates.
        if other_idx in part:
            return False
        gate = gate_dict.get(other_idx)
        if gate is None or pair_of(gate) is None:
            return False
        other_qubits = set(gate.get_Involved_Qbits())
        if own_qubits.isdisjoint(other_qubits):
            return False
        if max_partition_size is None:
            return True
        return len(own_qubits | other_qubits) <= max_partition_size

    crossing = set()
    for gate_idx in part:
        adjacent = set(g.get(gate_idx, ())).union(rg.get(gate_idx, ()))
        crossing.update(idx for idx in adjacent if _leaks_over_boundary(idx))
    return crossing
@staticmethod
def _triangle_density_from_pair_counts(support, pair_counts):
    """Return a balanced local triangle score in [0, 1].

    Every 3-qubit subset of ``support`` is scored as
    ``3 · min(edge counts) / sum(edge counts)`` and the best score is
    returned. A subset with a missing edge (e.g. a chain) contributes
    nothing; three equally used edges score one, and skewed triangles are
    discounted by the weakest edge's share of the interactions.
    """
    qubits = sorted(support)
    if len(qubits) < 3:
        return 0.0

    best = 0.0
    for trio in combinations(qubits, 3):
        # Edge order: (a, b), (a, c), (b, c).
        edge_counts = [
            pair_counts.get((min(u, v), max(u, v)), 0)
            for u, v in combinations(trio, 2)
        ]
        weakest = min(edge_counts)
        total = sum(edge_counts)
        if weakest <= 0 or total <= 0:
            continue
        density = min((3.0 * weakest) / float(total), 1.0)
        if density > best:
            best = density
    return best
+ + The ILP accepts one linear cost per candidate part, so pairwise + interactions are approximated locally. Lower cost is better. + + Core cost terms: + * ``synthesis_cost_weight · width_penalty[width]`` — non-linear + penalty for synthesising a wider unitary block. + * ``− density_weight · (k_2q / synthesis_capacity[width])`` — + capacity-normalised density reward. Each width has the same + saturation level (1.0), implicitly pricing that wider partitions + don't compress to zero body CNOTs. + * ``boundary_weight · boundary_crossings`` — penalises adjacent 2q + gates left across this candidate's boundary. + * Triangle bonus (only above a density threshold), depth-balance + penalty, optional routing-span penalty (heavily weighted to make + topology-spread wide partitions visibly expensive), and optional + turnover penalty as documented per knob below. + + When ``topology_distances`` is supplied, also adds + ``routing_span_weight · Σ max(D[u,v]−1, 0)`` over the part's active 2q + pairs. When ``seed_layout`` is also supplied, ``D`` is permuted + through the layout so the span penalty reflects *physical* qubit + distance under the routing layer's chosen placement. When + ``turnover_weight`` is non-zero, also adds + ``turnover_weight · avg_turnover`` averaging + ``min(|supp_p|, |supp_q|) − |supp_p ∩ supp_q|`` over candidate + partitions ``q`` immediately downstream of ``p`` in the gate DAG. 
+ """ + cfg = {} if config is None else config + if max_partition_size is None: + max_partition_size = cfg.get("max_partition_size") + + density_weight = float(cfg.get("partition_density_weight", 4.0)) + boundary_weight = float(cfg.get("partition_boundary_weight", 0.9)) + depth_balance_weight = float( + cfg.get("partition_depth_balance_weight", 0.25) + ) + depth_balance_exponent = float( + cfg.get("partition_depth_balance_exponent", 2.0) + ) + triangle_weight = float(cfg.get("partition_triangle_weight", 2.5)) + triangle_threshold = float( + cfg.get("partition_triangle_threshold", 0.6) + ) + triangle_threshold = min(max(triangle_threshold, 0.0), 1.0) + triangle_window_radius = max( + int(cfg.get("partition_triangle_window_radius", 8)), + 0, + ) + synthesis_cost_weight = float( + cfg.get("partition_synthesis_cost_weight", 1.0) + ) + routing_span_weight = float( + cfg.get("partition_routing_span_weight", 0.0) + ) + turnover_weight = float(cfg.get("partition_turnover_weight", 0.0)) + chain_penalty_weight = float( + cfg.get("partition_chain_penalty_weight", 0.0) + ) + min_cost = float(cfg.get("partition_min_cost", 0.05)) + width_penalties = cfg.get("partition_width_penalties") + synthesis_capacities = cfg.get("partition_synthesis_capacity") + + use_routing_span = ( + topology_distances is not None and routing_span_weight + ) + if topology_distances is not None and seed_layout is not None: + pi_arr = np.asarray(seed_layout, dtype=int) + layout_distances = topology_distances[np.ix_(pi_arr, pi_arr)] + else: + layout_distances = topology_distances + inf_distance_cap = float( + max(len(layout_distances) - 1, 1) + ) if layout_distances is not None else 0.0 + + N = max(len(allparts), 1) + supports = [] + active_pairs_list = [] + for part in allparts: + support, active_pairs = ( + qgd_Partition_Aware_Mapping._part_support_and_active_pairs( + part, + gate_dict, + ) + ) + supports.append(support) + active_pairs_list.append(active_pairs) + + rg = {gate_idx: set() for gate_idx in 
g} + for src, dsts in g.items(): + for dst in dsts: + rg.setdefault(dst, set()).add(src) + + use_turnover = turnover_weight != 0.0 + if use_turnover: + gate_to_parts = defaultdict(list) + for idx, part in enumerate(allparts): + for gate_idx in part: + gate_to_parts[gate_idx].append(idx) + successor_gate_sets = [] + for part in allparts: + downstream = set() + for gate_idx in part: + downstream.update(g.get(gate_idx, ())) + successor_gate_sets.append(downstream) + else: + gate_to_parts = None + successor_gate_sets = None + + gate_to_qubit = { + gate_idx: set(gate.get_Involved_Qbits()) + for gate_idx, gate in gate_dict.items() + if gate is not None + } + topo_order = _get_topo_order(g, rg, gate_to_qubit) if g else [] + topo_index = {gate_idx: idx for idx, gate_idx in enumerate(topo_order)} + global_depth = max( + qgd_Partition_Aware_Mapping._restricted_longest_path_depth( + set(g), g, rg, topo_order + ), + 1, + ) + + + weights = [] + for part_idx, part in enumerate(allparts): + support = supports[part_idx] + width = len(support) + width_penalty = ( + qgd_Partition_Aware_Mapping._configured_width_penalty( + width, width_penalties + ) + ) + two_qubit_gate_count = ( + qgd_Partition_Aware_Mapping._part_two_qubit_gate_count( + part, gate_dict + ) + ) + block_density = ( + two_qubit_gate_count + / qgd_Partition_Aware_Mapping._synthesis_capacity( + width, synthesis_capacities + ) + ) + boundary_crossings = ( + qgd_Partition_Aware_Mapping._boundary_two_qubit_gate_count( + part, + support, + g, + rg, + gate_dict, + max_partition_size, + ) + ) + if use_routing_span: + span_cost = 0.0 + for u, v in active_pairs_list[part_idx]: + d = layout_distances[u][v] + if not np.isfinite(d): + d = inf_distance_cap + span_cost += max(float(d) - 1.0, 0.0) + else: + span_cost = 0.0 + if use_turnover: + avg_turnover = ( + qgd_Partition_Aware_Mapping._average_turnover( + part_idx, + part, + [successor_gate_sets[part_idx]], + gate_to_parts, + allparts, + supports, + ) + ) + turnover_cost = 0.0 
@staticmethod
def _cache_key(Umtx, mini_topology):
    """Build a hashable cache key from a unitary and a mini-topology.

    Args:
        Umtx: Array-like unitary; rounded to 10 decimals before hashing.
        mini_topology: Iterable of edges, or None for "no topology".

    Returns:
        ``(rounded_matrix_bytes, topo_key)`` where ``topo_key`` is the
        sorted tuple of sorted edges, so edge order and edge direction do
        not affect the key.
    """
    edges = () if mini_topology is None else mini_topology
    topo_key = tuple(sorted(tuple(sorted(e)) for e in edges))
    rounded = np.round(Umtx, decimals=10)
    # Rounding a tiny negative value yields -0.0, whose byte pattern differs
    # from +0.0. Adding a zero of the same dtype turns -0.0 into +0.0 so that
    # numerically identical matrices map to the same key.
    rounded = rounded + rounded.dtype.type(0)
    return (rounded.tobytes(), topo_key)
@staticmethod
def _add_result_with_auts(result, perm_pair, synth_circuit, synth_params,
                          topology_idx, N, mini_topology, known_pairs, pair_key,
                          use_auts, aut_cache):
    """Add a synthesis result and derive automorphism equivalents.

    The result for ``perm_pair`` is always recorded on ``result``. When
    ``use_auts`` is truthy, every non-identity automorphism of
    ``mini_topology`` is used to derive an equivalent result; derived
    permutation pairs not yet listed in ``known_pairs[pair_key]`` are
    recorded as well and remembered there.
    """
    result.add_result(perm_pair, synth_circuit, synth_params, topology_idx)
    if not use_auts:
        return

    seen_pairs = known_pairs.setdefault(pair_key, set())
    seen_pairs.add(perm_pair)

    P_i, P_o = perm_pair
    automorphisms = qgd_Partition_Aware_Mapping._get_auts(mini_topology, aut_cache)
    trivial = tuple(range(N))
    for sigma in automorphisms:
        # The identity automorphism reproduces the result already stored.
        if sigma == trivial:
            continue
        new_P_i, new_P_o, new_circ, new_params = derive_result_from_automorphism(
            sigma, P_i, P_o, synth_circuit, synth_params, N
        )
        derived_pair = (new_P_i, new_P_o)
        if derived_pair in seen_pairs:
            continue
        result.add_result(derived_pair, new_circ, new_params, topology_idx)
        seen_pairs.add(derived_pair)
def _build_scoring_partitions(self, optimized_partitions) -> List[Optional[PartitionScoreData]]:
    """
    Create lightweight, picklable views of partitions that contain only the
    data required during heuristic scoring.

    Single-qubit partitions are represented by ``None``; every other
    partition becomes a ``PartitionScoreData`` whose nested containers are
    converted to tuples (one entry per mini-topology).
    """

    def _as_pair(perm_pair):
        P_i, P_o = perm_pair
        return (tuple(P_i), tuple(P_o))

    def _per_topology(table, convert, topology_count):
        # One inner tuple per mini-topology index, items converted one by one.
        return tuple(
            tuple(convert(item) for item in table[tdx])
            for tdx in range(topology_count)
        )

    views: List[Optional[PartitionScoreData]] = []
    for partition in optimized_partitions:
        if isinstance(partition, SingleQubitPartitionResult):
            views.append(None)
            continue

        frozen_topologies = tuple(
            tuple(tuple(edge) for edge in mini_topology)
            for mini_topology in partition.mini_topologies
        )

        candidates_per_topology = []
        for tdx, mini_topology in enumerate(partition.mini_topologies):
            # Prefer candidates stored on the partition; otherwise compute
            # (and cache) them from the mini-topology.
            if hasattr(partition, "get_topology_candidates"):
                found = partition.get_topology_candidates(tdx)
            else:
                found = self._get_subtopologies_of_type_cached(mini_topology)
            candidates_per_topology.append(tuple(tuple(entry) for entry in found))

        topology_count = len(partition.mini_topologies)
        frozen_pairs = _per_topology(
            partition.permutations_pairs, _as_pair, topology_count
        )
        frozen_structures = _per_topology(
            partition.circuit_structures, tuple, topology_count
        )
        frozen_cnot_counts = _per_topology(
            partition.cnot_counts, int, topology_count
        )

        views.append(
            PartitionScoreData(
                mini_topologies=frozen_topologies,
                topology_candidates=tuple(candidates_per_topology),
                permutations_pairs=frozen_pairs,
                circuit_structures=frozen_structures,
                cnot_counts=frozen_cnot_counts,
                qubit_map=dict(partition.qubit_map),
                involved_qbits=tuple(partition.involved_qbits),
            )
        )
    return views
@staticmethod
def DecomposePartition_and_Perm(Umtx: np.ndarray, config: dict, mini_topology = None, max_retries: int = 5) -> tuple:
    """
    Call to decompose a partition. Retries up to max_retries times if the
    decomposition error exceeds the configured tolerance. Returns the
    best-error attempt across all retries and logs a warning when no
    attempt reaches ``config["tolerance"]``.

    Args:
        Umtx: Unitary of the partition; its conjugate transpose is handed
            to the decomposer.
        config: Options; reads ``tolerance``, ``strategy``, ``verbosity``
            and ``optimizer``.
        mini_topology: Topology passed to the decomposer (may be None).
        max_retries: Maximum number of decomposition attempts.

    Returns:
        ``(best_circuit, best_params, best_err)``. The first two entries
        are None when no attempt produced a finite improvement (e.g.
        ``max_retries <= 0``), in which case ``best_err`` is ``inf``.

    Raises:
        Exception: If ``config["strategy"]`` is not a supported strategy
            (raised on the first attempt).
    """
    tolerance = config["tolerance"]
    strategy = config["strategy"]

    best_err = float('inf')
    best_circuit = None
    best_params = None

    for _ in range(max_retries):
        if strategy == "TreeSearch":
            cDecompose = N_Qubit_Decomposition_Tree_Search(Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
        elif strategy == "TabuSearch":
            cDecompose = N_Qubit_Decomposition_Tabu_Search(Umtx.conj().T, config=config, accelerator_num=0, topology=mini_topology)
        elif strategy == "Adaptive":
            cDecompose = N_Qubit_Decomposition_adaptive(Umtx.conj().T, level_limit_max=5, level_limit_min=1, topology=mini_topology)
        else:
            raise Exception(f"Unsupported decomposition type: {strategy}")
        cDecompose.set_Verbose(config["verbosity"])
        cDecompose.set_Cost_Function_Variant(3)
        cDecompose.set_Optimization_Tolerance(tolerance)
        cDecompose.set_Optimizer(config["optimizer"])
        cDecompose.Start_Decomposition()

        # Keep the best attempt seen so far, not merely the last one.
        err = cDecompose.get_Decomposition_Error()
        if err < best_err:
            best_err = err
            best_circuit = cDecompose.get_Circuit()
            best_params = cDecompose.get_Optimized_Parameters()

        if best_err <= tolerance:
            break

    # Emit the warning the documented contract promises; the caller still
    # receives best_err and decides how to proceed.
    if not best_err <= tolerance:
        logging.warning(
            "Decomposition did not reach tolerance %g within %d attempt(s); "
            "best error %g.",
            tolerance, max_retries, best_err,
        )

    return best_circuit, best_params, best_err
+ + Args: + circ: The full quantum circuit (must be flat — no subcircuit blocks) + orig_parameters: Parameters for circ + + Returns: + optimized_partitions: List of PartitionSynthesisResult / + SingleQubitPartitionResult, in topological order. + """ + working_circ = circ + working_parameters = orig_parameters + qbit_num = circ.get_Qbit_Num() + + # ---- Phase 0: Compute distance matrix ---- + D = self.compute_distances_bfs(qbit_num) + + # ---- Phase 0b: Compute seed layout for layout-aware scoring ---- + # Empty partitions list makes _compute_seeded_layout skip the + # partition-weighted greedy fallback; it returns identity if VF2 + # and SabrePreLayout-augmented VF2 both fail (safe no-op). + seed_layout = self._compute_seeded_layout([], D, qbit_num, working_circ) + + # ---- Phase 1: Partition enumeration ---- + allparts, g, go, rgo, single_qubit_chains, gate_to_qubit, gate_to_tqubit = get_all_partitions(working_circ, self.config["max_partition_size"]) + gate_dict = {i: gate for i, gate in enumerate(working_circ.get_Gates())} + + single_qubit_chains_pre = {x[0]: x for x in single_qubit_chains if rgo[x[0]]} + single_qubit_chains_post = {x[-1]: x for x in single_qubit_chains if go[x[-1]]} + single_qubit_chains_prepost = {x[0]: x for x in single_qubit_chains if x[0] in single_qubit_chains_pre and x[-1] in single_qubit_chains_post} + + # ---- Phase 2: ILP partition selection ---- + # PartAM keeps one partitioning strategy: window_turnover. 
+ ilp_weights = self._parts_to_window_turnover_weights( + allparts, + gate_dict, + g, + pack_credit_weight=self.config['pack_credit_weight'], + config=self.config, + max_partition_size=self.config["max_partition_size"], + topology_distances=D, + seed_layout=seed_layout, + ) + partition_weight_model = self.config['partition_weight_model'] + if partition_weight_model == 'ilp': + ilp_weights = None + L_parts, _ = ilp_global_optimal(allparts, g, weights=ilp_weights) + + # ---- Phase 3: Build gate sets for selected partitions (+ standalone chains) ---- + selected_surrounded_starts = set() + selected_parts_gates = [] + for i in L_parts: + part = allparts[i] + surrounded = {t for s in part for t in go[s] + if t in single_qubit_chains_prepost + and go[single_qubit_chains_prepost[t][-1]] + and next(iter(go[single_qubit_chains_prepost[t][-1]])) in part} + gates = frozenset.union(part, *(single_qubit_chains_prepost[v] for v in surrounded)) + selected_parts_gates.append(gates) + selected_surrounded_starts.update(surrounded) + + standalone_chains = [] + for chain in single_qubit_chains: + if chain[0] not in selected_surrounded_starts: + selected_parts_gates.append(frozenset(chain)) + standalone_chains.append(chain) + + n_multi = len(L_parts) + + size_counts = {} + for gates in selected_parts_gates: + involved = set() + for g in gates: + involved.update(gate_dict[g].get_Involved_Qbits()) + size = len(involved) + size_counts[size] = size_counts.get(size, 0) + 1 + self._selected_partition_counts = dict(size_counts) + if self.config.get('verbosity', 0) > 0: + selected_multi = sum( + count for size, count in size_counts.items() if size > 1 + ) + logging.info( + "Selected partitions: 2-qubit=%d, 3-qubit=%d, total_multi=%d", + size_counts.get(2, 0), + size_counts.get(3, 0), + selected_multi, + ) + + # ---- Phase 4: Assemble partitioned circuit from selected partitions only ---- + partitioned_circuit = Circuit(qbit_num) + params = [] + + for gates in selected_parts_gates[:n_multi]: + c 
= Circuit(qbit_num) + for gate_idx in _get_topo_order({x: go[x] & gates for x in gates}, + {x: rgo[x] & gates for x in gates}, + gate_to_qubit): + c.add_Gate(gate_dict[gate_idx]) + start = gate_dict[gate_idx].get_Parameter_Start_Index() + params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()]) + partitioned_circuit.add_Circuit(c) + + for chain in standalone_chains: + c = Circuit(qbit_num) + for gate_idx in chain: + c.add_Gate(gate_dict[gate_idx]) + start = gate_dict[gate_idx].get_Parameter_Start_Index() + params.append(working_parameters[start:start + gate_dict[gate_idx].get_Parameter_Num()]) + partitioned_circuit.add_Circuit(c) + + parameters = np.concatenate(params, axis=0) + + # ---- Phase 5: SeqPAM synthesis on selected partitions only ---- + subcircuits = partitioned_circuit.get_Gates() + optimized_results = [None] * len(subcircuits) + partition_meta = [] + for partition_idx, subcircuit in enumerate(subcircuits): + start_idx = subcircuit.get_Parameter_Start_Index() + end_idx = start_idx + subcircuit.get_Parameter_Num() + subcircuit_parameters = parameters[start_idx:end_idx] + involved_qbits = subcircuit.get_Qbits() + qbit_num_sub = len(involved_qbits) + qbit_map = {involved_qbits[idx]: idx for idx in range(len(involved_qbits))} + remapped_subcircuit = subcircuit.Remap_Qbits(qbit_map, qbit_num_sub) + + if qbit_num_sub == 1: + optimized_results[partition_idx] = SingleQubitPartitionResult( + remapped_subcircuit, subcircuit_parameters, + original_qubits=list(involved_qbits) + ) + partition_meta.append(None) + else: + mini_topologies = get_unique_subtopologies(self.topology, qbit_num_sub) + partition_meta.append({ + 'N': qbit_num_sub, + 'circuit': remapped_subcircuit, + 'params': subcircuit_parameters, + 'mini_topologies': mini_topologies, + 'involved_qbits': involved_qbits, + 'qbit_map': qbit_map, + }) + + results_map = self._run_parallel_synthesis(partition_meta) + for partition_idx, result in results_map.items(): + 
optimized_results[partition_idx] = result + + # ---- Phase 6: Topologically order selected partitions ---- + L = topo_sort_partitions(working_circ, selected_parts_gates) + return [optimized_results[idx] for idx in L] + + def _run_parallel_synthesis(self, partition_meta): + """Phase 2: Run parallel synthesis for all multi-qubit partitions. + + Args: + partition_meta: List of per-partition dicts (None for single-qubit partitions). + + Returns: + results_map: Dict mapping partition_idx to PartitionSynthesisResult. + """ + n_cpus = _available_cpus() + use_auts = self.config.get('use_automorphisms', True) + disable_pbar = self.config.get('progressbar', 0) == False + aut_cache = {} + decomp_cache = self._decomp_cache + + with Pool(processes=n_cpus, initializer=_init_decompose_worker, + initargs=(self.config,)) as pool: + # Initialize PartitionSynthesisResult for each multi-qubit partition + results_map = {} + for partition_idx, meta in enumerate(partition_meta): + if meta is None: + continue + results_map[partition_idx] = PartitionSynthesisResult( + meta['N'], meta['mini_topologies'], meta['involved_qbits'], + meta['qbit_map'], + ) + + # ---- Stage 1: sweep all boundary permutations for small partitions ---- + # For N<=3 the full (P_i, P_o) space is at most 36 pairs. Routing + # needs that complete boundary-state set; otherwise 3q partitions + # expose less layout freedom than 2q partitions. 
+ stage1_futures = [] + stage1_cached = [] + known_pairs = {} + full_enum_keys = set() # (partition_idx, topology_idx) fully covered in S1 + + for partition_idx, meta in enumerate(partition_meta): + if meta is None: + continue + N = meta['N'] + perms_all = list(permutations(range(N))) + for topology_idx, mini_topology in enumerate(meta['mini_topologies']): + if N <= 3: + full_enum_keys.add((partition_idx, topology_idx)) + po_sweep = perms_all + else: + po_sweep = [perms_all[np.random.choice(len(perms_all))]] + for P_o in po_sweep: + for P_i in perms_all: + Umtx = self._build_permuted_unitary(meta, P_i, P_o) + ck = self._cache_key(Umtx, mini_topology) + if ck in decomp_cache: + stage1_cached.append((partition_idx, topology_idx, P_i, P_o, ck)) + else: + future = pool.apply_async( + _decompose_one, (Umtx, mini_topology) + ) + stage1_futures.append((partition_idx, topology_idx, P_i, P_o, ck, future)) + + # Process Stage 1 cache hits immediately + for partition_idx, topology_idx, P_i, P_o, ck in stage1_cached: + meta = partition_meta[partition_idx] + N = meta['N'] + mini_topology = meta['mini_topologies'][topology_idx] + synth_circuit, synth_params, synth_err = decomp_cache[ck] + if synth_err <= self.config['tolerance']: + pair_key = (partition_idx, topology_idx) + self._add_result_with_auts( + results_map[partition_idx], (P_i, P_o), + synth_circuit, synth_params, topology_idx, + N, mini_topology, known_pairs, pair_key, use_auts, aut_cache + ) + + # Collect Stage 1 pool results + cache_hits_s1 = len(stage1_cached) + for partition_idx, topology_idx, P_i, P_o, ck, future in tqdm( + stage1_futures, desc=f"Stage 1 Synthesis ({cache_hits_s1} cached)", + disable=disable_pbar + ): + synth_circuit, synth_params, synth_err = future.get() + decomp_cache[ck] = (synth_circuit, synth_params, synth_err) + meta = partition_meta[partition_idx] + N = meta['N'] + mini_topology = meta['mini_topologies'][topology_idx] + if synth_err <= self.config['tolerance']: + pair_key = (partition_idx, 
topology_idx) + self._add_result_with_auts( + results_map[partition_idx], (P_i, P_o), + synth_circuit, synth_params, topology_idx, + N, mini_topology, known_pairs, pair_key, use_auts, aut_cache + ) + + # ---- Stage 2: fix top-k P_i from Stage 1, sweep all P_o ---- + # Skipped for partitions already fully enumerated in Stage 1 + # (currently all N<=3 partitions). + top_k_pi = self.config.get('top_k_pi', 1) + stage2_futures = [] + stage2_cached = [] + + for partition_idx, meta in enumerate(partition_meta): + if meta is None: + continue + N = meta['N'] + perms_all = list(permutations(range(N))) + result = results_map[partition_idx] + for topology_idx, mini_topology in enumerate(meta['mini_topologies']): + if (partition_idx, topology_idx) in full_enum_keys: + continue + pair_key = (partition_idx, topology_idx) + kp = known_pairs.get(pair_key, set()) if use_auts else set() + for P_i_cand in result.get_top_k_results(topology_idx, top_k_pi): + for P_o in perms_all: + if use_auts and (tuple(P_i_cand), P_o) in kp: + continue + Umtx = self._build_permuted_unitary(meta, P_i_cand, P_o) + ck = self._cache_key(Umtx, mini_topology) + if ck in decomp_cache: + stage2_cached.append((partition_idx, topology_idx, P_i_cand, P_o, ck)) + else: + future = pool.apply_async( + _decompose_one, (Umtx, mini_topology) + ) + stage2_futures.append((partition_idx, topology_idx, P_i_cand, P_o, ck, future)) + + # Process Stage 2 cache hits + for partition_idx, topology_idx, P_i_cand, P_o, ck in stage2_cached: + meta = partition_meta[partition_idx] + N = meta['N'] + mini_topology = meta['mini_topologies'][topology_idx] + synth_circuit, synth_params, synth_err = decomp_cache[ck] + if synth_err <= self.config['tolerance']: + pair_key = (partition_idx, topology_idx) + self._add_result_with_auts( + results_map[partition_idx], (tuple(P_i_cand), P_o), + synth_circuit, synth_params, topology_idx, + N, mini_topology, known_pairs, pair_key, use_auts, aut_cache + ) + + # Collect Stage 2 pool results + 
cache_hits_s2 = len(stage2_cached) + for partition_idx, topology_idx, P_i_cand, P_o, ck, future in tqdm( + stage2_futures, desc=f"Stage 2 Synthesis ({cache_hits_s2} cached)", + disable=disable_pbar + ): + synth_circuit, synth_params, synth_err = future.get() + decomp_cache[ck] = (synth_circuit, synth_params, synth_err) + meta = partition_meta[partition_idx] + N = meta['N'] + mini_topology = meta['mini_topologies'][topology_idx] + if synth_err <= self.config['tolerance']: + pair_key = (partition_idx, topology_idx) + self._add_result_with_auts( + results_map[partition_idx], (tuple(P_i_cand), P_o), + synth_circuit, synth_params, topology_idx, + N, mini_topology, known_pairs, pair_key, use_auts, aut_cache + ) + + # Qiskit routing fallback: for any (partition, topology) pair where all + # synthesis attempts failed (no results stored), route the original circuit + # with Qiskit and add the result with identity P_i/P_o permutations. + qiskit_fallback_cache = {} + for partition_idx, meta in enumerate(partition_meta): + if meta is None: + continue + N = meta['N'] + for topology_idx, mini_topology in enumerate(meta['mini_topologies']): + if results_map[partition_idx].permutations_pairs[topology_idx]: + continue + fkey = (partition_idx, topology_idx) + if fkey not in qiskit_fallback_cache: + fb_circuit, fb_params = self._qiskit_routing_fallback(meta, mini_topology) + qiskit_fallback_cache[fkey] = (fb_circuit, fb_params) + fb_circuit, fb_params = qiskit_fallback_cache[fkey] + if fb_circuit is None: + logging.warning( + "Partition %d topology_idx %d: synthesis failed and Qiskit " + "fallback unavailable; no result for this combination.", + partition_idx, topology_idx, + ) + continue + identity = tuple(range(N)) + results_map[partition_idx].add_result( + (identity, identity), fb_circuit, fb_params, topology_idx + ) + + return results_map + + # ------------------------------------------------------------------------ + # Main Public API + # 
------------------------------------------------------------------------ + def _run_single_layout_trial( + self, + trial_idx, + seeded_pi, + DAG, + IDAG, + layout_partitions, + scoring_partitions, + D, + candidate_cache, + n_iterations, + n_trials, + random_seed, + ): + N = len(seeded_pi) + rng = ( + np.random.RandomState(random_seed + trial_idx) + if n_trials > 1 + else None + ) + pi = self._sample_initial_layout(trial_idx, n_trials, seeded_pi, rng) + + for iteration in range(n_iterations): + F_rev = self.get_final_layer(DAG, N, layout_partitions) + pi, _ = self._heuristic_search_layout_only( + F_rev, + pi, + IDAG, + DAG, + layout_partitions, + scoring_partitions, + D, + rng=rng, + reverse=True, + candidate_cache=candidate_cache, + ) + + if iteration < n_iterations - 1: + F_fwd = self.get_initial_layer(IDAG, N, layout_partitions) + pi, _ = self._heuristic_search_layout_only( + F_fwd, + pi, + DAG, + IDAG, + layout_partitions, + scoring_partitions, + D, + rng=rng, + candidate_cache=candidate_cache, + ) + + F_eval = self.get_initial_layer(IDAG, N, layout_partitions) + _, cost = self._heuristic_search_layout_only( + F_eval, + pi.copy(), + DAG, + IDAG, + layout_partitions, + scoring_partitions, + D, + rng=None, + candidate_cache=candidate_cache, + ) + return cost, pi + + + def _run_layout_trials( + self, + seeded_pi, + DAG, + IDAG, + layout_partitions, + scoring_partitions, + D, + candidate_cache, + n_iterations, + n_trials, + random_seed, + ): + use_cpp = self.config.get('use_cpp_router', True) + if use_cpp: + return self._run_layout_trials_cpp( + seeded_pi, DAG, IDAG, layout_partitions, + scoring_partitions, D, candidate_cache, + n_iterations, n_trials, random_seed, + ) + + trial_indices = list(range(max(1, n_trials))) + use_parallel = ( + self.config.get("parallel_layout_trials", False) + and len(trial_indices) > 1 + ) + + if not use_parallel: + return [ + self._run_single_layout_trial( + trial_idx=trial_idx, + seeded_pi=seeded_pi, + DAG=DAG, + IDAG=IDAG, + 
layout_partitions=layout_partitions, + scoring_partitions=scoring_partitions, + D=D, + candidate_cache=candidate_cache, + n_iterations=n_iterations, + n_trials=n_trials, + random_seed=random_seed, + ) + for trial_idx in trial_indices + ] + + workers = self.config.get("layout_trial_workers", 0) + if workers <= 0: + workers = min(len(trial_indices), _available_cpus()) + + worker_state = { + "config": dict(self.config), + "adj": tuple(tuple(neighbors) for neighbors in self._adj), + "seeded_pi": np.asarray(seeded_pi), + "DAG": DAG, + "IDAG": IDAG, + "layout_partitions": layout_partitions, + "scoring_partitions": scoring_partitions, + "D": np.asarray(D), + "candidate_cache": candidate_cache, + "n_iterations": n_iterations, + "n_trials": n_trials, + "random_seed": random_seed, + } + + with Pool( + processes=workers, + initializer=_init_layout_trial_worker, + initargs=(worker_state,), + ) as pool: + return pool.map(_run_layout_trial_worker, trial_indices) + + def _run_layout_trials_cpp( + self, + seeded_pi, + DAG, + IDAG, + layout_partitions, + scoring_partitions, + D, + candidate_cache, + n_iterations, + n_trials, + random_seed, + ): + from squander.synthesis._sabre_router import SabreRouter, SabreConfig + + route_beam_width = self.config.get('boundary_beam_width', 1) + route_beam_depth = self.config.get('boundary_beam_depth', 1) + layout_beam_width = self.config.get( + 'layout_boundary_beam_width', route_beam_width + ) + layout_beam_depth = self.config.get( + 'layout_boundary_beam_depth', route_beam_depth + ) + if layout_beam_width is None: + layout_beam_width = route_beam_width + if layout_beam_depth is None: + layout_beam_depth = route_beam_depth + + def make_cpp_config(beam_width, beam_depth): + cfg = SabreConfig() + cfg.prefilter_top_k = self.config.get('prefilter_top_k', 50) + if hasattr(cfg, 'prefilter_min_per_partition'): + cfg.prefilter_min_per_partition = self.config.get( + 'prefilter_min_per_partition', 2 + ) + if hasattr(cfg, 'prefilter_min_3q'): + 
cfg.prefilter_min_3q = self.config.get('prefilter_min_3q', 12) + cfg.max_E_size = self.config.get('max_E_size', 20) + cfg.max_lookahead = self.config.get('max_lookahead', 4) + cfg.E_weight = self.config.get('E_weight', 0.5) + cfg.E_alpha = self.config.get('E_alpha', 1.0) + cfg.cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0) + cfg.sabre_iterations = n_iterations + cfg.n_layout_trials = max(1, n_trials) + cfg.random_seed = random_seed + cfg.decay_delta = self.config.get('decay_delta', 0.001) + cfg.swap_burst_budget = self.config.get('swap_burst_budget', 5) + cfg.path_tiebreak_weight = self.config.get( + 'path_tiebreak_weight', 0.2 + ) + if hasattr(cfg, 'three_qubit_exit_weight'): + cfg.three_qubit_exit_weight = self.config.get( + 'three_qubit_exit_weight', 1.0 + ) + if hasattr(cfg, 'boundary_beam_width'): + cfg.boundary_beam_width = beam_width + if hasattr(cfg, 'boundary_beam_depth'): + cfg.boundary_beam_depth = beam_depth + return cfg + + layout_cfg = make_cpp_config(layout_beam_width, layout_beam_depth) + route_cfg = make_cpp_config(route_beam_width, route_beam_depth) + use_distinct_route_router = ( + layout_beam_width != route_beam_width + or layout_beam_depth != route_beam_depth + ) + self._routing_layout_boundary_beam = ( + int(layout_beam_width), + int(layout_beam_depth), + ) + self._routing_boundary_beam = ( + int(route_beam_width), + int(route_beam_depth), + ) + canonical_fwd = self._build_canonical_neighbor_data( + scoring_partitions, reverse=False + ) + canonical_rev = self._build_canonical_neighbor_data( + scoring_partitions, reverse=True + ) + + # Convert candidate_cache: list of tuples -> list of lists + candidate_cache_lists = [list(cands) for cands in candidate_cache] + + # Convert layout_partitions: list of dicts with tuple involved_qbits + layout_partitions_lists = [ + {'is_single': lp['is_single'], 'involved_qbits': list(lp['involved_qbits'])} + for lp in layout_partitions + ] + + trial_router = SabreRouter( + layout_cfg, D, self._adj, DAG, 
IDAG, + candidate_cache_lists, layout_partitions_lists, + canonical_fwd, canonical_rev, + ) + router = trial_router + if use_distinct_route_router: + router = SabreRouter( + route_cfg, D, self._adj, DAG, IDAG, + candidate_cache_lists, layout_partitions_lists, + canonical_fwd, canonical_rev, + ) + + seeded_pi_list = [int(x) for x in seeded_pi] + n_trials_actual = max(1, n_trials) + trial_indices = list(range(n_trials_actual)) + + use_parallel = ( + self.config.get("parallel_layout_trials", False) + and n_trials_actual > 1 + ) + + if not use_parallel: + self._routing_layout_trial_workers = 1 + layout_trials_t0 = time.time() + trial_results = [ + trial_router.run_trial( + idx, seeded_pi_list, n_iterations, n_trials_actual + ) + for idx in trial_indices + ] + else: + from concurrent.futures import ThreadPoolExecutor + workers = self.config.get("layout_trial_workers", 0) + if workers <= 0: + workers = min(n_trials_actual, _available_cpus()) + + self._routing_layout_trial_workers = workers + layout_trials_t0 = time.time() + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = [ + pool.submit( + trial_router.run_trial, + idx, + seeded_pi_list, + n_iterations, + n_trials_actual, + ) + for idx in trial_indices + ] + trial_results = [f.result() for f in futures] + self._routing_layout_trials_time = time.time() - layout_trials_t0 + + heuristic_ranked = sorted(trial_results, key=lambda x: x[0]) + actual_rank_default = min( + max(1, self.config.get("cleanup_top_k", 3) * 2), + n_trials_actual, + ) + actual_rank_top_k = self.config.get( + "actual_routing_rank_top_k", actual_rank_default + ) + if actual_rank_top_k is None or actual_rank_top_k <= 0: + actual_rank_top_k = len(heuristic_ranked) + actual_rank_top_k = min(int(actual_rank_top_k), len(heuristic_ranked)) + + actual_rank_inputs = heuristic_ranked[:actual_rank_top_k] + self._routing_actual_rank_count = len(actual_rank_inputs) + + def route_rank_input(item): + heuristic_cost, trial_pi = item + actual_cnot, 
pi_out, pi_init, steps = router.route_forward( + [int(x) for x in trial_pi] + ) + return (actual_cnot, pi_out, heuristic_cost, pi_init, steps) + + use_parallel_actual_routing = ( + self.config.get("parallel_layout_trials", False) + and len(actual_rank_inputs) > 1 + ) + actual_rank_t0 = time.time() + if use_parallel_actual_routing: + from concurrent.futures import ThreadPoolExecutor + workers = self.config.get("layout_trial_workers", 0) + if workers <= 0: + workers = min(len(actual_rank_inputs), _available_cpus()) + + self._routing_actual_rank_workers = workers + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = [ + pool.submit(route_rank_input, item) + for item in actual_rank_inputs + ] + ranked = [f.result() for f in futures] + else: + self._routing_actual_rank_workers = 1 + ranked = [ + route_rank_input(item) + for item in actual_rank_inputs + ] + self._routing_actual_rank_time = time.time() - actual_rank_t0 + + ranked.sort(key=lambda x: (x[0], x[2])) + ranked.extend( + (float("inf"), pi, cost, None, None) + for cost, pi in heuristic_ranked[actual_rank_top_k:] + ) + return ranked + + @staticmethod + def _snapshot_single_qubit_circuits(optimized_partitions): + return { + i: p.circuit.copy() + for i, p in enumerate(optimized_partitions) + if isinstance(p, SingleQubitPartitionResult) + } + + @staticmethod + def _restore_single_qubit_circuits(optimized_partitions, saved_circuits): + for idx, orig in saved_circuits.items(): + optimized_partitions[idx].circuit = orig.copy() + + @staticmethod + def _partition_order_cnot_breakdown(partition_order): + routing_cnot = 0 + partition_cnot = 0 + for part in partition_order: + if isinstance(part, Circuit): + routing_cnot += part.get_Gate_Nums().get('CNOT', 0) + elif isinstance(part, SingleQubitPartitionResult): + continue + else: + partition_cnot += int(getattr(part, 'cnot_count', 0)) + return routing_cnot, partition_cnot + + def _partition_order_from_cpp_steps( + self, steps, optimized_partitions, 
candidate_cache, N, pi_initial=None + ): + partition_order = [] + pi = [int(x) for x in pi_initial] if pi_initial is not None else None + for step in steps: + kind = step[0] + if kind == "swap": + swaps = [(int(u), int(v)) for u, v in step[1]] + if swaps: + partition_order.append(construct_swap_circuit(swaps, N)) + if pi is not None: + pi = self._apply_swaps_to_pi(pi, swaps) + elif kind == "partition": + partition_idx = int(step[1]) + candidate_idx = int(step[2]) + candidate = candidate_cache[partition_idx][candidate_idx] + if ( + pi is not None + and self._candidate_is_layout_transparent(candidate) + ): + node_mapping = self._zero_cnot_dynamic_node_mapping( + pi, candidate + ) + partition_order.append( + _DynamicMappedPartitionCandidate( + candidate, node_mapping + ) + ) + pi = self._apply_zero_cnot_candidate_exit_to_pi( + pi, candidate, node_mapping + ) + else: + partition_order.append(candidate) + if pi is not None: + pi = self._apply_candidate_exit_to_pi(pi, candidate) + elif kind == "single": + partition_idx = int(step[1]) + physical_qubit = int(step[2]) + part = optimized_partitions[partition_idx] + circuit_qubit = int(part.circuit.get_Qbits()[0]) + part.circuit = part.circuit.Remap_Qbits( + {circuit_qubit: physical_qubit}, N + ) + partition_order.append(part) + return partition_order + + @staticmethod + def _csv_list(values): + return " ".join(str(int(v)) for v in values) + + @staticmethod + def _csv_edges(edges): + return " ".join(f"{int(u)}-{int(v)}" for u, v in edges) + + @staticmethod + def _candidate_physical_nodes(candidate): + nodes = set() + for u, v in candidate.topology: + nodes.add(int(u)) + nodes.add(int(v)) + if not nodes: + nodes.update(int(v) for v in candidate.node_mapping.values()) + return sorted(nodes) + + @staticmethod + def _candidate_has_multi_qubit_body(candidate): + return bool(getattr(candidate, "circuit_structure", ())) + + @staticmethod + def _candidate_is_layout_transparent(candidate): + return not 
qgd_Partition_Aware_Mapping._candidate_has_multi_qubit_body( + candidate + ) + + @staticmethod + def _apply_candidate_exit_to_pi(pi, candidate): + pi_out = [int(x) for x in pi] + qbit_map_inverse = {v: k for k, v in candidate.qbit_map.items()} + for q_star, mapped_qstar in enumerate(candidate.P_o): + if q_star in qbit_map_inverse: + logical_q = qbit_map_inverse[q_star] + pi_out[logical_q] = candidate.node_mapping[mapped_qstar] + return pi_out + + @staticmethod + def _zero_cnot_dynamic_node_mapping(pi, candidate): + P_i_inv = [candidate.P_i.index(i) for i in range(len(candidate.P_i))] + node_mapping = {} + for logical_q, q_star in candidate.qbit_map.items(): + node_mapping[P_i_inv[q_star]] = int(pi[int(logical_q)]) + return node_mapping + + @staticmethod + def _apply_zero_cnot_candidate_exit_to_pi(pi, candidate, node_mapping): + pi_out = [int(x) for x in pi] + qbit_map_inverse = {v: k for k, v in candidate.qbit_map.items()} + for q_star, mapped_qstar in enumerate(candidate.P_o): + if q_star in qbit_map_inverse: + logical_q = qbit_map_inverse[q_star] + pi_out[logical_q] = node_mapping[mapped_qstar] + return pi_out + + @staticmethod + def _immediate_multi_successors(partition_idx, DAG, layout_partitions): + successors = [] + seen = set() + queue = deque(DAG[partition_idx]) + while queue: + child = queue.popleft() + if child in seen: + continue + seen.add(child) + if layout_partitions[child]["is_single"]: + queue.extend(DAG[child]) + else: + successors.append(child) + return successors + + @staticmethod + def _support_overlap_summary(partition_idx, successors, layout_partitions): + support = set(layout_partitions[partition_idx]["involved_qbits"]) + summary = [] + max_overlap = 0 + min_turnover = None + for child in successors: + child_support = set(layout_partitions[child]["involved_qbits"]) + overlap = len(support & child_support) + turnover = min(len(support), len(child_support)) - overlap + max_overlap = max(max_overlap, overlap) + min_turnover = ( + turnover + if 
min_turnover is None + else min(min_turnover, turnover) + ) + summary.append(f"{child}:{overlap}/{turnover}") + return ( + max_overlap, + 0 if min_turnover is None else min_turnover, + " ".join(summary), + ) + + @staticmethod + def _eligible_multi_frontier(resolved, IDAG, layout_partitions): + frontier = [] + for idx, info in enumerate(layout_partitions): + if resolved[idx] or info["is_single"]: + continue + if all(resolved[parent] for parent in IDAG[idx]): + frontier.append(idx) + return frontier + + def _write_cpp_routing_trace( + self, + trace_path, + steps, + pi_initial, + candidate_cache, + layout_partitions, + DAG, + IDAG, + N, + ): + """Write a CSV trace for the final selected C++ route.""" + if not trace_path: + return + + trace_dir = os.path.dirname(os.path.abspath(trace_path)) + if trace_dir: + os.makedirs(trace_dir, exist_ok=True) + + pi = [int(x) for x in pi_initial] + resolved = [False] * len(layout_partitions) + pending_swaps = [] + cumulative_swaps = 0 + cumulative_body_cnot = 0 + rows = [] + + for route_step_idx, step in enumerate(steps): + kind = step[0] + if kind == "swap": + swaps = [(int(u), int(v)) for u, v in step[1]] + if swaps: + pending_swaps.extend(swaps) + pi = self._apply_swaps_to_pi(pi, swaps) + continue + + if kind == "single": + partition_idx = int(step[1]) + logical_qubits = tuple( + layout_partitions[partition_idx]["involved_qbits"] + ) + physical_qubit = int(step[2]) + resolved[partition_idx] = True + rows.append({ + "row": len(rows), + "route_step": route_step_idx, + "kind": "single", + "partition_idx": partition_idx, + "candidate_idx": "", + "topology_idx": "", + "permutation_idx": "", + "logical_qubits": self._csv_list(logical_qubits), + "physical_nodes": str(physical_qubit), + "topology_edges": "", + "entry_layout": self._csv_list( + pi[q] for q in logical_qubits + ), + "exit_layout": self._csv_list( + pi[q] for q in logical_qubits + ), + "swap_count": 0, + "routing_cnot": 0, + "body_cnot": 0, + "cumulative_swap_count": 
cumulative_swaps, + "cumulative_routing_cnot": 3 * cumulative_swaps, + "cumulative_body_cnot": cumulative_body_cnot, + "frontier_size": len( + self._eligible_multi_frontier( + resolved, IDAG, layout_partitions + ) + ), + "successor_count": 0, + "max_successor_overlap": 0, + "min_successor_turnover": 0, + "successor_overlap": "", + "swaps": "", + }) + continue + + if kind != "partition": + continue + + partition_idx = int(step[1]) + candidate_idx = int(step[2]) + candidate = candidate_cache[partition_idx][candidate_idx] + logical_qubits = tuple(int(q) for q in candidate.involved_qbits) + entry_layout = [int(pi[q]) for q in logical_qubits] + if self._candidate_is_layout_transparent(candidate): + dynamic_node_mapping = self._zero_cnot_dynamic_node_mapping( + pi, candidate + ) + exit_pi = self._apply_zero_cnot_candidate_exit_to_pi( + pi, candidate, dynamic_node_mapping + ) + physical_nodes = sorted(dynamic_node_mapping.values()) + topology_edges = "" + else: + exit_pi = self._apply_candidate_exit_to_pi(pi, candidate) + physical_nodes = self._candidate_physical_nodes(candidate) + topology_edges = self._csv_edges(candidate.topology) + exit_layout = [int(exit_pi[q]) for q in logical_qubits] + successors = self._immediate_multi_successors( + partition_idx, DAG, layout_partitions + ) + max_overlap, min_turnover, overlap_summary = ( + self._support_overlap_summary( + partition_idx, successors, layout_partitions + ) + ) + frontier_size = len( + self._eligible_multi_frontier( + resolved, IDAG, layout_partitions + ) + ) + swap_count = len(pending_swaps) + cumulative_swaps += swap_count + cumulative_body_cnot += int(candidate.cnot_count) + rows.append({ + "row": len(rows), + "route_step": route_step_idx, + "kind": "partition", + "partition_idx": partition_idx, + "candidate_idx": candidate_idx, + "topology_idx": int(candidate.topology_idx), + "permutation_idx": int(candidate.permutation_idx), + "logical_qubits": self._csv_list(logical_qubits), + "physical_nodes": 
self._csv_list(physical_nodes), + "topology_edges": topology_edges, + "entry_layout": self._csv_list(entry_layout), + "exit_layout": self._csv_list(exit_layout), + "swap_count": swap_count, + "routing_cnot": 3 * swap_count, + "body_cnot": int(candidate.cnot_count), + "cumulative_swap_count": cumulative_swaps, + "cumulative_routing_cnot": 3 * cumulative_swaps, + "cumulative_body_cnot": cumulative_body_cnot, + "frontier_size": frontier_size, + "successor_count": len(successors), + "max_successor_overlap": max_overlap, + "min_successor_turnover": min_turnover, + "successor_overlap": overlap_summary, + "swaps": self._csv_edges(pending_swaps), + }) + resolved[partition_idx] = True + pi = exit_pi + pending_swaps = [] + + fieldnames = [ + "row", + "route_step", + "kind", + "partition_idx", + "candidate_idx", + "topology_idx", + "permutation_idx", + "logical_qubits", + "physical_nodes", + "topology_edges", + "entry_layout", + "exit_layout", + "swap_count", + "routing_cnot", + "body_cnot", + "cumulative_swap_count", + "cumulative_routing_cnot", + "cumulative_body_cnot", + "frontier_size", + "successor_count", + "max_successor_overlap", + "min_successor_turnover", + "successor_overlap", + "swaps", + ] + with open(trace_path, "w", newline="") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(rows) + self._routing_trace_path = trace_path + + + def _rank_layout_trials_by_actual_routing( + self, + trial_results, + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache, + rank_top_k=None, + ): + """Reroute a bounded candidate set and rank it by actual CNOT count.""" + if trial_results and len(trial_results[0]) >= 5: + return sorted(trial_results, key=lambda x: (x[0], x[2])) + heuristic_ranked = sorted(trial_results, key=lambda x: x[0]) + if rank_top_k is None or rank_top_k <= 0: + rank_top_k = len(heuristic_ranked) + rank_top_k = min(int(rank_top_k), len(heuristic_ranked)) + actual_candidates = 
heuristic_ranked[:rank_top_k] + heuristic_tail = heuristic_ranked[rank_top_k:] + + saved_sq_circuits = self._snapshot_single_qubit_circuits( + optimized_partitions + ) + ranked_results = [] + old_progressbar = self.config.get("progressbar", 0) + self.config["progressbar"] = False + try: + for heuristic_cost, trial_pi in actual_candidates: + self._restore_single_qubit_circuits( + optimized_partitions, saved_sq_circuits + ) + F_trial = self.get_initial_layer( + IDAG, len(trial_pi), optimized_partitions + ) + partition_order, _, _ = self.Heuristic_Search( + F_trial, + np.asarray(trial_pi, dtype=np.int64).copy(), + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache=candidate_cache, + ) + trial_circuit, _ = self.Construct_circuit_from_HS( + partition_order, optimized_partitions, len(trial_pi) + ) + actual_cnot = trial_circuit.get_Gate_Nums().get("CNOT", 0) + ranked_results.append((actual_cnot, trial_pi, heuristic_cost, None, None)) + finally: + if old_progressbar is None: + self.config.pop("progressbar", None) + else: + self.config["progressbar"] = old_progressbar + self._restore_single_qubit_circuits( + optimized_partitions, saved_sq_circuits + ) + + ranked_results.sort(key=lambda x: (x[0], x[2])) + ranked_results.extend( + (float("inf"), pi, cost, None, None) for cost, pi in heuristic_tail + ) + return ranked_results + + def Partition_Aware_Mapping( + self, circ: Circuit, orig_parameters: np.ndarray + ): + N = circ.get_Qbit_Num() + + optimized_partitions = self.SynthesizeWideCircuit(circ, orig_parameters) + + for partition in optimized_partitions: + if isinstance(partition, PartitionSynthesisResult): + partition._topology = self.topology + partition._topology_cache = self._topology_cache + + DAG, IDAG = self.construct_DAG_and_IDAG(optimized_partitions) + + D = self.compute_distances_bfs(N) + scoring_partitions = self._build_scoring_partitions(optimized_partitions) + candidate_cache = self._build_partition_candidate_cache( + 
scoring_partitions + ) + layout_partitions = self._build_layout_partition_info( + optimized_partitions + ) + seeded_pi = self._compute_seeded_layout( + optimized_partitions, D, N, circ + ) + + n_iterations = self.config.get('sabre_iterations', 1) + n_trials = self.config.get('n_layout_trials', 1) + random_seed = self.config.get('random_seed', 42) + do_cleanup = self.config.get('cleanup', True) + + routing_start = time.time() + routing_swap_cnot = 0 + partition_body_cnot = 0 + routing_elapsed_before_cleanup = None + cleanup_total = 0.0 + final_route_steps = None + final_route_pi_initial = None + + if n_iterations == 0: + F = self.get_initial_layer(IDAG, N, optimized_partitions) + partition_order, pi, pi_initial = self.Heuristic_Search( + F, + pi=seeded_pi.copy(), + DAG=DAG, + IDAG=IDAG, + optimized_partitions=optimized_partitions, + scoring_partitions=scoring_partitions, + D=D, + candidate_cache=candidate_cache, + ) + final_circuit, final_parameters = self.Construct_circuit_from_HS( + partition_order, optimized_partitions, N + ) + routing_swap_cnot, partition_body_cnot = ( + self._partition_order_cnot_breakdown(partition_order) + ) + + else: + trial_results = self._run_layout_trials( + seeded_pi=seeded_pi, + DAG=DAG, + IDAG=IDAG, + layout_partitions=layout_partitions, + scoring_partitions=scoring_partitions, + D=D, + candidate_cache=candidate_cache, + n_iterations=n_iterations, + n_trials=max(1, n_trials), + random_seed=random_seed, + ) + actual_rank_default = min( + max(1, self.config.get("cleanup_top_k", 3) * 2), + max(1, n_trials), + ) + actual_rank_top_k = self.config.get( + "actual_routing_rank_top_k", actual_rank_default + ) + trial_results = self._rank_layout_trials_by_actual_routing( + trial_results, + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache, + rank_top_k=actual_rank_top_k, + ) + routing_elapsed_before_cleanup = time.time() - routing_start + + # Pick the best trial (already ranked by actual routing). 
+ _, best_pi, _, trace_pi_init, route_steps = trial_results[0] + + if route_steps is not None: + partition_order = self._partition_order_from_cpp_steps( + route_steps, + optimized_partitions, + candidate_cache, + N, + pi_initial=trace_pi_init, + ) + pi = np.asarray(best_pi, dtype=np.int64) + pi_initial = np.asarray(trace_pi_init, dtype=np.int64) + final_route_steps = route_steps + final_route_pi_initial = pi_initial.copy() + else: + F = self.get_initial_layer(IDAG, N, optimized_partitions) + partition_order, pi, pi_initial = self.Heuristic_Search( + F, + best_pi.copy(), + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache=candidate_cache, + ) + + trial_circuit, trial_params = self.Construct_circuit_from_HS( + partition_order, optimized_partitions, N + ) + routing_swap_cnot, partition_body_cnot = ( + self._partition_order_cnot_breakdown(partition_order) + ) + pre_cleanup_cnots = trial_circuit.get_Gate_Nums().get('CNOT', 0) + + if do_cleanup: + from squander.decomposition.qgd_Wide_Circuit_Optimization import ( + qgd_Wide_Circuit_Optimization, + ) + + cleanup_config = dict(self.config) + cleanup_config['topology'] = self.topology + cleanup_config['routed'] = True + cleanup_config['test_subcircuits'] = False + cleanup_config['test_final_circuit'] = False + cleanup_config['global_min'] = True + cleanup_config['use_osr'] = 1 + cleanup_config['use_graph_search'] = 1 + cleanup_config['max_partition_size'] = 4 + + wco = qgd_Wide_Circuit_Optimization(cleanup_config) + + cleanup_t0 = time.time() + final_circuit, final_parameters = wco.OptimizeWideCircuit( + trial_circuit.get_Flat_Circuit(), + trial_params, + ) + cleanup_total += time.time() - cleanup_t0 + else: + final_circuit, final_parameters = trial_circuit, trial_params + + if do_cleanup and n_iterations > 0: + self._routing_time = routing_elapsed_before_cleanup + self._cleanup_time = cleanup_total + self._cnot_pre_cleanup = pre_cleanup_cnots + else: + self._routing_time = time.time() - 
routing_start + self._cleanup_time = 0.0 + self._cnot_pre_cleanup = final_circuit.get_Gate_Nums().get( + 'CNOT', 0 + ) + + if self.config.get('cleanup', True): + from squander.decomposition.qgd_Wide_Circuit_Optimization import ( + qgd_Wide_Circuit_Optimization, + ) + + cleanup_config = dict(self.config) + cleanup_config['topology'] = self.topology + cleanup_config['routed'] = True + cleanup_config['test_subcircuits'] = False + cleanup_config['test_final_circuit'] = False + cleanup_config['global_min'] = True + wco = qgd_Wide_Circuit_Optimization(cleanup_config) + + final_circuit, final_parameters = wco.OptimizeWideCircuit( + final_circuit.get_Flat_Circuit(), final_parameters + ) + + self._routing_swap_cnot = routing_swap_cnot + self._partition_body_cnot = partition_body_cnot + + routing_trace_path = self.config.get("routing_trace_path", None) + if routing_trace_path: + if final_route_steps is not None and final_route_pi_initial is not None: + self._write_cpp_routing_trace( + routing_trace_path, + final_route_steps, + final_route_pi_initial, + candidate_cache, + layout_partitions, + DAG, + IDAG, + N, + ) + else: + logging.warning( + "routing_trace_path was set, but no C++ route steps were " + "available for the selected route." 
+ ) + + return final_circuit, final_parameters, pi_initial, pi + + # ------------------------------------------------------------------------ + # Heuristic Search + # ------------------------------------------------------------------------ + + def _select_best_candidate(self, partition_candidates, scores, rng=None): + """Select the lowest-scoring candidate deterministically.""" + del rng + scores_array = np.array(scores) + return partition_candidates[np.argmin(scores_array)] + + def _prefilter_candidates( + self, + partition_candidates, + pi, + D, + top_k, + F=None, + E=None, + candidate_cache=None, + layout_partitions=None, + reverse=False, + W=0.5, + alpha=1.0, + canonical_data=None, + ): + """Pre-filter candidates using cheap swap-count estimate before full A* scoring.""" + if top_k <= 0: + return [] + if len(partition_candidates) <= top_k: + return partition_candidates + cnot_cost = self.config.get('cnot_cost', 1.0 / 3.0) + estimates = np.array([ + ( + self._routing_objective( + pc.estimate_swap_count(pi, D, reverse=reverse), + pc.cnot_count, + cnot_cost, + ) + + self._future_context_cost( + pc.partition_idx, + self._estimate_candidate_output_layout( + pc, pi, reverse=reverse + ), + F or (), + E or (), + D, + candidate_cache, + reverse=reverse, + cnot_cost=cnot_cost, + W=W, + alpha=alpha, + layout_partitions=layout_partitions, + canonical_data=canonical_data, + ) + ) + for pc in partition_candidates + ]) + selected = set() + min_per_partition = int( + self.config.get('prefilter_min_per_partition', 0) or 0 + ) + min_3q = int(self.config.get('prefilter_min_3q', 0) or 0) + if min_per_partition > 0 or min_3q > 0: + by_partition = defaultdict(list) + for idx, pc in enumerate(partition_candidates): + by_partition[pc.partition_idx].append(idx) + for indices in by_partition.values(): + sample = partition_candidates[indices[0]] + quota = min_per_partition + if len(sample.involved_qbits) >= 3: + quota = max(quota, min_3q) + if quota <= 0: + continue + ranked = 
sorted(indices, key=lambda i: estimates[i]) + selected.update(ranked[:min(quota, len(ranked))]) + + remaining = max(0, top_k - len(selected)) + if remaining > 0: + ranked_global = np.argsort(estimates) + for idx in ranked_global: + selected.add(int(idx)) + if len(selected) >= top_k: + break + + if not selected: + top_k_indices = np.argpartition(estimates, top_k)[:top_k] + selected.update(int(i) for i in top_k_indices) + + return [ + partition_candidates[i] + for i in sorted(selected, key=lambda idx: estimates[idx]) + ] + + @staticmethod + def _decay_factor_for_swaps(swaps, decay): + if not swaps: + return 1.0 + return max(max(decay[u], decay[v]) for u, v in swaps) + + @staticmethod + def _routing_objective( + route_cost, + cnot_count, + cnot_cost, + cnot_weight=1.0, + decay_factor=1.0, + ): + return decay_factor * ( + float(route_cost) + + cnot_weight * cnot_cost * float(cnot_count) + ) + + def _apply_decay_for_swaps(self, swaps, decay): + delta = self.config.get("decay_delta", 0.001) + if delta <= 0: + return + for u, v in swaps: + decay[u] += delta + decay[v] += delta + + @staticmethod + def _reset_decay(decay): + for idx in range(len(decay)): + decay[idx] = 1.0 + + @staticmethod + def _apply_swaps_to_pi(pi, swaps): + pi_new = [int(x) for x in pi] + n = len(pi_new) + p2v = [0] * n + for q in range(n): + p2v[pi_new[q]] = q + for P1, P2 in swaps: + q1, q2 = p2v[P1], p2v[P2] + p2v[P1], p2v[P2] = q2, q1 + pi_new[q1], pi_new[q2] = P2, P1 + return pi_new + + def _sample_initial_layout(self, trial_idx, n_trials, seeded_pi, rng): + seeded_pi = np.asarray(seeded_pi, dtype=np.int64) + if n_trials <= 1 or rng is None or trial_idx == 0: + return seeded_pi.copy() + + return rng.permutation(len(seeded_pi)) + + def _bfs_shortest_path(self, src, dst): + if src == dst: + return [src] + parent = {src: None} + q = deque([src]) + while q: + node = q.popleft() + for nb in self._adj[node]: + if nb in parent: + continue + parent[nb] = node + if nb == dst: + path = [dst] + while 
parent[path[-1]] is not None: + path.append(parent[path[-1]]) + path.reverse() + return path + q.append(nb) + return [] + + @staticmethod + def _entry_future_cost(entry, output_perm_arr, D_arr): + eu = entry.get("edges_u") + if eu is None: + return 0.0 + phys_u = output_perm_arr[eu] + phys_v = output_perm_arr[entry["edges_v"]] + return float(np.maximum(0, D_arr[phys_u, phys_v] - 1).sum()) + + @staticmethod + def _estimate_candidate_output_layout(partition_candidate, pi, reverse=False): + P_exit = partition_candidate.P_i if reverse else partition_candidate.P_o + pi_output = [int(x) for x in pi] + qbit_map_inverse = { + v: k for k, v in partition_candidate.qbit_map.items() + } + for q_star in range(len(P_exit)): + if q_star in qbit_map_inverse: + k = qbit_map_inverse[q_star] + pi_output[k] = partition_candidate.node_mapping[P_exit[q_star]] + return pi_output + + @staticmethod + def _future_context_cost( + exclude_partition_idx, + pi, + F, + E, + D, + candidate_cache, + reverse=False, + cnot_cost=1.0 / 3.0, + W=0.5, + alpha=1.0, + layout_partitions=None, + canonical_data=None, + ): + del cnot_cost, layout_partitions + + # Candidate-aware lower bound: for each future partition, use the best + # available candidate entry cost under this layout. This preserves the + # monotone distance signal while allowing 3q line blocks to distinguish + # which logical qubit should sit on the path center. 
+ pi_arr = np.asarray(pi, dtype=np.intp) + D_arr = np.asarray(D) + + def partition_cost(p_idx): + if candidate_cache is not None and 0 <= p_idx < len(candidate_cache): + candidates = candidate_cache[p_idx] + if candidates and len(candidates[0].involved_qbits) >= 3: + return min( + cand.estimate_swap_count(pi, D, reverse=reverse) + for cand in candidates + ) + if canonical_data is None: + return None + entry = canonical_data.get(p_idx) + if entry is None: + return None + return qgd_Partition_Aware_Mapping._entry_future_cost( + entry, pi_arr, D_arr + ) + + f_sum = 0.0 + n_other = 0 + for p_idx in F: + if p_idx == exclude_partition_idx: + continue + cost = partition_cost(p_idx) + if cost is None: + continue + f_sum += cost + n_other += 1 + score = f_sum / n_other if n_other > 0 else 0.0 + + if E: + e_sum = 0.0 + e_count = 0 + for p_idx, depth in E: + if p_idx == exclude_partition_idx: + continue + cost = partition_cost(p_idx) + if cost is None: + continue + e_sum += (alpha ** depth) * cost + e_count += 1 + if e_count: + score += W * e_sum / e_count + return score + + def _release_valve(self, F, pi, D, canonical_data): + pi_arr = np.asarray(pi, dtype=np.intp) + D_arr = np.asarray(D) + best = None + for p_idx in F: + entry = canonical_data.get(p_idx) + if entry is None: + continue + eu = entry.get("edges_u") + if eu is None: + continue + ev = entry["edges_v"] + phys_u = pi_arr[eu] + phys_v = pi_arr[ev] + dists = D_arr[phys_u, phys_v] + if dists.size == 0: + continue + worst_idx = int(np.argmax(dists)) + worst_d = float(dists[worst_idx]) + if worst_d <= 1: + continue + if best is None or worst_d > best[0] or ( + worst_d == best[0] and p_idx < best[1] + ): + best = (worst_d, p_idx, int(eu[worst_idx]), int(ev[worst_idx])) + + if best is None: + return [], list(pi) + + _, _, u, v = best + path = self._bfs_shortest_path(int(pi[u]), int(pi[v])) + if len(path) < 2: + return [], list(pi) + + k = len(path) - 1 + m = k // 2 + swaps = [] + for i in range(m): + 
swaps.append((path[i], path[i + 1])) + for i in range(k, m + 1, -1): + swaps.append((path[i], path[i - 1])) + + return swaps, self._apply_swaps_to_pi(pi, swaps) + + @staticmethod + def _build_neighbor_info( + partition_idx, + F, + E, + pi, + canonical_data, + weight=0.2, + W=0.5, + alpha=0.9, + layout_partitions=None, + ): + if weight <= 0 or layout_partitions is None: + return None + + edge_weights = {} + qubits = set() + + def add_edges(target_idx, edge_weight): + if target_idx == partition_idx or edge_weight <= 0: + return + if target_idx >= len(layout_partitions): + return + entry = canonical_data.get(target_idx) if canonical_data else None + if entry is not None and entry.get("edges_u") is not None: + for u, v in zip(entry["edges_u"], entry["edges_v"]): + u = int(u) + v = int(v) + qubits.add(u) + qubits.add(v) + key = (u, v) if u <= v else (v, u) + edge_weights[key] = ( + edge_weights.get(key, 0.0) + edge_weight + ) + return + + involved = qgd_Partition_Aware_Mapping._partition_involved_qbits( + layout_partitions[target_idx] + ) + for i, u in enumerate(involved): + for v in involved[i + 1:]: + u = int(u) + v = int(v) + qubits.add(u) + qubits.add(v) + key = (u, v) if u <= v else (v, u) + edge_weights[key] = ( + edge_weights.get(key, 0.0) + edge_weight + ) + + for future_idx in F: + add_edges(future_idx, 1.0) + if E: + for future_idx, depth in E: + add_edges(future_idx, W * (alpha ** depth)) + + if not edge_weights: + return None + + neighbor_vqs = sorted(qubits) + q_to_idx = {q: idx for idx, q in enumerate(neighbor_vqs)} + edges = [ + (q_to_idx[u], q_to_idx[v], edge_weight) + for (u, v), edge_weight in edge_weights.items() + ] + return { + "neighbor_vqs": neighbor_vqs, + "initial_pos": tuple(int(pi[q]) for q in neighbor_vqs), + "edges": edges, + "weight": weight, + } + + def _advance_layout_frontier( + self, + selected_partition_idx, + F, + resolved_partitions, + DAG, + IDAG, + optimized_partitions, + ): + """Advance a copied frontier without mutating circuits. 
+ + This mirrors the layout-only single-qubit elision logic and is used by + the boundary beam rollout. It intentionally tracks only dependency + state and layout; final circuit construction still happens through the + concrete chosen route. + """ + F_next = list(F) + resolved_next = list(resolved_partitions) + + if selected_partition_idx in F_next: + F_next.remove(selected_partition_idx) + resolved_next[selected_partition_idx] = True + + stack = deque(DAG[selected_partition_idx]) + while stack: + child = stack.popleft() + if resolved_next[child] or child in F_next: + continue + if not all(resolved_next[parent] for parent in IDAG[child]): + continue + if self._partition_is_single(optimized_partitions[child]): + resolved_next[child] = True + stack.extend(DAG[child]) + else: + F_next.append(child) + + return tuple(F_next), tuple(resolved_next) + + def _boundary_beam_select_index( + self, + partition_candidates, + scores, + cached_swaps, + cached_pi, + F_snapshot, + resolved_partitions, + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache, + canonical_data, + reverse=False, + W=0.5, + alpha=1.0, + cnot_cost=1.0 / 3.0, + adj=None, + ): + """Choose the next candidate by rolling out boundary-layout states. + + The ordinary SABRE selector commits to the locally best candidate. This + keeps a small beam of possible boundary layouts across several future + partitions, then returns the first candidate from the best rollout. 
+ """ + beam_width = int(self.config.get("boundary_beam_width", 1) or 1) + beam_depth = int(self.config.get("boundary_beam_depth", 1) or 1) + fallback_idx = int(np.argmin(np.asarray(scores))) + if beam_width <= 1 or beam_depth <= 1 or len(partition_candidates) <= 1: + return fallback_idx + if not any(len(cand.involved_qbits) >= 3 for cand in partition_candidates): + return fallback_idx + + max_E_size = self.config.get("max_E_size", 20) + max_lookahead = self.config.get("max_lookahead", 4) + top_k = self.config.get("prefilter_top_k", 50) + path_weight = self.config.get("path_tiebreak_weight", 0.2) + three_q_weight = self.config.get("three_qubit_exit_weight", 1.0) + + def transition_cost(cand, swaps): + return self._routing_objective( + len(swaps or ()), + cand.cnot_count, + cnot_cost, + ) + + states = [] + for idx, cand in enumerate(partition_candidates): + if cached_pi[idx] is None: + continue + trans_cost = transition_cost(cand, cached_swaps[idx]) + F_next, resolved_next = self._advance_layout_frontier( + cand.partition_idx, + F_snapshot, + resolved_partitions, + DAG, + IDAG, + optimized_partitions, + ) + states.append( + ( + float(scores[idx]), + float(trans_cost), + tuple(int(x) for x in cached_pi[idx]), + F_next, + resolved_next, + idx, + ) + ) + + if not states: + return fallback_idx + + states.sort(key=lambda item: (item[0], item[5])) + states = states[:beam_width] + + for _ in range(1, beam_depth): + expanded = [] + for _, total_cost, pi_state, F_state, resolved_state, first_idx in states: + if not F_state: + expanded.append( + (total_cost, total_cost, pi_state, F_state, resolved_state, first_idx) + ) + continue + + resolved_list = list(resolved_state) + F_list = list(F_state) + E = self.generate_extended_set( + F_list, + DAG, + IDAG, + resolved_list, + optimized_partitions, + max_E_size=max_E_size, + max_lookahead=max_lookahead, + ) + candidates = self.obtain_partition_candidates( + F_list, + optimized_partitions, + candidate_cache=candidate_cache, + ) + if 
not candidates: + expanded.append( + (total_cost, total_cost, pi_state, F_state, resolved_state, first_idx) + ) + continue + candidates = self._prefilter_candidates( + candidates, + list(pi_state), + D, + top_k, + F=F_state, + E=E, + candidate_cache=candidate_cache, + layout_partitions=optimized_partitions, + reverse=reverse, + W=W, + alpha=alpha, + canonical_data=canonical_data, + ) + + for cand in candidates: + neighbor_info = self._build_neighbor_info( + cand.partition_idx, + F_state, + E, + pi_state, + canonical_data, + weight=path_weight, + W=W, + alpha=alpha, + layout_partitions=optimized_partitions, + ) + score, swaps, output_perm = self.score_partition_candidate( + cand, + F_state, + list(pi_state), + scoring_partitions, + D, + self._swap_cache, + E=E, + W=W, + alpha=alpha, + reverse=reverse, + canonical_data=canonical_data, + adj=adj, + cnot_cost=cnot_cost, + path_tiebreak_weight=path_weight, + cached_neighbor_info=neighbor_info, + candidate_cache=candidate_cache, + layout_partitions=optimized_partitions, + return_transforms=True, + three_qubit_exit_weight=three_q_weight, + ) + trans_cost = transition_cost(cand, swaps) + future_cost = float(score) - trans_cost + new_total = total_cost + trans_cost + rank_cost = new_total + future_cost + F_next, resolved_next = self._advance_layout_frontier( + cand.partition_idx, + F_state, + resolved_state, + DAG, + IDAG, + optimized_partitions, + ) + expanded.append( + ( + rank_cost, + new_total, + tuple(int(x) for x in output_perm), + F_next, + resolved_next, + first_idx, + ) + ) + + if not expanded: + break + expanded.sort(key=lambda item: (item[0], item[5])) + states = expanded[:beam_width] + + if not states: + return fallback_idx + return int(min(states, key=lambda item: (item[0], item[5]))[5]) + + def Heuristic_Search( + self, + F, + pi, + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache=None, + ): + pi_initial = pi.copy() + F = list(F) + + resolved_partitions = [False] * len(DAG) + 
partition_order = [] + resolved_count = 0 + + queue = deque( + p + for p in F + if isinstance(optimized_partitions[p], SingleQubitPartitionResult) + ) + while queue: + partition_idx = queue.pop() + if resolved_partitions[partition_idx]: + continue + if partition_idx in F: + F.remove(partition_idx) + + single_qubit_part = optimized_partitions[partition_idx] + original_qubit = int(single_qubit_part.involved_qbits[0]) + circuit_qubit = int(single_qubit_part.circuit.get_Qbits()[0]) + single_qubit_part.circuit = single_qubit_part.circuit.Remap_Qbits( + {circuit_qubit: int(pi[original_qubit])}, + max(D.shape), + ) + partition_order.append(single_qubit_part) + resolved_partitions[partition_idx] = True + resolved_count += 1 + + for child in DAG[partition_idx]: + if not resolved_partitions[child] and child not in F: + if all(resolved_partitions[p] for p in IDAG[child]): + if isinstance( + optimized_partitions[child], + SingleQubitPartitionResult, + ): + queue.append(child) + else: + F.append(child) + + total_partitions = len(DAG) + pbar = tqdm( + total=total_partitions, + desc="Heuristic Search", + bar_format=( + "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} resolved" + ), + disable=self.config.get("progressbar", 0) is False, + mininterval=0.2, + ) + if resolved_count: + pbar.update(resolved_count) + + max_E_size = self.config.get("max_E_size", 20) + max_lookahead = self.config.get("max_lookahead", 4) + E_W = self.config.get("E_weight", 0.5) + E_alpha = self.config.get("E_alpha", 1.0) + swap_burst_budget = self.config.get("swap_burst_budget", 5) + + canonical_data = self._build_canonical_neighbor_data( + scoring_partitions, reverse=False + ) + decay = [1.0] * len(pi) + swap_heavy_partitions = 0 + + while F: + if ( + swap_burst_budget > 0 + and swap_heavy_partitions >= swap_burst_budget + ): + valve_swaps, pi_bridged = self._release_valve( + F, pi, D, canonical_data + ) + if valve_swaps: + partition_order.append( + construct_swap_circuit(valve_swaps, len(pi)) + ) + 
self._apply_decay_for_swaps(valve_swaps, decay) + pi = np.asarray(pi_bridged) + swap_heavy_partitions = 0 + continue + self._reset_decay(decay) + swap_heavy_partitions = 0 + + F_snapshot = tuple(F) + E = self.generate_extended_set( + F, + DAG, + IDAG, + resolved_partitions, + optimized_partitions, + max_E_size=max_E_size, + max_lookahead=max_lookahead, + ) + + partition_candidates = self.obtain_partition_candidates( + F, + optimized_partitions, + candidate_cache=candidate_cache, + ) + if not partition_candidates: + break + + top_k = self.config.get("prefilter_top_k", 50) + partition_candidates = self._prefilter_candidates( + partition_candidates, + pi, + D, + top_k, + F=F_snapshot, + E=E, + candidate_cache=candidate_cache, + layout_partitions=optimized_partitions, + W=E_W, + alpha=E_alpha, + canonical_data=canonical_data, + ) + + # Group candidates by partition_idx to reuse _build_neighbor_info + candidate_order = sorted( + range(len(partition_candidates)), + key=lambda i: partition_candidates[i].partition_idx + ) + scores = [0.0] * len(partition_candidates) + cached_swaps = [None] * len(partition_candidates) + cached_pi = [None] * len(partition_candidates) + prev_partition_idx = None + cached_neighbor_info = None + for ci in candidate_order: + cand = partition_candidates[ci] + if cand.partition_idx != prev_partition_idx: + cached_neighbor_info = self._build_neighbor_info( + cand.partition_idx, + F_snapshot, + E, + pi, + canonical_data, + weight=self.config.get("path_tiebreak_weight", 0.2), + W=E_W, + alpha=E_alpha, + layout_partitions=optimized_partitions, + ) + prev_partition_idx = cand.partition_idx + score, swaps, output_perm = self.score_partition_candidate( + cand, + F_snapshot, + pi, + scoring_partitions, + D, + self._swap_cache, + E=E, + W=E_W, + alpha=E_alpha, + canonical_data=canonical_data, + adj=self._adj, + cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0), + path_tiebreak_weight=self.config.get( + "path_tiebreak_weight", 0.2 + ), + decay=decay, + 
cached_neighbor_info=cached_neighbor_info, + candidate_cache=candidate_cache, + layout_partitions=optimized_partitions, + return_transforms=True, + three_qubit_exit_weight=self.config.get( + "three_qubit_exit_weight", 1.0 + ), + ) + scores[ci] = score + cached_swaps[ci] = swaps + cached_pi[ci] = output_perm + + best_idx = self._boundary_beam_select_index( + partition_candidates, + scores, + cached_swaps, + cached_pi, + F_snapshot, + resolved_partitions, + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache, + canonical_data, + W=E_W, + alpha=E_alpha, + cnot_cost=self.config.get("cnot_cost", 1.0 / 3.0), + adj=self._adj, + ) + min_partition_candidate = partition_candidates[best_idx] + + F.remove(min_partition_candidate.partition_idx) + resolved_partitions[min_partition_candidate.partition_idx] = True + resolved_count += 1 + pbar.update(1) + + swap_order, pi = cached_swaps[best_idx], cached_pi[best_idx] + if swap_order: + partition_order.append(construct_swap_circuit(swap_order, len(pi))) + self._apply_decay_for_swaps(swap_order, decay) + swap_heavy_partitions += 1 + else: + swap_heavy_partitions = 0 + self._reset_decay(decay) + + partition_order.append(min_partition_candidate) + + children = deque(DAG[min_partition_candidate.partition_idx]) + while children: + child = children.popleft() + parents_resolved = all( + resolved_partitions[parent] for parent in IDAG[child] + ) + if (not resolved_partitions[child] and child not in F) and ( + parents_resolved + ): + if isinstance( + optimized_partitions[child], SingleQubitPartitionResult + ): + child_partition = optimized_partitions[child] + original_qubit = int(child_partition.involved_qbits[0]) + circuit_qubit = int(child_partition.circuit.get_Qbits()[0]) + child_partition.circuit = child_partition.circuit.Remap_Qbits( + {circuit_qubit: int(pi[original_qubit])}, + max(D.shape), + ) + partition_order.append(child_partition) + resolved_partitions[child] = True + resolved_count += 1 + 
pbar.update(1) + children.extend(DAG[child]) + else: + F.append(child) + + pbar.close() + return partition_order, pi, pi_initial + + def _heuristic_search_layout_only( + self, + F, + pi, + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + rng=None, + reverse=False, + candidate_cache=None, + ): + """Run heuristic search but only track layout (pi). No circuit modification. + + Args: + reverse: When True, swap P_i/P_o roles in scoring and layout + updates (used for backward passes in SABRE iterations). + + Returns: + (pi, total_cost): final layout and layout-only heuristic score. + Trial ranking reroutes returned layouts and sorts by actual + constructed-circuit CNOT count; this score is only a tie-breaker. + """ + F = list(F) + resolved_partitions = [False] * len(DAG) + total_cost = 0.0 + + queue = deque( + p for p in F if self._partition_is_single(optimized_partitions[p]) + ) + while queue: + partition_idx = queue.pop() + if resolved_partitions[partition_idx]: + continue + if partition_idx in F: + F.remove(partition_idx) + resolved_partitions[partition_idx] = True + + for child in DAG[partition_idx]: + if not resolved_partitions[child] and child not in F: + if all(resolved_partitions[p] for p in IDAG[child]): + if self._partition_is_single(optimized_partitions[child]): + queue.append(child) + else: + F.append(child) + + max_E_size = self.config.get("max_E_size", 20) + max_lookahead = self.config.get("max_lookahead", 4) + E_W = self.config.get("E_weight", 0.5) + E_alpha = self.config.get("E_alpha", 1.0) + cnot_cost = self.config.get("cnot_cost", 1.0 / 3.0) + swap_burst_budget = self.config.get("swap_burst_budget", 5) + + canonical_data = self._build_canonical_neighbor_data( + scoring_partitions, reverse=reverse + ) + decay = [1.0] * len(pi) + swap_heavy_partitions = 0 + + while F: + if ( + swap_burst_budget > 0 + and swap_heavy_partitions >= swap_burst_budget + ): + valve_swaps, pi = self._release_valve(F, pi, D, canonical_data) + if valve_swaps: + 
total_cost += self._routing_objective( + len(valve_swaps), + 0, + cnot_cost, + decay_factor=self._decay_factor_for_swaps( + valve_swaps, decay + ), + ) + self._apply_decay_for_swaps(valve_swaps, decay) + swap_heavy_partitions = 0 + continue + self._reset_decay(decay) + swap_heavy_partitions = 0 + + F_snapshot = tuple(F) + E = self.generate_extended_set( + F, + DAG, + IDAG, + resolved_partitions, + optimized_partitions, + max_E_size=max_E_size, + max_lookahead=max_lookahead, + ) + + partition_candidates = self.obtain_partition_candidates( + F, + optimized_partitions, + candidate_cache=candidate_cache, + ) + if not partition_candidates: + break + + top_k = self.config.get("prefilter_top_k", 50) + partition_candidates = self._prefilter_candidates( + partition_candidates, + pi, + D, + top_k, + F=F_snapshot, + E=E, + candidate_cache=candidate_cache, + layout_partitions=optimized_partitions, + reverse=reverse, + W=E_W, + alpha=E_alpha, + canonical_data=canonical_data, + ) + + # Group candidates by partition_idx to reuse _build_neighbor_info + candidate_order = sorted( + range(len(partition_candidates)), + key=lambda i: partition_candidates[i].partition_idx + ) + scores = [0.0] * len(partition_candidates) + cached_swaps = [None] * len(partition_candidates) + cached_pi = [None] * len(partition_candidates) + prev_partition_idx = None + cached_neighbor_info = None + for ci in candidate_order: + cand = partition_candidates[ci] + if cand.partition_idx != prev_partition_idx: + cached_neighbor_info = self._build_neighbor_info( + cand.partition_idx, + F_snapshot, + E, + pi, + canonical_data, + weight=self.config.get("path_tiebreak_weight", 0.2), + W=E_W, + alpha=E_alpha, + layout_partitions=optimized_partitions, + ) + prev_partition_idx = cand.partition_idx + score, swaps, output_perm = self.score_partition_candidate( + cand, + F_snapshot, + pi, + scoring_partitions, + D, + self._swap_cache, + E=E, + W=E_W, + alpha=E_alpha, + reverse=reverse, + canonical_data=canonical_data, + 
adj=self._adj, + cnot_cost=cnot_cost, + path_tiebreak_weight=self.config.get( + "path_tiebreak_weight", 0.2 + ), + decay=decay, + cached_neighbor_info=cached_neighbor_info, + candidate_cache=candidate_cache, + layout_partitions=optimized_partitions, + return_transforms=True, + three_qubit_exit_weight=self.config.get( + "three_qubit_exit_weight", 1.0 + ), + ) + scores[ci] = score + cached_swaps[ci] = swaps + cached_pi[ci] = output_perm + + best_idx = self._boundary_beam_select_index( + partition_candidates, + scores, + cached_swaps, + cached_pi, + F_snapshot, + resolved_partitions, + DAG, + IDAG, + optimized_partitions, + scoring_partitions, + D, + candidate_cache, + canonical_data, + reverse=reverse, + W=E_W, + alpha=E_alpha, + cnot_cost=cnot_cost, + adj=self._adj, + ) + best = partition_candidates[best_idx] + F.remove(best.partition_idx) + resolved_partitions[best.partition_idx] = True + + swaps, pi = cached_swaps[best_idx], cached_pi[best_idx] + decay_factor = 1.0 + if swaps: + decay_factor = self._decay_factor_for_swaps(swaps, decay) + total_cost += self._routing_objective( + len(swaps), + best.cnot_count, + cnot_cost, + decay_factor=decay_factor, + ) + if swaps: + self._apply_decay_for_swaps(swaps, decay) + swap_heavy_partitions += 1 + else: + swap_heavy_partitions = 0 + self._reset_decay(decay) + + for child in DAG[best.partition_idx]: + if not resolved_partitions[child] and child not in F: + if all(resolved_partitions[p] for p in IDAG[child]): + if self._partition_is_single(optimized_partitions[child]): + resolved_partitions[child] = True + stack = deque(DAG[child]) + while stack: + gc = stack.pop() + if not resolved_partitions[gc] and gc not in F: + if all( + resolved_partitions[p] + for p in IDAG[gc] + ): + if self._partition_is_single( + optimized_partitions[gc] + ): + resolved_partitions[gc] = True + stack.extend(DAG[gc]) + else: + F.append(gc) + else: + F.append(child) + + return pi, total_cost + # 
def Construct_circuit_from_HS(self, partition_order, optimized_partitions, N):
    """Assemble the routed circuit and its flat parameter vector.

    Args:
        partition_order: Ordered items to emit. Three kinds are handled:
            ``Circuit`` instances, which are added as-is and contribute no
            parameters; ``SingleQubitPartitionResult`` instances, which
            contribute ``circuit`` and ``parameters``; and any other object,
            which must provide ``get_final_circuit(optimized_partitions, N)``
            returning ``(circuit, parameters)``.
        optimized_partitions: Forwarded to ``get_final_circuit``.
        N: Qubit count of the assembled circuit.

    Returns:
        ``(final_circuit, final_parameters)``. ``final_parameters`` is the
        1-D concatenation of all contributed parameter arrays, in emission
        order, or an empty array when nothing contributed parameters.

    A topology violation is only logged as an error; the circuit is still
    returned.
    """
    final_circuit = Circuit(N)
    parameter_chunks = []

    for part in partition_order:
        if isinstance(part, Circuit):
            # Bare circuits carry gates only, no parameters.
            final_circuit.add_Circuit(part)
        elif isinstance(part, SingleQubitPartitionResult):
            final_circuit.add_Circuit(part.circuit)
            parameter_chunks.append(part.parameters)
        else:
            part_circ, part_parameters = part.get_final_circuit(
                optimized_partitions, N
            )
            final_circuit.add_Circuit(part_circ)
            parameter_chunks.append(part_parameters)

    if parameter_chunks:
        # atleast_1d + ravel lets scalar and multi-dimensional parameter
        # blocks be concatenated into one flat vector.
        final_parameters = np.concatenate(
            [np.atleast_1d(p).ravel() for p in parameter_chunks], axis=0
        )
    else:
        final_parameters = np.array([])

    if not check_circuit_compatibility(final_circuit, self.topology):
        logging.error("Final circuit is not compatible with device topology")
    return final_circuit, final_parameters
@staticmethod
def score_partition_candidate(partition_candidate, F, pi, scoring_partitions, D, swap_cache,
                              E=None, W=0.5, alpha=0.9, reverse=False,
                              canonical_data=None, adj=None,
                              cnot_cost=1.0 / 3.0,
                              path_tiebreak_weight=0.2, decay=None,
                              cached_neighbor_info=None,
                              candidate_cache=None,
                              layout_partitions=None,
                              return_transforms=False,
                              three_qubit_exit_weight=1.0):
    """Score one partition candidate, LightSABRE style (arXiv:2409.08368, eq. 1).

    The score is the routing objective of this candidate (number of SWAPs
    needed to enter it, its CNOT count weighted by ``cnot_cost``, and an
    optional decay factor) plus, when ``candidate_cache`` is supplied, a
    future-context cost computed from the remaining front layer ``F`` and
    the extended set ``E``. For candidates touching three or more qubits
    the future term is multiplied by ``three_qubit_exit_weight``.

    ``cached_neighbor_info`` short-circuits the neighbor-info construction.
    ``scoring_partitions`` is accepted for interface compatibility and is
    not read here.

    Returns:
        ``score`` alone, or ``(score, swaps, output_perm)`` when
        ``return_transforms`` is true.
    """
    mapper = qgd_Partition_Aware_Mapping

    # Neighbor context for the routing tie-breaker: reuse if handed in.
    neighbor_info = cached_neighbor_info
    if neighbor_info is None:
        neighbor_info = mapper._build_neighbor_info(
            partition_candidate.partition_idx,
            F,
            E,
            pi,
            canonical_data,
            weight=path_tiebreak_weight,
            W=W,
            alpha=alpha,
            layout_partitions=layout_partitions,
        )

    swaps, output_perm = partition_candidate.transform_pi(
        pi,
        D,
        swap_cache,
        reverse=reverse,
        adj=adj,
        neighbor_info=neighbor_info,
    )

    # Decay only matters when a decay table exists and SWAPs were inserted.
    if decay is not None and swaps:
        decay_factor = mapper._decay_factor_for_swaps(swaps, decay)
    else:
        decay_factor = 1.0

    score = mapper._routing_objective(
        len(swaps),
        partition_candidate.cnot_count,
        cnot_cost,
        decay_factor=decay_factor,
    )

    # Look-ahead term is only available with a candidate cache.
    if candidate_cache is not None:
        future_score = mapper._future_context_cost(
            partition_candidate.partition_idx,
            output_perm,
            F,
            E,
            D,
            candidate_cache,
            reverse=reverse,
            cnot_cost=cnot_cost,
            W=W,
            alpha=alpha,
            layout_partitions=layout_partitions,
            canonical_data=canonical_data,
        )
        if len(partition_candidate.involved_qbits) >= 3:
            future_score *= three_qubit_exit_weight
        score += future_score

    return (score, swaps, output_perm) if return_transforms else score
def obtain_partition_candidates(
    self,
    F,
    optimized_partitions=None,
    candidate_cache=None,
):
    """Collect every placement candidate for the partitions in the front layer.

    When ``candidate_cache`` is given, the pre-built candidate lists of the
    front-layer partitions are concatenated in the order of ``F`` (empty or
    ``None`` entries are skipped). Otherwise one ``PartitionCandidate`` is
    built for every (mini-topology, device placement, permutation pair)
    combination of every partition in ``F``.
    """
    if candidate_cache is not None:
        return [
            candidate
            for front_idx in F
            for candidate in (candidate_cache[front_idx] or ())
        ]

    collected = []
    for front_idx in F:
        partition = optimized_partitions[front_idx]
        for tdx, mini_topology in enumerate(partition.mini_topologies):
            # Prefer the partition's own (lazily cached) placement lookup.
            placements = (
                partition.get_topology_candidates(tdx)
                if hasattr(partition, 'get_topology_candidates')
                else self._get_subtopologies_of_type_cached(mini_topology)
            )
            for placement in placements:
                for pdx, pair in enumerate(partition.permutations_pairs[tdx]):
                    candidate = PartitionCandidate(
                        front_idx,
                        tdx,
                        pdx,
                        partition.circuit_structures[tdx][pdx],
                        pair[0],
                        pair[1],
                        placement,
                        mini_topology,
                        partition.qubit_map,
                        partition.involved_qbits,
                        cnot_count=partition.cnot_counts[tdx][pdx],
                    )
                    collected.append(candidate)
    return collected
def compute_distances_bfs(self, N):
    """Return the N x N hop-distance matrix of the device topology.

    One breadth-first search is run from every vertex over the undirected
    edges in ``self.config['topology']``. Unreachable pairs keep ``inf``.
    As a side effect the adjacency lists are stored in ``self._adj``
    (``self._adj[i]`` lists the neighbours of vertex ``i``) so routing code
    can reuse them.
    """
    neighbours = defaultdict(list)
    for a, b in self.config['topology']:
        neighbours[a].append(b)
        neighbours[b].append(a)

    D = np.full((N, N), np.inf)

    for source in range(N):
        D[source][source] = 0
        reached = {source}
        frontier = deque([source])
        while frontier:
            node = frontier.popleft()
            # Every newly discovered neighbour is one hop further than `node`.
            hops = D[source][node] + 1
            for nxt in neighbours[node]:
                if nxt in reached:
                    continue
                reached.add(nxt)
                D[source][nxt] = hops
                frontier.append(nxt)

    # Keep the adjacency lists around for the A* router.
    self._adj = [list(neighbours[i]) for i in range(N)]

    return D
def _try_vf2_layout(self, G_int, G_hw, N):
    """Embed the interaction graph into the hardware graph with VF2.

    Takes the first mapping produced by ``rustworkx.vf2_mapping`` with
    ``subgraph=True, induced=False`` (hardware edges may exist where the
    interaction graph has none). Returns ``pi`` with
    ``pi[logical] = physical``, or ``None`` when no embedding exists.
    Logical qubits absent from the mapping are assigned the remaining
    physical qubits in increasing order.
    """
    import rustworkx as rx

    # vf2_mapping yields {hw_node: int_node}; no item means no embedding.
    hw_to_logical = next(
        rx.vf2_mapping(G_hw, G_int, subgraph=True, induced=False), None
    )
    if hw_to_logical is None:
        return None

    logical_to_hw = {logical: hw for hw, logical in hw_to_logical.items()}
    occupied = set(logical_to_hw.values())
    spare_slots = iter([p for p in range(N) if p not in occupied])

    pi = np.zeros(N, dtype=int)
    for logical in range(N):
        if logical in logical_to_hw:
            pi[logical] = logical_to_hw[logical]
        else:
            pi[logical] = next(spare_slots)
    return pi
max(q, pq)), 0.0) + for pq in placed_logical + ) + + remaining.sort(key=_score, reverse=True) + + for logical_q in remaining: + best_physical = None + best_dist = float('inf') + + for physical_q in range(N): + if physical_q in placed_physical: + continue + + total_dist = 0.0 + total_w = 0.0 + for other_q in placed_logical: + key = (min(logical_q, other_q), max(logical_q, other_q)) + w = interaction_weight.get(key, 0.0) + if w > 0: + total_dist += D[physical_q][pi[other_q]] * w + total_w += w + + avg = total_dist / total_w if total_w > 0 else 0.0 + if avg < best_dist: + best_dist = avg + best_physical = physical_q + + if best_physical is not None: + holder = np.where(pi == best_physical)[0][0] + pi[logical_q], pi[holder] = best_physical, pi[logical_q] + placed_logical.add(logical_q) + placed_physical.add(best_physical) + + return pi diff --git a/squander/synthesis/PartAM_utils.py b/squander/synthesis/PartAM_utils.py new file mode 100644 index 000000000..69c2c9732 --- /dev/null +++ b/squander/synthesis/PartAM_utils.py @@ -0,0 +1,688 @@ +import heapq +import logging +from collections import defaultdict +from dataclasses import dataclass +from itertools import combinations, permutations +from typing import Dict, FrozenSet, List, Set, Tuple + +import numpy as np + +from squander.gates.qgd_Circuit import qgd_Circuit as Circuit + + +# ============================================================================ +# SWAP Routing Algorithms +# ============================================================================ +def _neighbor_signature(neighbor_info): + """Stable hash-friendly signature of an active neighbor_info. + + Returns None when the neighbor heuristic is inactive (no info, zero + weight, or empty edge list) — callers treat all such calls as cache- + compatible. Otherwise returns a tuple of (sorted edges as + (min(u,v), max(u,v), weight), initial_pos tuple, rounded weight). 
def find_constrained_swaps_partial(pi_A, pi_B_dict, dist_matrix, adj=None, neighbor_info=None):
    """
    Route partition qubits to their target physical positions using A* over
    the k-dimensional state space of partition qubit positions only.

    For k partition qubits on an n-node topology the state space has at most
    n*(n-1)*...*(n-k+1) distinct states (bounded above by n^k). For the
    typical case of k=2 or k=3 and n≤20 this is small (at most
    20*19*18 = 6840 states), so the search stays cheap while still finding a
    SWAP sequence of minimal length. When the neighbor tie-breaker is active
    the state additionally carries the tracked neighbor positions, so the
    number of explored states can be larger.

    The original full-state A* had O(n!) state space which was exponentially
    slow. The naive greedy replacement oscillated when two adjacent partition
    qubits needed to move in the same direction. This implementation avoids
    both problems.

    Args:
        pi_A         : List[int], pi_A[q] = current physical position of virtual qubit q.
        pi_B_dict    : Dict {q: target_physical} for the qubits that need routing.
        dist_matrix  : n×n distance/cost matrix; dist[i][j]==1 means i and j are adjacent.
                       Boolean-mask indexing is applied to it when neighbor_info
                       is active, so it must be a numpy array in that case.
        adj          : Optional adjacency list (adj[p] = physical neighbours of p).
                       Derived from dist_matrix when omitted.
        neighbor_info: Optional dict enabling the tie-breaker. Keys read here:
                       'edges' (iterable of (idx_u, idx_v, edge_weight), indices
                       into the tracked-qubit list), 'neighbor_vqs' (only its
                       length is used), 'weight' (tie-break weight) and
                       'initial_pos' (physical positions of the tracked qubits;
                       it becomes part of a dict key, so it must be hashable,
                       e.g. a tuple).

    Returns:
        swaps            : List of (P1, P2) adjacent-qubit SWAP operations, each
                           stored as (min, max).
        final_permutation: Updated virtual→physical mapping after all SWAPs.

        If the target is unreachable a warning is logged and
        ([], list(pi_A)) is returned, i.e. the layout is left unchanged.
    """
    n = len(pi_A)

    # Build adjacency list from dist_matrix if not provided
    if adj is None:
        adj = [[] for _ in range(n)]
        for i in range(n):
            for j in range(i + 1, n):
                if dist_matrix[i][j] == 1:
                    adj[i].append(j)
                    adj[j].append(i)

    # Sorted so that state tuples have a fixed qubit order.
    partition_qubits = sorted(pi_B_dict.keys())
    k = len(partition_qubits)

    initial_positions = tuple(int(pi_A[q]) for q in partition_qubits)
    target_positions = tuple(int(pi_B_dict[q]) for q in partition_qubits)

    # Nothing to route: return a copy of the layout and no SWAPs.
    if initial_positions == target_positions:
        return [], list(pi_A)

    def heuristic(positions):
        # Admissible lower bound: sum of individual distances / 2
        # (one SWAP moves at most two partition qubits by one hop each).
        return sum(dist_matrix[positions[i]][target_positions[i]] for i in range(k)) / 2

    # SABRE-aware tiebreaker: prefer SWAP paths that keep future-partition
    # qubits closer together. The weight is small enough to never override
    # optimality (same SWAP count), only break ties among equal-length paths.
    if neighbor_info is not None and neighbor_info['edges']:
        n_vqs = neighbor_info['neighbor_vqs']
        n_edges = neighbor_info['edges']  # list of (idx_u, idx_v, edge_weight)
        n_weight = neighbor_info['weight']
        initial_n_pos = neighbor_info['initial_pos']
        # Number of tracked neighbor qubits (used when building the
        # physical position → tracked index reverse map during expansion).
        _n_len = len(n_vqs)
        use_neighbor = True

        # Normalize so neighbor_heuristic returns values in [0, 1]
        # (for non-negative edge weights and finite pairwise distances).
        # This guarantees n_weight * neighbor_heuristic < 1 (for n_weight < 1),
        # so the tiebreaker never overrides SWAP-count optimality.
        _total_edge_weight = sum(w for _, _, w in n_edges)
        _diameter = int(np.max(dist_matrix[dist_matrix < np.inf])) if n > 1 else 1
        _norm = max(1.0, _total_edge_weight * _diameter)

        def neighbor_heuristic(n_pos):
            return sum(w * dist_matrix[n_pos[i]][n_pos[j]] for i, j, w in n_edges) / _norm
    else:
        initial_n_pos = ()
        n_weight = 0.0
        _n_len = 0
        use_neighbor = False

        def neighbor_heuristic(n_pos):
            return 0.0

    # A* over k-dimensional state space.
    # Each state is a tuple of physical positions, one per partition qubit.
    # Paths are reconstructed via a parent-pointer dict to avoid copying lists
    # on every heap push (which would be O(depth²) total).
    counter = 0  # tiebreak counter so tuples never compare paths
    # When the neighbor tie-breaker is active, the full search state must
    # include the tracked future-qubit positions. Otherwise two equal-length
    # paths to the same partition positions but different bystander layouts
    # collapse into one visited entry, defeating the downstream-layout signal.
    initial_state = (
        (initial_positions, initial_n_pos) if use_neighbor else initial_positions
    )
    parent = {}  # state_key → (parent_state_key, swap) for path reconstruction
    parent[initial_state] = None

    h0 = heuristic(initial_positions)
    nh0 = n_weight * neighbor_heuristic(initial_n_pos) if use_neighbor else 0.0
    heap = []
    # Heap entries: (f, g, insertion counter, partition positions, neighbor positions)
    heapq.heappush(heap, (h0 + nh0, 0, counter, initial_positions, initial_n_pos))
    visited = {initial_state: 0}  # state_key → best known SWAP count g

    while heap:
        f, g, _, positions, n_pos = heapq.heappop(heap)

        state_key = (positions, n_pos) if use_neighbor else positions

        if positions == target_positions:
            # Reconstruct swap path via parent pointers
            path = []
            state = state_key
            while parent[state] is not None:
                prev_state, swap = parent[state]
                path.append(swap)
                state = prev_state
            path.reverse()

            # Replay swaps on the full mapping to get final virt→phys
            final_v2p = list(pi_A)
            final_p2v = [0] * n
            for q_idx in range(n):
                final_p2v[int(final_v2p[q_idx])] = q_idx
            for P1, P2 in path:
                q1, q2 = final_p2v[P1], final_p2v[P2]
                final_p2v[P1], final_p2v[P2] = q2, q1
                final_v2p[q1], final_v2p[q2] = P2, P1
            return path, final_v2p

        # Stale heap entry: a cheaper way to this state was found meanwhile.
        if visited.get(state_key, float('inf')) < g:
            continue

        # Quick lookup: physical position → index within partition_qubits list
        pos_to_k_idx = {p: i for i, p in enumerate(positions)}

        # Build reverse map for neighbor displacement tracking
        if use_neighbor:
            n_phys_to_idx = {n_pos[idx]: idx for idx in range(_n_len)}

        # Expand: try every SWAP that moves at least one partition qubit
        for i, p in enumerate(positions):
            for nb in adj[p]:
                new_positions = list(positions)
                new_positions[i] = nb
                # If the neighbor also holds a partition qubit, swap it too
                if nb in pos_to_k_idx:
                    j = pos_to_k_idx[nb]
                    new_positions[j] = p
                new_positions = tuple(new_positions)

                new_g = g + 1
                # When a partition qubit swaps into nb, a tracked neighbor at nb
                # is displaced to p AND a tracked neighbor at p (if it overlaps
                # with a partition qubit) moves to nb. Update both sides.
                if use_neighbor:
                    new_n_pos = list(n_pos)
                    if nb in n_phys_to_idx:
                        new_n_pos[n_phys_to_idx[nb]] = p
                    if p in n_phys_to_idx:
                        new_n_pos[n_phys_to_idx[p]] = nb
                    new_n_pos = tuple(new_n_pos)
                    new_nh = n_weight * neighbor_heuristic(new_n_pos)
                else:
                    new_n_pos = n_pos
                    new_nh = 0.0

                new_state_key = (
                    (new_positions, new_n_pos) if use_neighbor else new_positions
                )
                # Skip states already reached with an equal or smaller SWAP count.
                if visited.get(new_state_key, float('inf')) <= new_g:
                    continue

                visited[new_state_key] = new_g
                swap_key = (min(p, nb), max(p, nb))
                parent[new_state_key] = (state_key, swap_key)
                counter += 1
                heapq.heappush(heap, (new_g + heuristic(new_positions) + new_nh,
                                      new_g, counter, new_positions, new_n_pos))

    # Search space exhausted without reaching the target positions.
    logging.warning(
        "find_constrained_swaps_partial: failed to route %s → %s",
        initial_positions, target_positions,
    )
    return [], list(pi_A)
def get_canonical_form(qubit_subset: Set[int], induced_edges: List[Tuple[int, int]]) -> FrozenSet[Tuple[int, int]]:
    """Return a label-independent fingerprint of a small graph.

    Every relabeling of the (sorted) qubits onto 0..n-1 is tried; the
    lexicographically smallest sorted edge tuple is kept and returned as a
    frozenset of (low, high) edges. Isomorphic graphs therefore produce the
    same value. Every endpoint in ``induced_edges`` must be a member of
    ``qubit_subset``. Cost grows as n!, so this is meant for small subsets.
    """
    ordered = sorted(qubit_subset)

    def relabeled_edges(perm):
        # Map the i-th smallest original label to perm[i].
        new_label = dict(zip(ordered, perm))
        return tuple(sorted(
            tuple(sorted((new_label[u], new_label[v])))
            for u, v in induced_edges
        ))

    # permutations() always yields at least one item (the empty permutation
    # for n == 0), so min() never sees an empty sequence.
    smallest = min(
        relabeled_edges(perm) for perm in permutations(range(len(ordered)))
    )
    return frozenset(smallest)
def get_subtopologies_of_type(edges: List[Tuple[int, int]], target_topology: List[Tuple[int, int]]) -> List[List[Tuple[int, int]]]:
    """List every placement of *target_topology* inside the graph *edges*.

    A placement is the induced edge list (global qubit labels kept, as needed
    for physical routing) of a connected node subset whose canonical form
    equals that of *target_topology*. An edgeless target is treated as a
    single-qubit pattern and yields one empty edge list per graph node.
    """
    target_qubits = {q for u, v in target_topology for q in (u, v)}
    # An empty target counts as one qubit; k is therefore always >= 1.
    k = max(1, len(target_qubits))

    graph_nodes = {q for u, v in edges for q in (u, v)}
    if k == 1:
        return [[] for _ in graph_nodes]
    if len(graph_nodes) < k:
        return []

    wanted = get_canonical_form(target_qubits, target_topology)
    placements = []
    for subset in combinations(sorted(graph_nodes), k):
        members = set(subset)
        induced = _get_induced_edges(edges, members)
        if _is_connected(members, induced) and get_canonical_form(members, induced) == wanted:
            placements.append(induced)  # global labels retained for routing
    return placements
def compute_automorphisms(mini_topology: List[Tuple[int, int]]) -> List[Tuple[int, ...]]:
    """Enumerate the automorphisms of a locally-labeled mini_topology.

    An automorphism is a permutation ``sigma`` of ``0..N-1`` (``N`` being the
    largest node label plus one) that maps the undirected edge set onto
    itself. All ``N!`` permutations are checked, which is cheap for the small
    partition sizes this is used with.

    Returns:
        The permutations as tuples, in ``itertools.permutations`` order, so
        the identity comes first. An edgeless topology yields ``[()]``.
    """
    labels = {q for u, v in mini_topology for q in (u, v)}
    if not labels:
        return [()]

    def undirected(a, b):
        return (min(a, b), max(a, b))

    size = max(labels) + 1
    reference = {undirected(u, v) for u, v in mini_topology}

    return [
        sigma
        for sigma in permutations(range(size))
        if {undirected(sigma[u], sigma[v]) for u, v in mini_topology} == reference
    ]
class SingleQubitPartitionResult:
    """Container for a partition that needs no routing decision.

    Stores the circuit and its parameters together with the qubits the
    circuit acts on. When ``original_qubits`` is not supplied, the qubits are
    read from ``circuit_in.get_Qbits()``.
    """

    def __init__(self, circuit_in, parameters_in, original_qubits=None):
        if original_qubits is None:
            original_qubits = circuit_in.get_Qbits()
        self.circuit = circuit_in
        self.parameters = parameters_in
        self.involved_qbits = original_qubits
topology + self._topology_cache = topology_cache + + def add_result(self, permutations_pair, synthesised_circuit, synthesised_parameters, topology_idx): + from squander.utils import circuit_to_CNOT_basis + + flat_circuit = synthesised_circuit.get_Flat_Circuit() + flat_circuit, synthesised_parameters = circuit_to_CNOT_basis( + flat_circuit, + np.asarray(synthesised_parameters), + ) + unsupported_multi = [ + gate.get_Name() + for gate in flat_circuit.get_Gates() + if len(gate.get_Involved_Qbits()) > 1 + and gate.get_Name() != "CNOT" + ] + if unsupported_multi: + raise ValueError( + "Partition synthesis produced non-CNOT multi-qubit gates " + f"after CNOT-basis conversion: {unsupported_multi}" + ) + self.permutations_pairs[topology_idx].append(permutations_pair) + self.synthesised_circuits[topology_idx].append(flat_circuit) + self.synthesised_parameters[topology_idx].append(synthesised_parameters) + self.cnot_counts[topology_idx].append(flat_circuit.get_Gate_Nums().get('CNOT', 0)) + self.circuit_structures[topology_idx].append(self.extract_circuit_structure(flat_circuit)) + + def extract_circuit_structure(self, circuit): + circuit_structure = [] + for gate in circuit.get_Gates(): + if gate.get_Name() == "Permutation": + continue + involved_qbits = gate.get_Involved_Qbits() + if len(involved_qbits) != 1: + circuit_structure.append(involved_qbits) + return circuit_structure + + def get_best_result(self, topology_idx): + best_index = np.argmin(self.cnot_counts[topology_idx]) + return self.permutations_pairs[topology_idx][best_index], self.synthesised_circuits[topology_idx][best_index], self.synthesised_parameters[topology_idx][best_index] + + def get_top_k_results(self, topology_idx, k): + counts = self.cnot_counts[topology_idx] + pairs = self.permutations_pairs[topology_idx] + if not counts: + return [] + indices = np.argsort(counts) + seen_pi = set() + result = [] + for i in indices: + pi_key = tuple(pairs[i][0]) + if pi_key not in seen_pi: + seen_pi.add(pi_key) + 
class PartitionCandidate:
    """One concrete way to execute a synthesized partition on the device.

    Notation used below: q = virtual qubit of the full circuit, q* = reduced
    (partition-local) virtual qubit, Q = physical qubit, Q* = reduced
    physical qubit of the mini topology.
    """

    def __init__(self, partition_idx, topology_idx, permutation_idx, circuit_structure, P_i, P_o, topology, mini_topology, qbit_map, involved_qbits, cnot_count=0):
        # Index of the partition this candidate belongs to.
        self.partition_idx = partition_idx
        # Index of the Q* mini topology within that partition.
        self.topology_idx = topology_idx
        # Index of the (P_i, P_o) pair for that mini topology.
        self.permutation_idx = permutation_idx
        # Multi-qubit gate structure of the synthesized circuit, in Q*.
        self.circuit_structure = circuit_structure
        # P_i: q* -> Q* pattern, e.g. [q*1 q*0 q*2] sends q*1 to Q* qubit 0.
        self.P_i = P_i
        # P_o: Q* -> q* pattern, e.g. [Q*1 Q*0 Q*2] means the output of Q*1
        # equals q*0.
        self.P_o = P_o
        # Placement of the mini topology on the device (labels in Q).
        self.topology = topology
        # The mini topology itself (labels in Q*).
        self.mini_topology = mini_topology
        # {q: q*}
        self.qbit_map = qbit_map
        # Virtual qubits q of the original circuit touched by the partition.
        self.involved_qbits = involved_qbits
        self.cnot_count = cnot_count
        # {Q*: Q}
        self.node_mapping = get_node_mapping(mini_topology, topology)

    def transform_pi(self, pi, D, swap_cache=None, reverse=False, adj=None, neighbor_info=None):
        """Route the partition's qubits into place and return the layout after it.

        The synthesized circuit implements
        add_Permutation(P_i) -> Original -> add_Permutation(P_o).
        In forward mode the qubits are routed to the entry positions derived
        from the inverse of P_i and the layout is then updated to the exit
        positions derived from P_o. In reverse mode the partition is
        traversed backwards, so the roles of P_i and P_o are exchanged.

        Returns:
            ``(swaps, pi_output)``: the SWAP list produced by
            ``find_constrained_swaps_partial`` and the virtual->physical
            layout after the partition has executed.
        """
        if reverse:
            entry_perm, exit_perm = self.P_o, self.P_i
        else:
            entry_perm, exit_perm = self.P_i, self.P_o

        entry_inverse = [entry_perm.index(slot) for slot in range(len(entry_perm))]
        # {q: Q} target physical position of every partition qubit on entry.
        entry_targets = {
            virt: self.node_mapping[entry_inverse[local]]
            for virt, local in self.qbit_map.items()
        }
        # Plain Python ints (pi may contain np.int64).
        layout = [int(p) for p in pi]

        def route():
            return find_constrained_swaps_partial(
                layout, entry_targets, D, adj=adj, neighbor_info=neighbor_info)

        if swap_cache is None:
            swaps, routed_layout = route()
        else:
            # Key on (pi, qbit_map, neighbor_signature): the signature captures
            # the neighbor-heuristic context, so hits across calls with the
            # same active neighbor_info are safe.
            cache_key = (
                tuple(layout),
                frozenset(entry_targets.items()),
                _neighbor_signature(neighbor_info),
            )
            if cache_key in swap_cache:
                swaps, routed_layout = swap_cache[cache_key]
            else:
                swaps, routed_layout = route()
                swap_cache[cache_key] = (swaps, routed_layout)

        # Copy so the cached layout is never modified.
        pi_output = routed_layout.copy()
        local_to_virtual = {local: virt for virt, local in self.qbit_map.items()}
        for local, exit_slot in enumerate(exit_perm):
            if local in local_to_virtual:
                pi_output[local_to_virtual[local]] = self.node_mapping[exit_slot]
        return swaps, pi_output

    def estimate_swap_count(self, pi, D, reverse=False) -> int:
        """Cheap lower bound on the SWAPs needed to enter this partition.

        Sums the finite distances between each partition qubit's current
        physical position and its entry target and returns
        floor(sum_of_distances / 2), the same admissible bound the A* search
        uses internally. Infinite distances are ignored.
        """
        entry_perm = self.P_o if reverse else self.P_i
        entry_inverse = [entry_perm.index(slot) for slot in range(len(entry_perm))]
        distance_sum = 0.0
        for virt, local in self.qbit_map.items():
            target_phys = self.node_mapping[entry_inverse[local]]
            current_phys = int(pi[virt])
            hop = D[current_phys][target_phys]
            if np.isinf(hop):
                continue
            distance_sum += hop
        return int(distance_sum / 2)

    def get_final_circuit(self,optimized_partitions,N):
        """Return the synthesized circuit remapped onto the device, plus its parameters.

        The circuit stored for this candidate's (topology, permutation) slot
        is flattened and its qubits are remapped with ``node_mapping`` onto
        an ``N``-qubit register.
        """
        source = optimized_partitions[self.partition_idx]
        tdx, pdx = self.topology_idx, self.permutation_idx
        part_parameters = source.synthesised_parameters[tdx][pdx]
        flat = source.synthesised_circuits[tdx][pdx].get_Flat_Circuit()
        part_circuit = flat.Remap_Qbits(self.node_mapping, N)
        return part_circuit, part_parameters
def check_circuit_compatibility(circuit: Circuit, topology):
    """Check that every two-qubit gate of ``circuit`` acts on an edge of ``topology``.

    Edges are treated as undirected: a gate on ``(a, b)`` is accepted when
    the topology lists either ``(a, b)`` or ``(b, a)``. Topology edges may be
    given as tuples or as lists.

    Args:
        circuit: circuit whose gates (and nested sub-circuits) are inspected.
        topology: iterable of qubit pairs describing the allowed couplings.

    Returns:
        True when all two-qubit interactions lie on topology edges,
        False otherwise.
    """

    def undirected(edge):
        # Order-insensitive, hashable key for a qubit pair.
        return tuple(sorted(edge))

    used_edges = set()

    def collect_two_qubit_edges(gate):
        if isinstance(gate, Circuit):
            for subgate in gate.get_Gates():
                collect_two_qubit_edges(subgate)
            return

        qubits = gate.get_Involved_Qbits()
        if len(qubits) == 1:
            return
        if len(qubits) == 2:
            used_edges.add(undirected(qubits))
            return

        # Any other qubit count is treated as a container of gates.
        # NOTE(review): this relies on such gates exposing get_Gates().
        for subgate in gate.get_Gates():
            collect_two_qubit_edges(subgate)

    for gate in circuit.get_Gates():
        collect_two_qubit_edges(gate)

    allowed_edges = {undirected(edge) for edge in topology}
    return used_edges <= allowed_edges
+*/ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "sabre_router.hpp" + +namespace py = pybind11; +using namespace squander::routing; + +// --------------------------------------------------------------------------- +// Helper: extract fields from a Python PartitionCandidate object into CandidateData +// --------------------------------------------------------------------------- + +static CandidateData extract_candidate(py::handle pc) { + CandidateData cd; + cd.partition_idx = pc.attr("partition_idx").cast(); + cd.topology_idx = pc.attr("topology_idx").cast(); + cd.permutation_idx = pc.attr("permutation_idx").cast(); + cd.cnot_count = pc.attr("cnot_count").cast(); + cd.has_multi_qubit_body = py::len(pc.attr("circuit_structure")) > 0; + + // P_i, P_o: tuples of ints + cd.P_i = pc.attr("P_i").cast>(); + cd.P_o = pc.attr("P_o").cast>(); + + // node_mapping: dict {Q* -> Q} -> flatten to dense array + py::dict nm = pc.attr("node_mapping"); + int max_qstar = -1; + for (auto [key, val] : nm) { + int qs = key.cast(); + if (qs > max_qstar) max_qstar = qs; + } + cd.node_mapping_flat.resize(max_qstar + 1, -1); + for (auto [key, val] : nm) { + cd.node_mapping_flat[key.cast()] = val.cast(); + } + + // qbit_map: dict {q -> q*} + py::dict qm = pc.attr("qbit_map"); + cd.qbit_map_keys.reserve(py::len(qm)); + cd.qbit_map_vals.reserve(py::len(qm)); + for (auto [key, val] : qm) { + cd.qbit_map_keys.push_back(key.cast()); + cd.qbit_map_vals.push_back(val.cast()); + } + + // involved_qbits: tuple of ints + cd.involved_qbits = pc.attr("involved_qbits").cast>(); + + return cd; +} + +// --------------------------------------------------------------------------- +// Helper: extract canonical_data dict -> unordered_map +// --------------------------------------------------------------------------- + +static std::vector extract_int_array(py::handle obj) { + std::vector result; + auto arr = py::array_t::ensure(obj); + if (!arr) { + return 
result; + } + auto acc = arr.unchecked<1>(); + result.resize(acc.shape(0)); + for (py::ssize_t i = 0; i < acc.shape(0); i++) { + result[i] = acc(i); + } + return result; +} + +static std::unordered_map extract_canonical_data(py::dict cd) { + std::unordered_map result; + for (auto [key, val] : cd) { + int pidx = key.cast(); + CanonicalEntry entry; + py::dict d = py::reinterpret_borrow(val); + if (d.contains("edges_u") && !d["edges_u"].is_none()) { + entry.edges_u = extract_int_array(d["edges_u"]); + } + if (d.contains("edges_v") && !d["edges_v"].is_none()) { + entry.edges_v = extract_int_array(d["edges_v"]); + } + entry.cnot = d["cnot"].cast(); + result[pidx] = std::move(entry); + } + return result; +} + +// --------------------------------------------------------------------------- +// Helper: extract layout_partitions list -> vector +// --------------------------------------------------------------------------- + +static std::vector extract_layout_partitions(py::list lp) { + std::vector result; + result.reserve(py::len(lp)); + for (auto item : lp) { + py::dict d = py::reinterpret_borrow(item); + LayoutPartInfo info; + info.is_single = d["is_single"].cast(); + info.involved_qbits = d["involved_qbits"].cast>(); + result.push_back(std::move(info)); + } + return result; +} + +// --------------------------------------------------------------------------- +// Module definition +// --------------------------------------------------------------------------- + +PYBIND11_MODULE(_sabre_router, m) { + m.doc() = "SQUANDER SABRE Routing Engine - C++ Backend"; + + // Bind SabreConfig + py::class_(m, "SabreConfig") + .def(py::init<>()) + .def_readwrite("prefilter_top_k", &SabreConfig::prefilter_top_k) + .def_readwrite("prefilter_min_per_partition", &SabreConfig::prefilter_min_per_partition) + .def_readwrite("prefilter_min_3q", &SabreConfig::prefilter_min_3q) + .def_readwrite("max_E_size", &SabreConfig::max_E_size) + .def_readwrite("max_lookahead", &SabreConfig::max_lookahead) + 
.def_readwrite("E_weight", &SabreConfig::E_weight) + .def_readwrite("E_alpha", &SabreConfig::E_alpha) + .def_readwrite("cnot_cost", &SabreConfig::cnot_cost) + .def_readwrite("sabre_iterations", &SabreConfig::sabre_iterations) + .def_readwrite("n_layout_trials", &SabreConfig::n_layout_trials) + .def_readwrite("random_seed", &SabreConfig::random_seed) + .def_readwrite("decay_delta", &SabreConfig::decay_delta) + .def_readwrite("swap_burst_budget", &SabreConfig::swap_burst_budget) + .def_readwrite("path_tiebreak_weight", &SabreConfig::path_tiebreak_weight) + .def_readwrite("three_qubit_exit_weight", &SabreConfig::three_qubit_exit_weight) + .def_readwrite("boundary_beam_width", &SabreConfig::boundary_beam_width) + .def_readwrite("boundary_beam_depth", &SabreConfig::boundary_beam_depth); + + // Bind SabreRouter with data-converting constructor + py::class_(m, "SabreRouter") + .def(py::init( + [](const SabreConfig& config, + py::array_t D_arr, + std::vector> adj, + std::vector> DAG, + std::vector> IDAG, + py::list candidate_cache_py, + py::list layout_partitions_py, + py::dict canonical_data_fwd_py, + py::dict canonical_data_rev_py + ) { + // Extract D matrix + auto buf = D_arr.request(); + if (buf.ndim != 2 || buf.shape[0] != buf.shape[1]) { + throw std::invalid_argument("D must be a square 2D array"); + } + int N = static_cast(buf.shape[0]); + std::vector D_flat(N * N); + auto* ptr = static_cast(buf.ptr); + std::copy(ptr, ptr + N * N, D_flat.begin()); + + // Convert candidate_cache: list of lists of PartitionCandidate + std::vector> cc; + cc.reserve(py::len(candidate_cache_py)); + for (auto part_cands : candidate_cache_py) { + std::vector cands; + py::list cl = py::reinterpret_borrow(part_cands); + cands.reserve(py::len(cl)); + for (auto c : cl) { + auto cd = extract_candidate(c); + cd.candidate_idx = static_cast(cands.size()); + cands.push_back(std::move(cd)); + } + cc.push_back(std::move(cands)); + } + + auto lp = extract_layout_partitions(layout_partitions_py); + 
auto cd_fwd = extract_canonical_data(canonical_data_fwd_py); + auto cd_rev = extract_canonical_data(canonical_data_rev_py); + + return new SabreRouter( + config, N, std::move(D_flat), std::move(adj), std::move(DAG), std::move(IDAG), + std::move(cc), std::move(lp), std::move(cd_fwd), std::move(cd_rev) + ); + }), + py::arg("config"), + py::arg("D"), + py::arg("adj"), + py::arg("DAG"), + py::arg("IDAG"), + py::arg("candidate_cache"), + py::arg("layout_partitions"), + py::arg("canonical_data_fwd"), + py::arg("canonical_data_rev") + ) + .def("route_forward", + [](const SabreRouter& self, + const std::vector& pi + ) -> py::tuple { + py::gil_scoped_release release; + auto result = self.route_forward(pi); + py::gil_scoped_acquire acquire; + py::list steps; + for (const auto& step : result.steps) { + if (step.type == 0) { + steps.append(py::make_tuple("swap", step.swaps)); + } else if (step.type == 1) { + steps.append(py::make_tuple("partition", step.partition_idx, step.candidate_idx)); + } else { + steps.append(py::make_tuple("single", step.partition_idx, step.physical_qubit)); + } + } + return py::make_tuple(result.cnot_count, result.pi, result.pi_initial, steps); + }, + py::arg("pi"), + "Run actual forward routing and return CNOT count, final pi, initial pi, and route steps" + ) + .def("run_trial", + [](const SabreRouter& self, + int trial_idx, + const std::vector& seeded_pi, + int n_iterations, + int n_trials + ) -> py::tuple { + py::gil_scoped_release release; + auto result = self.run_trial(trial_idx, seeded_pi, n_iterations, n_trials); + py::gil_scoped_acquire acquire; + return py::make_tuple(result.total_cost, result.pi); + }, + py::arg("trial_idx"), + py::arg("seeded_pi"), + py::arg("n_iterations"), + py::arg("n_trials"), + "Run a single layout trial (GIL-free, thread-safe)" + ); +} diff --git a/squander/synthesis/qgd_SABRE.py b/squander/synthesis/qgd_SABRE.py index 924bf361d..aecad803a 100644 --- a/squander/synthesis/qgd_SABRE.py +++ 
b/squander/synthesis/qgd_SABRE.py @@ -68,11 +68,11 @@ def _compute_smart_initial_layout(self, circuit): gates = circuit.get_Gates() for gate in gates: - if gate.get_Control_Qbit() != -1: - q1 = gate.get_Target_Qbit() - q2 = gate.get_Control_Qbit() - if q1 < self.circuit_qbit_num and q2 < self.circuit_qbit_num: - key = (min(q1, q2), max(q1, q2)) + q_control = gate.get_Control_Qbit() + if q_control != -1: + q_target = gate.get_Target_Qbit() + if q_target < self.circuit_qbit_num and q_control < self.circuit_qbit_num: + key = (min(q_target, q_control), max(q_target, q_control)) interaction_count[key] += 1 if not interaction_count: diff --git a/squander/utils.py b/squander/utils.py index d33eec17b..1ea558a51 100644 --- a/squander/utils.py +++ b/squander/utils.py @@ -130,7 +130,7 @@ def qasm_to_squander_circuit(filename: str, return_transpiled=False): for n in dir(gate) if not n.startswith("_") and issubclass(getattr(gate, n), gate.Gate) - and n not in ("Gate", "CROT", "CR", "SYC") + and n not in ("Gate", "CROT", "CR", "SYC","Permutation") } if any(gate.operation.name not in SUPPORTED_GATES_NAMES for gate in qc.data): qc_transpiled = qiskit.transpile( @@ -460,6 +460,7 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray): RXX, RYY, RZZ, + Permutation, ) gates = circ.get_Gates() @@ -652,6 +653,24 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray): circuit.add_CNOT(t2, t1) circuit.add_CNOT(t1, t2) params.append([]) + elif isinstance(gate, Permutation): + pattern = list(gate.get_Pattern()) + inverse_pattern = [0] * len(pattern) + for idx, mapped_idx in enumerate(pattern): + inverse_pattern[mapped_idx] = idx + current = list(range(len(pattern))) + for idx, target in enumerate(inverse_pattern): + swap_idx = current.index(target) + if swap_idx == idx: + continue + circuit.add_CNOT(idx, swap_idx) + circuit.add_CNOT(swap_idx, idx) + circuit.add_CNOT(idx, swap_idx) + current[idx], current[swap_idx] = ( + current[swap_idx], + current[idx], + ) + 
params.append([]) elif isinstance(gate, RXX): t1, t2 = gate.get_Target_Qbits() circuit.add_CNOT(t1, t2) @@ -695,6 +714,8 @@ def circuit_to_CNOT_basis(circ: Circuit, parameters: np.ndarray): ] ) + if not params: + return circuit, np.array([]) return circuit, np.concatenate(params) diff --git a/tests/decomposition/test_IBM.py b/tests/decomposition/test_IBM.py index 2e2479307..eafd42fab 100644 --- a/tests/decomposition/test_IBM.py +++ b/tests/decomposition/test_IBM.py @@ -251,10 +251,12 @@ def test_IBM_Chellenge_tree_search(self): data = loadmat('data/Umtx.mat') # The unitary to be decomposed Umtx = data['Umtx'] - + #turn off OSR + config = {"use_osr":0} + # creating a class to decompose the unitary - cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T ) + cDecompose = N_Qubit_Decomposition_Tree_Search( Umtx.conj().T, config = config ) # setting the verbosity of the decomposition @@ -305,10 +307,11 @@ def test_IBM_Chellenge_tabu_search(self): data = loadmat('data/Umtx.mat') # The unitary to be decomposed Umtx = data['Umtx'] - + #turn off OSR + config = {"use_osr":0} # creating a class to decompose the unitary - cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T ) + cDecompose = N_Qubit_Decomposition_Tabu_Search( Umtx.conj().T,config=config ) # setting the verbosity of the decomposition diff --git a/tests/gates/test_Permutation.py b/tests/gates/test_Permutation.py new file mode 100644 index 000000000..4a4ecbb06 --- /dev/null +++ b/tests/gates/test_Permutation.py @@ -0,0 +1,490 @@ +''' +Copyright 2020 Peter Rakyta, Ph.D. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
def test_permutation_creation_swap(self):
    """
    A gate exchanging the first and the last qubit reports the pattern it was built from
    """
    for qbit_num in range(2, 6):
        # [n-1, 1, 2, ..., n-2, 0]: only the two outermost entries are exchanged
        pattern = [qbit_num - 1, *range(1, qbit_num - 1), 0]
        perm_gate = Permutation(qbit_num, pattern)

        assert perm_gate.get_Pattern() == pattern
def test_permutation_creation_duplicates(self):
    """
    Patterns that repeat a qubit index must be rejected with a ValueError
    """
    qbit_num = 3
    # Each pattern has the right length and range but lists one index twice
    for repeated_pattern in ([0, 1, 1], [0, 0, 2]):
        with pytest.raises(ValueError, match="duplicate"):
            Permutation(qbit_num, repeated_pattern)
def test_permutation_set_pattern(self):
    """
    set_Pattern replaces the stored pattern, and can do so repeatedly
    """
    perm_gate = Permutation(4, [0, 1, 2, 3])

    # Overwrite the identity pattern twice in a row; each call must stick
    for replacement in ([3, 2, 1, 0], [1, 0, 3, 2]):
        perm_gate.set_Pattern(replacement)
        assert perm_gate.get_Pattern() == replacement
def test_permutation_get_matrix_unitary(self):
    """
    Test that permutation matrices are unitary
    """
    # Every pattern is enumerated instead of drawing one from the unseeded
    # global RNG, so the test is exhaustive and reproducible.
    for qbit_num in range(1, 5):
        identity = np.eye(2**qbit_num, dtype=np.complex128)
        for pattern_tuple in permutations(range(qbit_num)):
            pattern = list(pattern_tuple)
            perm_gate = Permutation(qbit_num, pattern)
            matrix = perm_gate.get_Matrix()

            # Check unitarity: U @ U^dagger = I
            unitary_check = matrix @ matrix.conj().T
            error = np.linalg.norm(unitary_check - identity)
            assert error < 1e-10, f"Matrix not unitary for pattern {pattern}"
def test_permutation_apply_to_matrix(self):
    """
    Test applying permutation to a matrix
    """
    qbit_num = 3
    pattern = [2, 0, 1]  # Rotate: 0->2, 1->0, 2->1
    perm_gate = Permutation(qbit_num, pattern)

    # A locally seeded generator keeps the input reproducible without
    # touching the global NumPy random state.
    rng = np.random.default_rng(42)
    matrix_size = 2**qbit_num
    test_matrix = rng.random((matrix_size, matrix_size)) + 1j * rng.random((matrix_size, matrix_size))
    test_matrix = test_matrix / np.linalg.norm(test_matrix)

    # Apply permutation to a copy so the original stays available for comparison
    test_matrix_copy = test_matrix.copy()
    perm_gate.apply_to(test_matrix_copy)

    # The pattern above is fixed and is not the identity, so the matrix
    # must have changed.
    assert not np.allclose(test_matrix_copy, test_matrix), "Permutation should change matrix"
def test_permutation_inverse(self):
    """
    Test that applying permutation and its inverse gives identity
    """
    # A locally seeded generator makes both the shuffled patterns and the
    # random states reproducible, so a failure can be replayed exactly.
    rng = np.random.default_rng(42)
    for qbit_num in range(2, 5):
        pattern = list(range(qbit_num))
        rng.shuffle(pattern)

        # Compute inverse permutation: inverse_pattern[pattern[i]] == i
        inverse_pattern = [0] * qbit_num
        for position, image in enumerate(pattern):
            inverse_pattern[image] = position

        perm = Permutation(qbit_num, pattern)
        perm_inv = Permutation(qbit_num, inverse_pattern)

        # Create normalised test state
        matrix_size = 2**qbit_num
        state = rng.random(matrix_size) + 1j * rng.random(matrix_size)
        state = state / np.linalg.norm(state)

        # Apply permutation then inverse
        state_transformed = state.copy()
        perm.apply_to(state_transformed)
        perm_inv.apply_to(state_transformed)

        error = np.linalg.norm(state_transformed - state)
        assert error < 1e-10, f"Inverse permutation failed for pattern {pattern}"
def test_permutation_large_patterns(self):
    """
    Permutation gates on 5, 6 and 7 qubits: identity matrix and unitarity checks
    """
    for qbit_num in (5, 6, 7):
        reference = np.eye(2**qbit_num, dtype=np.complex128)

        # The identity pattern must produce the identity matrix
        identity_gate = Permutation(qbit_num, list(range(qbit_num)))
        deviation = np.linalg.norm(identity_gate.get_Matrix() - reference)
        assert deviation < 1e-10, f"Large identity permutation failed for {qbit_num} qubits"

        # A shuffled pattern (the global RNG is re-seeded for every size)
        np.random.seed(42)
        shuffled = list(range(qbit_num))
        np.random.shuffle(shuffled)
        shuffled_gate = Permutation(qbit_num, shuffled)

        # U @ U^dagger has to reproduce the identity
        unitary = shuffled_gate.get_Matrix()
        deviation = np.linalg.norm(unitary @ unitary.conj().T - reference)
        assert deviation < 1e-10, f"Large permutation not unitary for {qbit_num} qubits"