Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parallelize sampling measure #2049

Merged
merged 10 commits into from
Mar 19, 2024
5 changes: 4 additions & 1 deletion qiskit_aer/backends/aerbackend.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,10 @@ def target(self):
if self._target is not None:
return self._target

return convert_to_target(self.configuration(), self.properties(), None, NAME_MAPPING)
tgt = convert_to_target(self.configuration(), self.properties(), None, NAME_MAPPING)
if self._coupling_map is not None:
tgt._coupling_graph = self._coupling_map.graph.copy()
return tgt

def clear_options(self):
"""Reset the simulator options to default values."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
features:
- |
Added a BitVector class that stores classical bits in place of reg_t,
reducing memory usage and memory bandwidth.
upgrade:
- |
Parallelized the previously serial loops in sampling measure to speed up
simulations with a large number of shots.
17 changes: 6 additions & 11 deletions src/controllers/state_controller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1458,10 +1458,10 @@ std::vector<std::string> AerState::sample_memory(const reg_t &qubits,

std::vector<std::string> ret;
ret.reserve(shots);
std::vector<reg_t> samples = state_->sample_measure(qubits, shots, rng_);
std::vector<SampleVector> samples =
state_->sample_measure(qubits, shots, rng_);
for (auto &sample : samples) {
ret.push_back(
Utils::int2string(Utils::reg2int(sample, 2), 2, qubits.size()));
ret.push_back(sample.to_string());
}
return ret;
}
Expand All @@ -1472,16 +1472,11 @@ std::unordered_map<uint_t, uint_t> AerState::sample_counts(const reg_t &qubits,

flush_ops();

std::vector<reg_t> samples = state_->sample_measure(qubits, shots, rng_);
std::vector<SampleVector> samples =
state_->sample_measure(qubits, shots, rng_);
std::unordered_map<uint_t, uint_t> ret;
for (const auto &sample : samples) {
uint_t sample_u = 0ULL;
uint_t mask = 1ULL;
for (const auto b : sample) {
if (b)
sample_u |= mask;
mask <<= 1;
}
uint_t sample_u = sample(0); // only the first 64bits is used
if (ret.find(sample_u) == ret.end())
ret[sample_u] = 1ULL;
else
Expand Down
101 changes: 70 additions & 31 deletions src/simulators/circuit_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ class Executor : public Base {
template <typename InputIterator>
void measure_sampler(InputIterator first_meas, InputIterator last_meas,
uint_t shots, state_t &state, ExperimentResult &result,
RngEngine &rng, bool save_creg_to_state = false) const;
RngEngine &rng) const;

#ifdef AER_MPI
void gather_creg_memory(std::vector<ClassicalRegister> &cregs,
Expand All @@ -219,14 +219,14 @@ class Executor : public Base {

// Sample n-measurement outcomes without applying the measure operation
// to the system state
virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const {
std::vector<reg_t> ret;
virtual std::vector<SampleVector>
sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) const {
std::vector<SampleVector> ret;
return ret;
};
virtual std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
uint_t shots,
std::vector<RngEngine> &rng) const {
virtual std::vector<SampleVector>
sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
std::vector<RngEngine> &rng) const {
// this handles the single-rng case; implement in a sub-class for the multi-shot case
return state.sample_measure(qubits, shots, rng[0]);
}
Expand Down Expand Up @@ -1033,8 +1033,7 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
InputIterator last_meas, uint_t shots,
state_t &state,
ExperimentResult &result,
RngEngine &rng,
bool save_creg_to_state) const {
RngEngine &rng) const {
// Check if meas_circ is empty, and if so return initial creg
if (first_meas == last_meas) {
while (shots-- > 0) {
Expand Down Expand Up @@ -1065,7 +1064,7 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,

// Generate the samples
auto timer_start = myclock_t::now();
std::vector<reg_t> all_samples;
std::vector<SampleVector> all_samples;
all_samples = state.sample_measure(meas_qubits, shots, rng);
auto time_taken =
std::chrono::duration<double>(myclock_t::now() - timer_start).count();
Expand Down Expand Up @@ -1095,30 +1094,70 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
(memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
uint_t num_registers =
(register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
ClassicalRegister creg;
for (int_t i = all_samples.size() - 1; i >= 0; i--) {
creg.initialize(num_memory, num_registers);

// process memory bit measurements
for (const auto &pair : memory_map) {
creg.store_measure(reg_t({all_samples[i][pair.second]}),
reg_t({pair.first}), reg_t());
}
// process register bit measurements
for (const auto &pair : register_map) {
creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
reg_t({pair.first}));
}

// process read out errors for memory and registers
for (const Operations::Op &roerror : roerror_ops)
creg.apply_roerror(roerror, rng);
if (roerror_ops.size() > 0) {
// cannot parallelize when there are readout errors because of the rng
ClassicalRegister creg;
for (uint_t is = 0; is < all_samples.size(); is++) {
uint_t i = all_samples.size() - is - 1;
creg.initialize(num_memory, num_registers);

// process memory bit measurements
for (const auto &pair : memory_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t({pair.first}), reg_t());
}
// process register bit measurements
for (const auto &pair : register_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t(), reg_t({pair.first}));
}

// process read out errors for memory and registers
for (const Operations::Op &roerror : roerror_ops)
creg.apply_roerror(roerror, rng);

// Save count data
if (save_creg_to_state)
state.creg() = creg;
else
// Save count data
result.save_count_data(creg, save_creg_memory_);
}
} else {
uint_t npar = parallel_state_update_;
if (npar > all_samples.size())
npar = all_samples.size();

std::vector<ExperimentResult> par_results(npar);
auto copy_samples_lambda = [this, &par_results, num_memory, num_registers,
memory_map, register_map, npar,
&all_samples](int_t ip) {
ClassicalRegister creg;
uint_t is, ie;
is = all_samples.size() * ip / npar;
ie = all_samples.size() * (ip + 1) / npar;
for (; is < ie; is++) {
uint_t i = all_samples.size() - is - 1;
creg.initialize(num_memory, num_registers);

// process memory bit measurements
for (const auto &pair : memory_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t({pair.first}), reg_t());
}
// process register bit measurements
for (const auto &pair : register_map) {
creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
reg_t(), reg_t({pair.first}));
}

// Save count data
par_results[ip].save_count_data(creg, save_creg_memory_);
}
};
Utils::apply_omp_parallel_for((npar > 1), 0, npar, copy_samples_lambda,
npar);

for (int_t i = 0; i < npar; i++) {
result.combine(std::move(par_results[i]));
}
}
}

Expand Down
66 changes: 34 additions & 32 deletions src/simulators/density_matrix/densitymatrix_executor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,8 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,

// Sample n-measurement outcomes without applying the measure operation
// to the system state
std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const override;
std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const override;

rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root,
const reg_t &qubits);
Expand All @@ -180,9 +180,9 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
const reg_t &cmemory, const reg_t &cregister);

std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
uint_t shots,
std::vector<RngEngine> &rng) const override;
std::vector<SampleVector>
sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
std::vector<RngEngine> &rng) const override;

//-----------------------------------------------------------------------
// Functions for multi-chunk distribution
Expand Down Expand Up @@ -1215,15 +1215,14 @@ void Executor<densmat_t>::measure_reset_update(const reg_t &qubits,
}

template <class densmat_t>
std::vector<reg_t> Executor<densmat_t>::sample_measure(const reg_t &qubits,
uint_t shots,
RngEngine &rng) const {
std::vector<SampleVector>
Executor<densmat_t>::sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) const {
// Generate flat register for storing
std::vector<double> rnds;
rnds.reserve(shots);
for (uint_t i = 0; i < shots; ++i)
rnds.push_back(rng.rand(0, 1));
reg_t allbit_samples(shots, 0);

uint_t i, j;
std::vector<double> chunkSum(Base::states_.size() + 1, 0);
Expand Down Expand Up @@ -1322,20 +1321,27 @@ std::vector<reg_t> Executor<densmat_t>::sample_measure(const reg_t &qubits,
#ifdef AER_MPI
BasePar::reduce_sum(local_samples);
#endif
allbit_samples = local_samples;

// Convert to reg_t format
std::vector<reg_t> all_samples;
all_samples.reserve(shots);
for (int_t val : allbit_samples) {
reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
reg_t sample;
sample.reserve(qubits.size());
for (uint_t qubit : qubits) {
sample.push_back(allbit_sample[qubit]);
// Convert to SampleVector format
int_t npar = Base::parallel_state_update_;
if (npar > local_samples.size())
npar = local_samples.size();
std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));

auto convert_to_bit_lambda = [this, &local_samples, &all_samples, shots,
qubits, npar](int_t i) {
uint_t ishot, iend;
ishot = local_samples.size() * i / npar;
iend = local_samples.size() * (i + 1) / npar;
for (; ishot < iend; ishot++) {
SampleVector allbit_sample;
allbit_sample.from_uint(local_samples[ishot], qubits.size());
all_samples[ishot].map(allbit_sample, qubits);
}
all_samples.push_back(sample);
}
};
Utils::apply_omp_parallel_for(
(npar > 1 && BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
npar, convert_to_bit_lambda, npar);
return all_samples;
}

Expand Down Expand Up @@ -1439,7 +1445,7 @@ void Executor<state_t>::apply_measure(CircuitExecutor::Branch &root,
}

template <class state_t>
std::vector<reg_t>
std::vector<SampleVector>
Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
uint_t shots,
std::vector<RngEngine> &rng) const {
Expand All @@ -1454,17 +1460,13 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
auto allbit_samples = state.qreg().sample_measure(rnds);
state.qreg().enable_batch(flg);

// Convert to reg_t format
std::vector<reg_t> all_samples;
all_samples.reserve(shots);
// Convert to bit format
std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
i = 0;
for (int_t val : allbit_samples) {
reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
reg_t sample;
sample.reserve(qubits.size());
for (uint_t qubit : qubits) {
sample.push_back(allbit_sample[qubit]);
}
all_samples.push_back(sample);
SampleVector allbit_sample;
allbit_sample.from_uint(val, qubits.size());
all_samples[i++].map(allbit_sample, qubits);
}
return all_samples;
}
Expand Down
40 changes: 24 additions & 16 deletions src/simulators/density_matrix/densitymatrix_state.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,8 @@ class State : public QuantumState::State<densmat_t> {

// Sample n-measurement outcomes without applying the measure operation
// to the system state
std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) override;
std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
RngEngine &rng) override;

// Helper function for computing expectation value
double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
Expand Down Expand Up @@ -987,9 +987,9 @@ void State<densmat_t>::measure_reset_update(const reg_t &qubits,
}

template <class densmat_t>
std::vector<reg_t> State<densmat_t>::sample_measure(const reg_t &qubits,
uint_t shots,
RngEngine &rng) {
std::vector<SampleVector> State<densmat_t>::sample_measure(const reg_t &qubits,
uint_t shots,
RngEngine &rng) {
// Generate flat register for storing
std::vector<double> rnds;
rnds.reserve(shots);
Expand All @@ -999,18 +999,26 @@ std::vector<reg_t> State<densmat_t>::sample_measure(const reg_t &qubits,

allbit_samples = BaseState::qreg_.sample_measure(rnds);

// Convert to reg_t format
std::vector<reg_t> all_samples;
all_samples.reserve(shots);
for (int_t val : allbit_samples) {
reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits());
reg_t sample;
sample.reserve(qubits.size());
for (uint_t qubit : qubits) {
sample.push_back(allbit_sample[qubit]);
// Convert to bit format
int_t npar = BaseState::threads_;
if (npar > shots)
npar = shots;
std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));

auto convert_to_bit_lambda = [this, &allbit_samples, &all_samples, shots,
qubits, npar](int_t i) {
uint_t ishot, iend;
ishot = shots * i / npar;
iend = shots * (i + 1) / npar;
for (; ishot < iend; ishot++) {
SampleVector allbit_sample;
allbit_sample.from_uint(allbit_samples[ishot], qubits.size());
all_samples[ishot].map(allbit_sample, qubits);
}
all_samples.push_back(sample);
}
};
Utils::apply_omp_parallel_for((npar > 1), 0, npar, convert_to_bit_lambda,
npar);

return all_samples;
}

Expand Down
Loading
Loading