From a8ba061cdaa2813fb0dd72031e3855f42a0154d0 Mon Sep 17 00:00:00 2001
From: Jun Doi <doichan@jp.ibm.com>
Date: Mon, 5 Feb 2024 14:00:23 +0900
Subject: [PATCH 1/3] parallelize sampling measure

---
 qiskit_aer/backends/aerbackend.py             |   5 +-
 ...ize_sampling_measure-18bda46a281e48d2.yaml |   9 +
 src/controllers/state_controller.hpp          |  15 +-
 src/framework/bitvector.hpp                   | 207 ++++++++++++++++++
 src/simulators/circuit_executor.hpp           | 101 ++++++---
 .../density_matrix/densitymatrix_executor.hpp |  66 +++---
 .../density_matrix/densitymatrix_state.hpp    |  40 ++--
 .../extended_stabilizer_state.hpp             |  14 +-
 .../matrix_product_state.hpp                  |  24 +-
 src/simulators/multi_state_executor.hpp       |   6 +-
 src/simulators/parallel_state_executor.hpp    |  86 ++++++--
 .../stabilizer/stabilizer_state.hpp           |  15 +-
 src/simulators/state.hpp                      |  11 +-
 .../statevector/statevector_executor.hpp      |  67 +++---
 .../statevector/statevector_state.hpp         |  39 ++--
 src/simulators/tensor_network/tensor_net.hpp  |  14 +-
 .../tensor_network/tensor_net_executor.hpp    |  18 +-
 .../tensor_network/tensor_net_state.hpp       |  20 +-
 18 files changed, 532 insertions(+), 225 deletions(-)
 create mode 100644 releasenotes/notes/parallelize_sampling_measure-18bda46a281e48d2.yaml
 create mode 100644 src/framework/bitvector.hpp
diff --git a/qiskit_aer/backends/aerbackend.py b/qiskit_aer/backends/aerbackend.py
index 36545ffe8c..17b50651fb 100644
--- a/qiskit_aer/backends/aerbackend.py
+++ b/qiskit_aer/backends/aerbackend.py
@@ -345,7 +345,10 @@ def target(self):
         if self._target is not None:
             return self._target
 
-        return convert_to_target(self.configuration(), self.properties(), None, NAME_MAPPING)
+        tgt = convert_to_target(self.configuration(), self.properties(), None, NAME_MAPPING)
+        if self._coupling_map is not None:
+            tgt._coupling_graph = self._coupling_map.graph.copy()
+        return tgt
 
     def clear_options(self):
         """Reset the simulator options to default values."""
diff --git a/releasenotes/notes/parallelize_sampling_measure-18bda46a281e48d2.yaml b/releasenotes/notes/parallelize_sampling_measure-18bda46a281e48d2.yaml
new file mode 100644
index 0000000000..63b66e89da
--- /dev/null
+++ b/releasenotes/notes/parallelize_sampling_measure-18bda46a281e48d2.yaml
@@ -0,0 +1,9 @@
+---
+features:
+  - |
+    Added a ``BitVector`` class that stores classical bits in packed form
+    instead of in ``reg_t``, reducing memory usage and memory bandwidth.
+upgrade:
+  - |
+    Parallelized the previously serial loops in sampling measure to speed up
+    simulations with a large number of shots.
diff --git a/src/controllers/state_controller.hpp b/src/controllers/state_controller.hpp
index 028806e822..5a3d2a81a9 100644
--- a/src/controllers/state_controller.hpp
+++ b/src/controllers/state_controller.hpp
@@ -1458,10 +1458,9 @@ std::vector<std::string> AerState::sample_memory(const reg_t &qubits,
 
   std::vector<std::string> ret;
   ret.reserve(shots);
-  std::vector<reg_t> samples = state_->sample_measure(qubits, shots, rng_);
+  std::vector<BitVector> samples = state_->sample_measure(qubits, shots, rng_);
   for (auto &sample : samples) {
-    ret.push_back(
-        Utils::int2string(Utils::reg2int(sample, 2), 2, qubits.size()));
+    ret.push_back(sample.to_string());
   }
   return ret;
 }
@@ -1472,16 +1471,10 @@ std::unordered_map<uint_t, uint_t> AerState::sample_counts(const reg_t &qubits,
 
   flush_ops();
 
-  std::vector<reg_t> samples = state_->sample_measure(qubits, shots, rng_);
+  std::vector<BitVector> samples = state_->sample_measure(qubits, shots, rng_);
   std::unordered_map<uint_t, uint_t> ret;
   for (const auto &sample : samples) {
-    uint_t sample_u = 0ULL;
-    uint_t mask = 1ULL;
-    for (const auto b : sample) {
-      if (b)
-        sample_u |= mask;
-      mask <<= 1;
-    }
+    uint_t sample_u = sample(0); // only the first 64bits is used
     if (ret.find(sample_u) == ret.end())
       ret[sample_u] = 1ULL;
     else
diff --git a/src/framework/bitvector.hpp b/src/framework/bitvector.hpp
new file mode 100644
index 0000000000..4f78b3e186
--- /dev/null
+++ b/src/framework/bitvector.hpp
@@ -0,0 +1,207 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2024.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _aer_framework_bitvector_hpp_
+#define _aer_framework_bitvector_hpp_
+
+#include "framework/types.hpp"
+
+namespace AER {
+
+//============================================================================
+// Bit vector class
+//============================================================================
+
+// Bit container packing REG_SIZE bits into each element of bits_; bit idx
+// lives in element (idx >> REG_BITS) at position (idx & REG_MASK).
+class BitVector {
+protected:
+  reg_t bits_;      // packed storage words
+  uint_t num_bits_; // number of logical bits
+  const static size_t REG_SIZE = 64;
+  const static size_t REG_BITS = 6;
+  const static size_t REG_MASK = (1ull << REG_BITS) - 1;
+
+public:
+  BitVector() { num_bits_ = 0; }
+  BitVector(uint_t nbits) { allocate(nbits); }
+  BitVector(const BitVector &src) = default;
+
+  // logical bit count / number of storage words
+  uint_t num_bits() const { return num_bits_; }
+  uint_t length() const { return bits_.size(); }
+
+  // Resize to hold n bits, rounding up to whole words so that every index
+  // below n is addressable by get()/set(); at least one word is kept.
+  void allocate(uint_t n) {
+    uint_t size = (n + REG_SIZE - 1) >> REG_BITS;
+    if (size == 0)
+      size = 1;
+    bits_.resize(size, 0ull);
+    num_bits_ = n;
+  }
+
+  BitVector &operator=(const BitVector &src) = default;
+  BitVector &operator=(const std::string &src) {
+    from_string(src);
+    return *this;
+  }
+  BitVector &operator=(const reg_t &src) {
+    from_vector(src);
+    return *this;
+  }
+
+  // resize to map.size() bits and set bit i to src[map[i]]
+  void map(const BitVector &src, const reg_t map);
+
+  // bit access (no bounds checking)
+  inline bool get(const uint_t idx) const {
+    uint_t pos = idx >> REG_BITS;
+    uint_t bit = idx & REG_MASK;
+    return (((bits_[pos] >> bit) & 1ull) == 1ull);
+  }
+  inline bool operator[](const uint_t idx) const { return get(idx); }
+  // direct access to storage word pos
+  inline uint_t &operator()(const uint_t pos) { return bits_[pos]; }
+  inline uint_t operator()(const uint_t pos) const { return bits_[pos]; }
+
+  // clear bit idx, then OR in val
+  void set(const uint_t idx, const bool val) {
+    uint_t pos = idx >> REG_BITS;
+    uint_t bit = idx & REG_MASK;
+    uint_t mask = ~(1ull << bit);
+    bits_[pos] &= mask;
+    bits_[pos] |= (((uint_t)val) << bit);
+  }
+
+  // convert from other data
+  void from_uint(const uint_t nbits, const uint_t src);
+  void from_string(const std::string &src);
+  void from_vector(const reg_t &src);
+  void from_vector_with_map(const reg_t &src, const reg_t &map);
+
+  // convert to other data types
+  std::string to_string();
+  std::string to_hex_string(bool prefix = true);
+  reg_t to_vector();
+};
+
+// Resize to map.size() bits, then set bit i to src[map[i]] for every i.
+inline void BitVector::map(const BitVector &src, const reg_t map) {
+  allocate(map.size());
+  for (uint_t i = 0; i < map.size(); i++) {
+    set(i, src[map[i]]);
+  }
+}
+
+inline void BitVector::from_uint(const uint_t nbits, const uint_t src) {
+  allocate(nbits); // resize to nbits bits (always at least one word)
+  bits_[0] = src;  // src is stored unmasked as the lowest word
+}
+
+// Parse a binary string; the last character is bit 0, non-'1' reads as 0.
+inline void BitVector::from_string(const std::string &src) {
+  allocate(src.size());
+  uint_t pos = 0;
+  for (uint_t i = 0; i < bits_.size(); i++) {
+    uint_t n = REG_SIZE;
+    uint_t val = 0;
+    if (n > num_bits_ - pos)
+      n = num_bits_ - pos;
+    for (uint_t j = 0; j < n; j++) {
+      val |= (((uint_t)(src[num_bits_ - 1 - pos] == '1')) << j);
+      pos++;
+    }
+    bits_[i] = val;
+  }
+}
+
+// Pack the lowest bit of each element of src; src[k] becomes bit k.
+inline void BitVector::from_vector(const reg_t &src) {
+  allocate(src.size());
+  uint_t pos = 0;
+  for (uint_t i = 0; i < bits_.size(); i++) {
+    uint_t n = REG_SIZE;
+    uint_t val = 0;
+    if (n > num_bits_ - pos)
+      n = num_bits_ - pos;
+    for (uint_t j = 0; j < n; j++) {
+      val |= ((src[pos++] & 1ull) << j);
+    }
+    bits_[i] = val;
+  }
+}
+
+// Sets bit k to src[map[k]] & 1 for k < src.size() (map needs that many).
+inline void BitVector::from_vector_with_map(const reg_t &src,
+                                            const reg_t &map) {
+  allocate(src.size());
+  uint_t pos = 0;
+  for (uint_t i = 0; i < bits_.size(); i++) {
+    uint_t n = REG_SIZE;
+    uint_t val = 0;
+    if (n > num_bits_ - pos)
+      n = num_bits_ - pos;
+    for (uint_t j = 0; j < n; j++)
+      val |= ((src[map[pos++]] & 1ull) << j);
+    bits_[i] = val;
+  }
+}
+
+// Render as a binary string of num_bits_ characters, most-significant bit
+// first (bit 0 is the last character).
+inline std::string BitVector::to_string(void) {
+  std::string str;
+  str.reserve(num_bits_);
+  for (uint_t i = 0; i < num_bits_; i++) {
+    str += get(num_bits_ - 1 - i) ? '1' : '0';
+  }
+  return str;
+}
+
+// Render as hexadecimal, most-significant word first. The top word is
+// masked to the bits in use and printed unpadded; each lower word is
+// zero-padded to REG_SIZE / 4 digits. `prefix` prepends "0x".
+inline std::string BitVector::to_hex_string(bool prefix) {
+  std::string hex = (prefix) ? "0x" : "";
+  for (uint_t i = 0; i < bits_.size(); i++) {
+    std::stringstream ss;
+    if (i == 0) {
+      // a top word that is completely in use (n == 0) must not be masked away
+      uint_t n = num_bits_ & (REG_SIZE - 1);
+      uint_t mask = (n == 0 && num_bits_ > 0) ? ~0ull : ((1ull << n) - 1);
+      ss << std::hex << (bits_[bits_.size() - 1] & mask);
+      hex += ss.str();
+    } else {
+      ss << std::hex << bits_[bits_.size() - 1 - i];
+      std::string part = ss.str();
+      part.insert(0, (REG_SIZE / 4) - part.size(), '0');
+      hex += part;
+    }
+  }
+  return hex;
+}
+
+// Expand into one 0/1 element per bit; element k holds bit k.
+inline reg_t BitVector::to_vector(void) {
+  reg_t ret(num_bits_);
+  for (uint_t i = 0; i < num_bits_; i++)
+    ret[i] = (uint_t)get(i);
+  return ret;
+}
+
+//------------------------------------------------------------------------------
+} // end namespace AER
+//------------------------------------------------------------------------------
+#endif // _aer_framework_bitvector_hpp_
diff --git a/src/simulators/circuit_executor.hpp b/src/simulators/circuit_executor.hpp
index e49eef13f2..17fc473b08 100644
--- a/src/simulators/circuit_executor.hpp
+++ b/src/simulators/circuit_executor.hpp
@@ -209,7 +209,7 @@ class Executor : public Base {
   template <typename InputIterator>
   void measure_sampler(InputIterator first_meas, InputIterator last_meas,
                        uint_t shots, state_t &state, ExperimentResult &result,
-                       RngEngine &rng, bool save_creg_to_state = false) const;
+                       RngEngine &rng) const;
 
 #ifdef AER_MPI
   void gather_creg_memory(std::vector<ClassicalRegister> &cregs,
@@ -218,14 +218,14 @@ class Executor : public Base {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                            RngEngine &rng) const {
-    std::vector<reg_t> ret;
+  virtual std::vector<BitVector>
+  sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) const {
+    std::vector<BitVector> ret;
     return ret;
   };
-  virtual std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
-                                            uint_t shots,
-                                            std::vector<RngEngine> &rng) const {
+  virtual std::vector<BitVector>
+  sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
+                 std::vector<RngEngine> &rng) const {
     // this is for single rng, impement in sub-class for multi-shots case
     return state.sample_measure(qubits, shots, rng[0]);
   }
@@ -1024,8 +1024,7 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
                                         InputIterator last_meas, uint_t shots,
                                         state_t &state,
                                         ExperimentResult &result,
-                                        RngEngine &rng,
-                                        bool save_creg_to_state) const {
+                                        RngEngine &rng) const {
   // Check if meas_circ is empty, and if so return initial creg
   if (first_meas == last_meas) {
     while (shots-- > 0) {
@@ -1056,7 +1055,7 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
 
   // Generate the samples
   auto timer_start = myclock_t::now();
-  std::vector<reg_t> all_samples;
+  std::vector<BitVector> all_samples;
   all_samples = state.sample_measure(meas_qubits, shots, rng);
   auto time_taken =
       std::chrono::duration<double>(myclock_t::now() - timer_start).count();
@@ -1086,30 +1085,70 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
       (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
   uint_t num_registers =
       (register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
-  ClassicalRegister creg;
-  for (int_t i = all_samples.size() - 1; i >= 0; i--) {
-    creg.initialize(num_memory, num_registers);
-
-    // process memory bit measurements
-    for (const auto &pair : memory_map) {
-      creg.store_measure(reg_t({all_samples[i][pair.second]}),
-                         reg_t({pair.first}), reg_t());
-    }
-    // process register bit measurements
-    for (const auto &pair : register_map) {
-      creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
-                         reg_t({pair.first}));
-    }
 
-    // process read out errors for memory and registers
-    for (const Operations::Op &roerror : roerror_ops)
-      creg.apply_roerror(roerror, rng);
+  if (roerror_ops.size() > 0) {
+    // can not parallelize for read out error because of rng
+    ClassicalRegister creg;
+    for (uint_t is = 0; is < all_samples.size(); is++) {
+      uint_t i = all_samples.size() - is - 1;
+      creg.initialize(num_memory, num_registers);
+
+      // process memory bit measurements
+      for (const auto &pair : memory_map) {
+        creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
+                           reg_t({pair.first}), reg_t());
+      }
+      // process register bit measurements
+      for (const auto &pair : register_map) {
+        creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
+                           reg_t(), reg_t({pair.first}));
+      }
+
+      // process read out errors for memory and registers
+      for (const Operations::Op &roerror : roerror_ops)
+        creg.apply_roerror(roerror, rng);
 
-    // Save count data
-    if (save_creg_to_state)
-      state.creg() = creg;
-    else
+      // Save count data
       result.save_count_data(creg, save_creg_memory_);
+    }
+  } else {
+    uint_t npar = parallel_state_update_;
+    if (npar > all_samples.size())
+      npar = all_samples.size();
+
+    std::vector<ExperimentResult> par_results(npar);
+    auto copy_samples_lambda = [this, &par_results, num_memory, num_registers,
+                                memory_map, register_map, npar,
+                                &all_samples](int_t ip) {
+      ClassicalRegister creg;
+      uint_t is, ie;
+      is = all_samples.size() * ip / npar;
+      ie = all_samples.size() * (ip + 1) / npar;
+      for (; is < ie; is++) {
+        uint_t i = all_samples.size() - is - 1;
+        creg.initialize(num_memory, num_registers);
+
+        // process memory bit measurements
+        for (const auto &pair : memory_map) {
+          creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
+                             reg_t({pair.first}), reg_t());
+        }
+        // process register bit measurements
+        for (const auto &pair : register_map) {
+          creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
+                             reg_t(), reg_t({pair.first}));
+        }
+
+        // Save count data
+        par_results[ip].save_count_data(creg, save_creg_memory_);
+      }
+    };
+    Utils::apply_omp_parallel_for((npar > 1), 0, npar, copy_samples_lambda,
+                                  npar);
+
+    for (int_t i = 0; i < npar; i++) {
+      result.combine(std::move(par_results[i]));
+    }
   }
 }
 
diff --git a/src/simulators/density_matrix/densitymatrix_executor.hpp b/src/simulators/density_matrix/densitymatrix_executor.hpp
index 96429ed804..041521e242 100644
--- a/src/simulators/density_matrix/densitymatrix_executor.hpp
+++ b/src/simulators/density_matrix/densitymatrix_executor.hpp
@@ -168,8 +168,8 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                    RngEngine &rng) const override;
+  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                        RngEngine &rng) const override;
 
   rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root,
                                      const reg_t &qubits);
@@ -180,9 +180,9 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
                      const reg_t &cmemory, const reg_t &cregister);
 
-  std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
-                                    uint_t shots,
-                                    std::vector<RngEngine> &rng) const override;
+  std::vector<BitVector>
+  sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
+                 std::vector<RngEngine> &rng) const override;
 
   //-----------------------------------------------------------------------
   // Functions for multi-chunk distribution
@@ -1215,15 +1215,14 @@ void Executor<densmat_t>::measure_reset_update(const reg_t &qubits,
 }
 
 template <class densmat_t>
-std::vector<reg_t> Executor<densmat_t>::sample_measure(const reg_t &qubits,
-                                                       uint_t shots,
-                                                       RngEngine &rng) const {
+std::vector<BitVector>
+Executor<densmat_t>::sample_measure(const reg_t &qubits, uint_t shots,
+                                    RngEngine &rng) const {
   // Generate flat register for storing
   std::vector<double> rnds;
   rnds.reserve(shots);
   for (uint_t i = 0; i < shots; ++i)
     rnds.push_back(rng.rand(0, 1));
-  reg_t allbit_samples(shots, 0);
 
   uint_t i, j;
   std::vector<double> chunkSum(Base::states_.size() + 1, 0);
@@ -1322,20 +1321,27 @@ std::vector<reg_t> Executor<densmat_t>::sample_measure(const reg_t &qubits,
 #ifdef AER_MPI
   BasePar::reduce_sum(local_samples);
 #endif
-  allbit_samples = local_samples;
 
-  // Convert to reg_t format
-  std::vector<reg_t> all_samples;
-  all_samples.reserve(shots);
-  for (int_t val : allbit_samples) {
-    reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
-    reg_t sample;
-    sample.reserve(qubits.size());
-    for (uint_t qubit : qubits) {
-      sample.push_back(allbit_sample[qubit]);
+  // Convert to BitVector format
+  int_t npar = Base::parallel_state_update_;
+  if (npar > local_samples.size())
+    npar = local_samples.size();
+  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+
+  auto convert_to_bit_lambda = [this, &local_samples, &all_samples, shots,
+                                qubits, npar](int_t i) {
+    uint_t ishot, iend;
+    ishot = local_samples.size() * i / npar;
+    iend = local_samples.size() * (i + 1) / npar;
+    for (; ishot < iend; ishot++) {
+      BitVector allbit_sample;
+      allbit_sample.from_uint(qubits.size(), local_samples[ishot]);
+      all_samples[ishot].map(allbit_sample, qubits);
     }
-    all_samples.push_back(sample);
-  }
+  };
+  Utils::apply_omp_parallel_for(
+      (npar > 1 && BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
+      npar, convert_to_bit_lambda, npar);
   return all_samples;
 }
 
@@ -1439,7 +1445,7 @@ void Executor<state_t>::apply_measure(CircuitExecutor::Branch &root,
 }
 
 template <class state_t>
-std::vector<reg_t>
+std::vector<BitVector>
 Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
                                   uint_t shots,
                                   std::vector<RngEngine> &rng) const {
@@ -1454,17 +1460,13 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
   auto allbit_samples = state.qreg().sample_measure(rnds);
   state.qreg().enable_batch(flg);
 
-  // Convert to reg_t format
-  std::vector<reg_t> all_samples;
-  all_samples.reserve(shots);
+  // Convert to bit format
+  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  i = 0;
   for (int_t val : allbit_samples) {
-    reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
-    reg_t sample;
-    sample.reserve(qubits.size());
-    for (uint_t qubit : qubits) {
-      sample.push_back(allbit_sample[qubit]);
-    }
-    all_samples.push_back(sample);
+    BitVector allbit_sample;
+    allbit_sample.from_uint(qubits.size(), val);
+    all_samples[i++].map(allbit_sample, qubits);
   }
   return all_samples;
 }
diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp
index 91637166e2..41fd836f1e 100644
--- a/src/simulators/density_matrix/densitymatrix_state.hpp
+++ b/src/simulators/density_matrix/densitymatrix_state.hpp
@@ -130,8 +130,8 @@ class State : public QuantumState::State<densmat_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                    RngEngine &rng) override;
+  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                        RngEngine &rng) override;
 
   // Helper function for computing expectation value
   double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
@@ -987,9 +987,9 @@ void State<densmat_t>::measure_reset_update(const reg_t &qubits,
 }
 
 template <class densmat_t>
-std::vector<reg_t> State<densmat_t>::sample_measure(const reg_t &qubits,
-                                                    uint_t shots,
-                                                    RngEngine &rng) {
+std::vector<BitVector> State<densmat_t>::sample_measure(const reg_t &qubits,
+                                                        uint_t shots,
+                                                        RngEngine &rng) {
   // Generate flat register for storing
   std::vector<double> rnds;
   rnds.reserve(shots);
@@ -999,18 +999,26 @@ std::vector<reg_t> State<densmat_t>::sample_measure(const reg_t &qubits,
 
   allbit_samples = BaseState::qreg_.sample_measure(rnds);
 
-  // Convert to reg_t format
-  std::vector<reg_t> all_samples;
-  all_samples.reserve(shots);
-  for (int_t val : allbit_samples) {
-    reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits());
-    reg_t sample;
-    sample.reserve(qubits.size());
-    for (uint_t qubit : qubits) {
-      sample.push_back(allbit_sample[qubit]);
+  // Convert to bit format
+  int_t npar = BaseState::threads_;
+  if (npar > shots)
+    npar = shots;
+  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+
+  auto convert_to_bit_lambda = [this, &allbit_samples, &all_samples, shots,
+                                qubits, npar](int_t i) {
+    uint_t ishot, iend;
+    ishot = shots * i / npar;
+    iend = shots * (i + 1) / npar;
+    for (; ishot < iend; ishot++) {
+      BitVector allbit_sample;
+      allbit_sample.from_uint(qubits.size(), allbit_samples[ishot]);
+      all_samples[ishot].map(allbit_sample, qubits);
     }
-    all_samples.push_back(sample);
-  }
+  };
+  Utils::apply_omp_parallel_for((npar > 1), 0, npar, convert_to_bit_lambda,
+                                npar);
+
   return all_samples;
 }
 
diff --git a/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp b/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp
index be6a8af609..2c4c00e7e3 100644
--- a/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp
+++ b/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp
@@ -86,8 +86,8 @@ class State : public QuantumState::State<chstate_t> {
 
   void set_config(const Config &config) override;
 
-  std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                    RngEngine &rng) override;
+  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                        RngEngine &rng) override;
 
 protected:
   // Alongside the sample measure optimisaiton, we can parallelise
@@ -415,8 +415,8 @@ void State::apply_ops(InputIterator first, InputIterator last,
   }
 }
 
-std::vector<reg_t> State::sample_measure(const reg_t &qubits, uint_t shots,
-                                         RngEngine &rng) {
+std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
+                                             RngEngine &rng) {
   std::vector<uint_t> output_samples;
   if (BaseState::qreg_.get_num_states() == 1) {
     output_samples = BaseState::qreg_.stabilizer_sampler(shots, rng);
@@ -439,13 +439,13 @@ std::vector<reg_t> State::sample_measure(const reg_t &qubits, uint_t shots,
       }
     }
   }
-  std::vector<reg_t> all_samples;
+  std::vector<BitVector> all_samples;
   all_samples.reserve(shots);
   for (uint_t sample : output_samples) {
-    reg_t sample_bits(qubits.size(), 0ULL);
+    BitVector sample_bits(qubits.size());
     for (size_t i = 0; i < qubits.size(); i++) {
       if ((sample >> qubits[i]) & 1ULL) {
-        sample_bits[i] = 1ULL;
+        sample_bits.set(i, true);
       }
     }
     all_samples.push_back(sample_bits);
diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp
index 6621c0371e..6e41c2f126 100644
--- a/src/simulators/matrix_product_state/matrix_product_state.hpp
+++ b/src/simulators/matrix_product_state/matrix_product_state.hpp
@@ -131,16 +131,16 @@ class State : public QuantumState::State<matrixproductstate_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                            RngEngine &rng) override;
+  virtual std::vector<BitVector>
+  sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override;
 
   // Computes sample_measure by copying the MPS to a temporary structure, and
   // applying a measurement on the temporary MPS. This is done for every shot,
   // so is not efficient for a large number of shots
-  std::vector<reg_t> sample_measure_using_apply_measure(const reg_t &qubits,
-                                                        uint_t shots,
-                                                        RngEngine &rng);
-  std::vector<reg_t> sample_measure_all(uint_t shots, RngEngine &rng);
+  std::vector<BitVector> sample_measure_using_apply_measure(const reg_t &qubits,
+                                                            uint_t shots,
+                                                            RngEngine &rng);
+  std::vector<BitVector> sample_measure_all(uint_t shots, RngEngine &rng);
   //-----------------------------------------------------------------------
   // Additional methods
   //-----------------------------------------------------------------------
@@ -759,8 +759,8 @@ rvector_t State::measure_probs(const reg_t &qubits) const {
   return probvector;
 }
 
-std::vector<reg_t> State::sample_measure(const reg_t &qubits, uint_t shots,
-                                         RngEngine &rng) {
+std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
+                                             RngEngine &rng) {
   // There are two alternative algorithms for sample measure
   // We choose the one that is optimal relative to the total number
   // of qubits,and the number of shots.
@@ -774,10 +774,10 @@ std::vector<reg_t> State::sample_measure(const reg_t &qubits, uint_t shots,
   return sample_measure_using_apply_measure(qubits, shots, rng);
 }
 
-std::vector<reg_t>
+std::vector<BitVector>
 State::sample_measure_using_apply_measure(const reg_t &qubits, uint_t shots,
                                           RngEngine &rng) {
-  std::vector<reg_t> all_samples;
+  std::vector<BitVector> all_samples;
   all_samples.resize(shots);
   std::vector<rvector_t> rnds_list;
   rnds_list.reserve(shots);
@@ -803,8 +803,8 @@ State::sample_measure_using_apply_measure(const reg_t &qubits, uint_t shots,
   return all_samples;
 }
 
-std::vector<reg_t> State::sample_measure_all(uint_t shots, RngEngine &rng) {
-  std::vector<reg_t> all_samples;
+std::vector<BitVector> State::sample_measure_all(uint_t shots, RngEngine &rng) {
+  std::vector<BitVector> all_samples;
   all_samples.resize(shots);
 
 #pragma omp parallel for if (getenv("PRL_PROB_MEAS"))
diff --git a/src/simulators/multi_state_executor.hpp b/src/simulators/multi_state_executor.hpp
index a420e9e9d3..903e9b4bdf 100644
--- a/src/simulators/multi_state_executor.hpp
+++ b/src/simulators/multi_state_executor.hpp
@@ -827,7 +827,7 @@ void MultiStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
                     meas_qubits.end());
 
   // Generate the samples
-  std::vector<reg_t> all_samples;
+  std::vector<BitVector> all_samples;
   all_samples = this->sample_measure(state, meas_qubits, shots, rng);
 
   // Make qubit map of position in vector of measured qubits
@@ -855,12 +855,12 @@ void MultiStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
 
     // process memory bit measurements
     for (const auto &pair : memory_map) {
-      creg.store_measure(reg_t({all_samples[i][pair.second]}),
+      creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
                          reg_t({pair.first}), reg_t());
     }
     // process register bit measurements
     for (const auto &pair : register_map) {
-      creg.store_measure(reg_t({all_samples[i][pair.second]}), reg_t(),
+      creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}), reg_t(),
                          reg_t({pair.first}));
     }
 
diff --git a/src/simulators/parallel_state_executor.hpp b/src/simulators/parallel_state_executor.hpp
index 7cb26bc735..cbae13f521 100644
--- a/src/simulators/parallel_state_executor.hpp
+++ b/src/simulators/parallel_state_executor.hpp
@@ -637,32 +637,74 @@ void ParallelStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
       (memory_map.empty()) ? 0ULL : 1 + memory_map.rbegin()->first;
   uint_t num_registers =
       (register_map.empty()) ? 0ULL : 1 + register_map.rbegin()->first;
-  ClassicalRegister creg;
-  while (!all_samples.empty()) {
-    auto sample = all_samples.back();
-    creg.initialize(num_memory, num_registers);
-
-    // process memory bit measurements
-    for (const auto &pair : memory_map) {
-      creg.store_measure(reg_t({sample[pair.second]}), reg_t({pair.first}),
-                         reg_t());
-    }
-    // process register bit measurements
-    for (const auto &pair : register_map) {
-      creg.store_measure(reg_t({sample[pair.second]}), reg_t(),
-                         reg_t({pair.first}));
-    }
 
-    // process read out errors for memory and registers
-    for (const Operations::Op &roerror : roerror_ops) {
-      creg.apply_roerror(roerror, rng);
+  if (roerror_ops.size() > 0) {
+    // can not parallelize for read out error because of rng
+    ClassicalRegister creg;
+    while (!all_samples.empty()) {
+      auto sample = all_samples.back();
+      creg.initialize(num_memory, num_registers);
+
+      // process memory bit measurements
+      for (const auto &pair : memory_map) {
+        creg.store_measure(reg_t({sample[pair.second]}), reg_t({pair.first}),
+                           reg_t());
+      }
+      // process register bit measurements
+      for (const auto &pair : register_map) {
+        creg.store_measure(reg_t({sample[pair.second]}), reg_t(),
+                           reg_t({pair.first}));
+      }
+
+      // process read out errors for memory and registers
+      for (const Operations::Op &roerror : roerror_ops) {
+        creg.apply_roerror(roerror, rng);
+      }
+
+      // Save count data
+      result.save_count_data(creg, Base::save_creg_memory_);
+
+      // pop off processed sample
+      all_samples.pop_back();
     }
+  } else {
+    uint_t npar = Base::parallel_state_update_;
+    if (npar > all_samples.size())
+      npar = all_samples.size();
+
+    std::vector<ExperimentResult> par_results(npar);
+    auto copy_samples_lambda = [this, &par_results, num_memory, num_registers,
+                                memory_map, register_map, npar,
+                                &all_samples](int_t ip) {
+      ClassicalRegister creg;
+      uint_t is, ie;
+      is = all_samples.size() * ip / npar;
+      ie = all_samples.size() * (ip + 1) / npar;
+      for (; is < ie; is++) {
+        uint_t i = all_samples.size() - is - 1;
+        creg.initialize(num_memory, num_registers);
+
+        // process memory bit measurements
+        for (const auto &pair : memory_map) {
+          creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
+                             reg_t({pair.first}), reg_t());
+        }
+        // process register bit measurements
+        for (const auto &pair : register_map) {
+          creg.store_measure(reg_t({(uint_t)all_samples[i][pair.second]}),
+                             reg_t(), reg_t({pair.first}));
+        }
 
-    // Save count data
-    result.save_count_data(creg, Base::save_creg_memory_);
+        // Save count data
+        par_results[ip].save_count_data(creg, Base::save_creg_memory_);
+      }
+    };
+    Utils::apply_omp_parallel_for((npar > 1), 0, npar, copy_samples_lambda,
+                                  npar);
 
-    // pop off processed sample
-    all_samples.pop_back();
+    for (int_t i = 0; i < npar; i++) {
+      result.combine(std::move(par_results[i]));
+    }
   }
 }
 
diff --git a/src/simulators/stabilizer/stabilizer_state.hpp b/src/simulators/stabilizer/stabilizer_state.hpp
index 1a2df3410e..a51359faa6 100644
--- a/src/simulators/stabilizer/stabilizer_state.hpp
+++ b/src/simulators/stabilizer/stabilizer_state.hpp
@@ -101,8 +101,8 @@ class State : public QuantumState::State<Clifford::Clifford> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                            RngEngine &rng) override;
+  virtual std::vector<BitVector>
+  sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override;
 
   bool
   validate_parameters(const std::vector<Operations::Op> &ops) const override;
@@ -512,15 +512,14 @@ reg_t State::apply_measure_and_update(const reg_t &qubits, RngEngine &rng) {
   return outcome;
 }
 
-std::vector<reg_t> State::sample_measure(const reg_t &qubits, uint_t shots,
-                                         RngEngine &rng) {
+std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
+                                             RngEngine &rng) {
   // TODO: see if we can improve efficiency by directly sampling from Clifford
   // table
   auto qreg_cache = BaseState::qreg_;
-  std::vector<reg_t> samples;
-  samples.reserve(shots);
-  while (shots-- > 0) { // loop over shots
-    samples.push_back(apply_measure_and_update(qubits, rng));
+  std::vector<BitVector> samples(shots);
+  for (int_t ishot = 0; ishot < shots; ishot++) {
+    samples[ishot] = apply_measure_and_update(qubits, rng);
     BaseState::qreg_ = qreg_cache; // restore pre-measurement data from cache
   }
   return samples;
diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp
index ee5613328a..3b6790f270 100644
--- a/src/simulators/state.hpp
+++ b/src/simulators/state.hpp
@@ -15,6 +15,7 @@
 #ifndef _aer_base_state_hpp_
 #define _aer_base_state_hpp_
 
+#include "framework/bitvector.hpp"
 #include "framework/config.hpp"
 #include "framework/creg.hpp"
 #include "framework/json.hpp"
@@ -194,8 +195,8 @@ class Base {
   // to the system state. Even though this method is not marked as const
   // at the end of sample the system should be left in the same state
   // as before sampling
-  virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                            RngEngine &rng);
+  virtual std::vector<BitVector> sample_measure(const reg_t &qubits,
+                                                uint_t shots, RngEngine &rng);
 
   //-----------------------------------------------------------------------
   // Config Settings
@@ -283,11 +284,11 @@ void Base::set_config(const Config &config) {
   }
 }
 
-std::vector<reg_t> Base::sample_measure(const reg_t &qubits, uint_t shots,
-                                        RngEngine &rng) {
+std::vector<BitVector> Base::sample_measure(const reg_t &qubits, uint_t shots,
+                                            RngEngine &rng) {
   (ignore_argument) qubits;
   (ignore_argument) shots;
-  return std::vector<reg_t>();
+  return std::vector<BitVector>();
 }
 
 void Base::apply_ops(const OpItr first, const OpItr last,
diff --git a/src/simulators/statevector/statevector_executor.hpp b/src/simulators/statevector/statevector_executor.hpp
index 5301035660..a000ee5310 100644
--- a/src/simulators/statevector/statevector_executor.hpp
+++ b/src/simulators/statevector/statevector_executor.hpp
@@ -194,17 +194,17 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
                      const reg_t &cmemory, const reg_t &cregister);
 
-  std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
-                                    uint_t shots,
-                                    std::vector<RngEngine> &rng) const override;
+  std::vector<BitVector>
+  sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
+                 std::vector<RngEngine> &rng) const override;
 
   // Return the reduced density matrix for the simulator
   cmatrix_t density_matrix(const reg_t &qubits);
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                    RngEngine &rng) const override;
+  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                        RngEngine &rng) const override;
 };
 
 template <class state_t>
@@ -1145,14 +1145,13 @@ void Executor<state_t>::measure_reset_update(const std::vector<uint_t> &qubits,
 }
 
 template <class state_t>
-std::vector<reg_t> Executor<state_t>::sample_measure(const reg_t &qubits,
-                                                     uint_t shots,
-                                                     RngEngine &rng) const {
+std::vector<BitVector> Executor<state_t>::sample_measure(const reg_t &qubits,
+                                                         uint_t shots,
+                                                         RngEngine &rng) const {
   uint_t i, j;
   // Generate flat register for storing
   std::vector<double> rnds;
   rnds.reserve(shots);
-  reg_t allbit_samples(shots, 0);
 
   for (i = 0; i < shots; ++i)
     rnds.push_back(rng.rand(0, 1));
@@ -1240,21 +1239,27 @@ std::vector<reg_t> Executor<state_t>::sample_measure(const reg_t &qubits,
 #ifdef AER_MPI
   BasePar::reduce_sum(local_samples);
 #endif
-  allbit_samples = local_samples;
 
-  // Convert to reg_t format
-  std::vector<reg_t> all_samples;
-  all_samples.reserve(shots);
-  for (int_t val : allbit_samples) {
-    reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
-    reg_t sample;
-    sample.reserve(qubits.size());
-    for (uint_t qubit : qubits) {
-      sample.push_back(allbit_sample[qubit]);
+  // Convert to BitVector format
+  int_t npar = Base::parallel_state_update_;
+  if (npar > local_samples.size())
+    npar = local_samples.size();
+  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+
+  auto convert_to_bit_lambda = [this, &local_samples, &all_samples, shots,
+                                qubits, npar](int_t i) {
+    uint_t ishot, iend;
+    ishot = local_samples.size() * i / npar;
+    iend = local_samples.size() * (i + 1) / npar;
+    for (; ishot < iend; ishot++) {
+      BitVector allbit_sample;
+      allbit_sample.from_uint(qubits.size(), local_samples[ishot]);
+      all_samples[ishot].map(allbit_sample, qubits);
     }
-    all_samples.push_back(sample);
-  }
-
+  };
+  Utils::apply_omp_parallel_for(
+      (npar > 1 && BasePar::chunk_omp_parallel_ && Base::num_groups_ > 1), 0,
+      npar, convert_to_bit_lambda, npar);
   return all_samples;
 }
 
@@ -1892,7 +1897,7 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
 }
 
 template <class state_t>
-std::vector<reg_t>
+std::vector<BitVector>
 Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
                                   uint_t shots,
                                   std::vector<RngEngine> &rng) const {
@@ -1907,17 +1912,13 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
   auto allbit_samples = state.qreg().sample_measure(rnds);
   state.qreg().enable_batch(flg);
 
-  // Convert to reg_t format
-  std::vector<reg_t> all_samples;
-  all_samples.reserve(shots);
+  // Convert to bit format
+  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  i = 0;
   for (int_t val : allbit_samples) {
-    reg_t allbit_sample = Utils::int2reg(val, 2, Base::num_qubits_);
-    reg_t sample;
-    sample.reserve(qubits.size());
-    for (uint_t qubit : qubits) {
-      sample.push_back(allbit_sample[qubit]);
-    }
-    all_samples.push_back(sample);
+    BitVector allbit_sample;
+    allbit_sample.from_uint(qubits.size(), val);
+    all_samples[i++].map(allbit_sample, qubits);
   }
   return all_samples;
 }
diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp
index 8408290b3d..160db21ce5 100755
--- a/src/simulators/statevector/statevector_state.hpp
+++ b/src/simulators/statevector/statevector_state.hpp
@@ -153,8 +153,8 @@ class State : public QuantumState::State<statevec_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                            RngEngine &rng) override;
+  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                        RngEngine &rng) override;
 
   // Helper function for computing expectation value
   virtual double expval_pauli(const reg_t &qubits,
@@ -1020,9 +1020,9 @@ void State<statevec_t>::measure_reset_update(const std::vector<uint_t> &qubits,
 }
 
 template <class statevec_t>
-std::vector<reg_t> State<statevec_t>::sample_measure(const reg_t &qubits,
-                                                     uint_t shots,
-                                                     RngEngine &rng) {
+std::vector<BitVector> State<statevec_t>::sample_measure(const reg_t &qubits,
+                                                         uint_t shots,
+                                                         RngEngine &rng) {
   uint_t i;
   // Generate flat register for storing
   std::vector<double> rnds;
@@ -1034,18 +1034,25 @@ std::vector<reg_t> State<statevec_t>::sample_measure(const reg_t &qubits,
 
   allbit_samples = BaseState::qreg_.sample_measure(rnds);
 
-  // Convert to reg_t format
-  std::vector<reg_t> all_samples;
-  all_samples.reserve(shots);
-  for (int_t val : allbit_samples) {
-    reg_t allbit_sample = Utils::int2reg(val, 2, BaseState::qreg_.num_qubits());
-    reg_t sample;
-    sample.reserve(qubits.size());
-    for (uint_t qubit : qubits) {
-      sample.push_back(allbit_sample[qubit]);
+  // Convert to BitVector format
+  int_t npar = BaseState::threads_;
+  if (npar > shots)
+    npar = shots;
+  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+
+  auto convert_to_bit_lambda = [this, &allbit_samples, &all_samples, shots,
+                                qubits, npar](int_t i) {
+    uint_t ishot, iend;
+    ishot = shots * i / npar;
+    iend = shots * (i + 1) / npar;
+    for (; ishot < iend; ishot++) {
+      BitVector allbit_sample;
+      allbit_sample.from_uint(qubits.size(), allbit_samples[ishot]);
+      all_samples[ishot].map(allbit_sample, qubits);
     }
-    all_samples.push_back(sample);
-  }
+  };
+  Utils::apply_omp_parallel_for((npar > 1), 0, npar, convert_to_bit_lambda,
+                                npar);
 
   return all_samples;
 }
diff --git a/src/simulators/tensor_network/tensor_net.hpp b/src/simulators/tensor_network/tensor_net.hpp
index 32b7d52c0e..7effeb2145 100644
--- a/src/simulators/tensor_network/tensor_net.hpp
+++ b/src/simulators/tensor_network/tensor_net.hpp
@@ -248,7 +248,7 @@ class TensorNet {
   // Return M sampled outcomes for Z-basis measurement of all qubits
   // The input is a length M list of random reals between [0, 1) used for
   // generating samples.
-  std::vector<reg_t> sample_measure(const std::vector<double> &rnds) const;
+  std::vector<BitVector> sample_measure(const std::vector<double> &rnds) const;
 
   void apply_reset(const reg_t &qubits);
 
@@ -320,7 +320,7 @@ class TensorNet {
 
   void buffer_statevector(void) const;
 
-  void sample_measure_branch(std::vector<reg_t> &samples,
+  void sample_measure_branch(std::vector<BitVector> &samples,
                              const std::vector<double> &rnds,
                              const reg_t &input_sample_index,
                              const reg_t &input_shot_index,
@@ -1175,10 +1175,10 @@ void TensorNet<data_t>::apply_reset(const reg_t &qubits) {
 // Sample measure outcomes
 //------------------------------------------------------------------------------
 template <typename data_t>
-std::vector<reg_t>
+std::vector<BitVector>
 TensorNet<data_t>::sample_measure(const std::vector<double> &rnds) const {
   const int_t SHOTS = rnds.size();
-  std::vector<reg_t> samples(SHOTS);
+  std::vector<BitVector> samples(SHOTS);
   reg_t sample_index(SHOTS);
   reg_t shot_index(SHOTS);
   reg_t probs(num_qubits_, 0);
@@ -1193,7 +1193,7 @@ TensorNet<data_t>::sample_measure(const std::vector<double> &rnds) const {
 }
 
 template <typename data_t>
-void TensorNet<data_t>::sample_measure_branch(std::vector<reg_t> &samples,
+void TensorNet<data_t>::sample_measure_branch(std::vector<BitVector> &samples,
                                               const std::vector<double> &rnds,
                                               const reg_t &input_sample_index,
                                               const reg_t &input_shot_index,
@@ -1350,9 +1350,9 @@ void TensorNet<data_t>::sample_measure_branch(std::vector<reg_t> &samples,
           sample[pos_measured + i] = ((ib >> i) & 1);
         for (uint_t i = 0; i < shots[ib].size(); i++) {
           uint_t shot_id = shot_index[ib][i];
-          samples[shot_id] = sample;
+          samples[shot_id].from_vector(sample);
           for (uint_t j = 0; j < nqubits; j++) {
-            samples[shot_id][j] = ((sample_index[ib][i] >> j) & 1);
+            samples[shot_id].set(j, ((sample_index[ib][i] >> j) & 1) != 0);
           }
         }
       }
diff --git a/src/simulators/tensor_network/tensor_net_executor.hpp b/src/simulators/tensor_network/tensor_net_executor.hpp
index 53d24faf96..32a040b1d4 100644
--- a/src/simulators/tensor_network/tensor_net_executor.hpp
+++ b/src/simulators/tensor_network/tensor_net_executor.hpp
@@ -67,9 +67,9 @@ class Executor : public CircuitExecutor::MultiStateExecutor<state_t> {
   void apply_kraus(CircuitExecutor::Branch &root, const reg_t &qubits,
                    const std::vector<cmatrix_t> &kmats);
 
-  std::vector<reg_t> sample_measure(state_t &state, const reg_t &qubits,
-                                    uint_t shots,
-                                    std::vector<RngEngine> &rng) const override;
+  std::vector<BitVector>
+  sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
+                 std::vector<RngEngine> &rng) const override;
 
   // Helper functions for shot-branching
   void apply_save_density_matrix(CircuitExecutor::Branch &root,
@@ -529,7 +529,7 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
 }
 
 template <class state_t>
-std::vector<reg_t>
+std::vector<BitVector>
 Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
                                   uint_t shots,
                                   std::vector<RngEngine> &rng) const {
@@ -540,21 +540,19 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
   for (i = 0; i < (int_t)shots; ++i)
     rnds.push_back(rng[i].rand(0, 1));
 
-  std::vector<reg_t> samples = state.qreg().sample_measure(rnds);
-  std::vector<reg_t> ret(shots);
+  std::vector<BitVector> samples = state.qreg().sample_measure(rnds);
+  std::vector<BitVector> ret(shots, BitVector(qubits.size()));
 
   if (omp_get_num_threads() > 1) {
     for (i = 0; i < (int_t)shots; ++i) {
-      ret[i].resize(qubits.size());
       for (j = 0; j < (int_t)qubits.size(); j++)
-        ret[i][j] = samples[i][qubits[j]];
+        ret[i].set(j, samples[i][qubits[j]]);
     }
   } else {
 #pragma omp parallel for private(j)
     for (i = 0; i < (int_t)shots; ++i) {
-      ret[i].resize(qubits.size());
       for (j = 0; j < (int_t)qubits.size(); j++)
-        ret[i][j] = samples[i][qubits[j]];
+        ret[i].set(j, samples[i][qubits[j]]);
     }
   }
   return ret;
diff --git a/src/simulators/tensor_network/tensor_net_state.hpp b/src/simulators/tensor_network/tensor_net_state.hpp
index ef0bbf3a10..4d50230298 100644
--- a/src/simulators/tensor_network/tensor_net_state.hpp
+++ b/src/simulators/tensor_network/tensor_net_state.hpp
@@ -139,8 +139,8 @@ class State : public QuantumState::State<tensor_net_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<reg_t> sample_measure(const reg_t &qubits, uint_t shots,
-                                            RngEngine &rng) override;
+  virtual std::vector<BitVector>
+  sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override;
 
   // Load the threshold for applying OpenMP parallelization
   // if the controller/engine allows threads for it
@@ -896,30 +896,28 @@ void State<tensor_net_t>::measure_reset_update(
 }
 
 template <class tensor_net_t>
-std::vector<reg_t> State<tensor_net_t>::sample_measure(const reg_t &qubits,
-                                                       uint_t shots,
-                                                       RngEngine &rng) {
+std::vector<BitVector> State<tensor_net_t>::sample_measure(const reg_t &qubits,
+                                                           uint_t shots,
+                                                           RngEngine &rng) {
   // Generate flat register for storing
   std::vector<double> rnds(shots);
 
   for (uint_t i = 0; i < shots; ++i)
     rnds[i] = rng.rand(0, 1);
 
-  std::vector<reg_t> samples = BaseState::qreg_.sample_measure(rnds);
-  std::vector<reg_t> ret(shots);
+  std::vector<BitVector> samples = BaseState::qreg_.sample_measure(rnds);
+  std::vector<BitVector> ret(shots, BitVector(qubits.size()));
 
   if (omp_get_num_threads() > 1) {
     for (uint_t i = 0; i < shots; ++i) {
-      ret[i].resize(qubits.size());
       for (uint_t j = 0; j < qubits.size(); j++)
-        ret[i][j] = samples[i][qubits[j]];
+        ret[i].set(j, samples[i][qubits[j]]);
     }
   } else {
 #pragma omp parallel for
     for (int_t i = 0; i < (int_t)shots; ++i) {
-      ret[i].resize(qubits.size());
       for (uint_t j = 0; j < qubits.size(); j++)
-        ret[i][j] = samples[i][qubits[j]];
+        ret[i].set(j, samples[i][qubits[j]]);
     }
   }
   return ret;

From b02635c5191370d8b8d399270ac13b505dcd793c Mon Sep 17 00:00:00 2001
From: Jun Doi <doichan@jp.ibm.com>
Date: Fri, 1 Mar 2024 17:14:11 +0900
Subject: [PATCH 2/3] replace BitVector with SampleVector, a special class for
 sampling measure

---
 src/controllers/state_controller.hpp          |   4 +-
 src/framework/bitvector.hpp                   | 207 -----------------
 src/simulators/circuit_executor.hpp           |   8 +-
 .../density_matrix/densitymatrix_executor.hpp |  24 +-
 .../density_matrix/densitymatrix_state.hpp    |  16 +-
 .../extended_stabilizer_state.hpp             |  12 +-
 .../matrix_product_state.hpp                  |  23 +-
 src/simulators/multi_state_executor.hpp       |   2 +-
 src/simulators/sample_vector.hpp              | 219 ++++++++++++++++++
 .../stabilizer/stabilizer_state.hpp           |   8 +-
 src/simulators/state.hpp                      |  13 +-
 .../statevector/statevector_executor.hpp      |  28 +--
 .../statevector/statevector_state.hpp         |  18 +-
 src/simulators/tensor_network/tensor_net.hpp  |  19 +-
 .../tensor_network/tensor_net_executor.hpp    |   8 +-
 .../tensor_network/tensor_net_state.hpp       |  12 +-
 16 files changed, 317 insertions(+), 304 deletions(-)
 delete mode 100644 src/framework/bitvector.hpp
 create mode 100644 src/simulators/sample_vector.hpp

diff --git a/src/controllers/state_controller.hpp b/src/controllers/state_controller.hpp
index 5a3d2a81a9..0335f4c493 100644
--- a/src/controllers/state_controller.hpp
+++ b/src/controllers/state_controller.hpp
@@ -1458,7 +1458,7 @@ std::vector<std::string> AerState::sample_memory(const reg_t &qubits,
 
   std::vector<std::string> ret;
   ret.reserve(shots);
-  std::vector<BitVector> samples = state_->sample_measure(qubits, shots, rng_);
+  std::vector<SampleVector> samples = state_->sample_measure(qubits, shots, rng_);
   for (auto &sample : samples) {
     ret.push_back(sample.to_string());
   }
@@ -1471,7 +1471,7 @@ std::unordered_map<uint_t, uint_t> AerState::sample_counts(const reg_t &qubits,
 
   flush_ops();
 
-  std::vector<BitVector> samples = state_->sample_measure(qubits, shots, rng_);
+  std::vector<SampleVector> samples = state_->sample_measure(qubits, shots, rng_);
   std::unordered_map<uint_t, uint_t> ret;
   for (const auto &sample : samples) {
     uint_t sample_u = sample(0); // only the first 64bits is used
diff --git a/src/framework/bitvector.hpp b/src/framework/bitvector.hpp
deleted file mode 100644
index 4f78b3e186..0000000000
--- a/src/framework/bitvector.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/**
- * This code is part of Qiskit.
- *
- * (C) Copyright IBM 2018, 2019, 2024.
- *
- * This code is licensed under the Apache License, Version 2.0. You may
- * obtain a copy of this license in the LICENSE.txt file in the root directory
- * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
- *
- * Any modifications or derivative works of this code must retain this
- * copyright notice, and modified files need to carry a notice indicating
- * that they have been altered from the originals.
- */
-
-#ifndef _aer_framework_bitvector_hpp_
-#define _aer_framework_bitvector_hpp_
-
-#include "framework/types.hpp"
-
-namespace AER {
-
-//============================================================================
-// Bit vestor class
-//============================================================================
-
-class BitVector {
-protected:
-  reg_t bits_;
-  uint_t num_bits_;
-  const static size_t REG_SIZE = 64;
-  const static size_t REG_BITS = 6;
-  const static size_t REG_MASK = (1ull << REG_BITS) - 1;
-
-public:
-  BitVector() { num_bits_ = 0; }
-  BitVector(uint_t nbits) { allocate(nbits); }
-  BitVector(const BitVector &src) {
-    bits_ = src.bits_;
-    num_bits_ = src.num_bits_;
-  }
-
-  uint_t num_bits() { return num_bits_; }
-  uint_t length() { return bits_.size(); }
-
-  void allocate(uint_t n) {
-    uint_t size = n >> REG_BITS;
-    if (size == 0)
-      size = 1;
-    bits_.resize(size, 0ull);
-    num_bits_ = n;
-  }
-
-  BitVector &operator=(const BitVector &src) {
-    bits_ = src.bits_;
-    num_bits_ = src.num_bits_;
-    return *this;
-  }
-  BitVector &operator=(const std::string &src) {
-    from_string(src);
-    return *this;
-  }
-  BitVector &operator=(const reg_t &src) {
-    from_vector(src);
-    return *this;
-  }
-
-  // copy with swap
-  void map(const BitVector &src, const reg_t map);
-
-  // bit access
-  inline bool get(const uint_t idx) const {
-    uint_t pos = idx >> REG_BITS;
-    uint_t bit = idx & REG_MASK;
-    return (((bits_[pos] >> bit) & 1ull) == 1ull);
-  }
-  inline bool operator[](const uint_t idx) const { return get(idx); }
-  inline uint_t &operator()(const uint_t pos) { return bits_[pos]; }
-  inline uint_t operator()(const uint_t pos) const { return bits_[pos]; }
-
-  void set(const uint_t idx, const bool val) {
-    uint_t pos = idx >> REG_BITS;
-    uint_t bit = idx & REG_MASK;
-    uint_t mask = ~(1ull << bit);
-    bits_[pos] &= mask;
-    bits_[pos] |= (((uint_t)val) << bit);
-  }
-
-  // convert from other data
-  void from_uint(const uint_t nbits, const uint_t src);
-  void from_string(const std::string &src);
-  void from_vector(const reg_t &src);
-  void from_vector_with_map(const reg_t &src, const reg_t &map);
-
-  // convert to other data types
-  std::string to_string();
-  std::string to_hex_string(bool prefix = true);
-  reg_t to_vector();
-};
-
-void BitVector::map(const BitVector &src, const reg_t map) {
-  allocate(map.size());
-
-  for (uint_t i = 0; i < map.size(); i++) {
-    set(i, src[map[i]]);
-  }
-}
-
-void BitVector::from_uint(const uint_t nbits, const uint_t src) {
-  allocate(nbits);
-  bits_[0] = src;
-}
-
-void BitVector::from_string(const std::string &src) {
-  allocate(src.size());
-
-  uint_t pos = 0;
-  for (uint_t i = 0; i < bits_.size(); i++) {
-    uint_t n = REG_SIZE;
-    uint_t val = 0;
-    if (n > num_bits_ - pos)
-      n = num_bits_ - pos;
-    for (uint_t j = 0; j < n; j++) {
-      val |= (((uint_t)(src[num_bits_ - 1 - pos] == '1')) << j);
-      pos++;
-    }
-    bits_[i] = val;
-  }
-}
-
-void BitVector::from_vector(const reg_t &src) {
-  allocate(src.size());
-
-  uint_t pos = 0;
-  for (uint_t i = 0; i < bits_.size(); i++) {
-    uint_t n = REG_SIZE;
-    uint_t val = 0;
-    if (n > num_bits_ - pos)
-      n = num_bits_ - pos;
-    for (uint_t j = 0; j < n; j++) {
-      val |= ((src[pos++] & 1ull) << j);
-    }
-    bits_[i] = val;
-  }
-}
-
-void BitVector::from_vector_with_map(const reg_t &src, const reg_t &map) {
-  allocate(src.size());
-
-  uint_t pos = 0;
-  for (uint_t i = 0; i < bits_.size(); i++) {
-    uint_t n = REG_SIZE;
-    uint_t val = 0;
-    if (n > num_bits_ - pos)
-      n = num_bits_ - pos;
-    for (uint_t j = 0; j < n; j++) {
-      val |= ((src[map[pos++]] & 1ull) << j);
-    }
-    bits_[i] = val;
-  }
-}
-
-std::string BitVector::to_string(void) {
-  std::string str;
-  for (uint_t i = 0; i < num_bits_; i++) {
-    if (get(num_bits_ - 1 - i))
-      str += '1';
-    else
-      str += '0';
-  }
-  return str;
-}
-
-std::string BitVector::to_hex_string(bool prefix) {
-  // initialize output string
-  std::string hex = (prefix) ? "0x" : "";
-
-  for (uint_t i = 0; i < bits_.size(); i++) {
-    if (i == 0) {
-      uint_t n = num_bits_ & (REG_SIZE - 1);
-      uint_t val = bits_[bits_.size() - 1] & ((1ull << n) - 1);
-
-      std::stringstream ss;
-      ss << std::hex << val;
-      hex += ss.str();
-    } else {
-      std::stringstream ss;
-      ss << std::hex << bits_[bits_.size() - 1 - i];
-      std::string part = ss.str();
-      part.insert(0, (REG_SIZE / 4) - part.size(), '0');
-      hex += part;
-    }
-  }
-  return hex;
-}
-
-reg_t BitVector::to_vector(void) {
-  reg_t ret(num_bits_);
-  for (uint_t i = 0; i < num_bits_; i++) {
-    ret[i] = (uint_t)get(i);
-  }
-  return ret;
-}
-
-//------------------------------------------------------------------------------
-} // end namespace AER
-//------------------------------------------------------------------------------
-#endif // _aer_framework_bitvector_hpp_
diff --git a/src/simulators/circuit_executor.hpp b/src/simulators/circuit_executor.hpp
index 680686490a..eb0ae1886f 100644
--- a/src/simulators/circuit_executor.hpp
+++ b/src/simulators/circuit_executor.hpp
@@ -219,12 +219,12 @@ class Executor : public Base {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<BitVector>
+  virtual std::vector<SampleVector>
   sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) const {
-    std::vector<BitVector> ret;
+    std::vector<SampleVector> ret;
     return ret;
   };
-  virtual std::vector<BitVector>
+  virtual std::vector<SampleVector>
   sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
                  std::vector<RngEngine> &rng) const {
     // this is for single rng, impement in sub-class for multi-shots case
@@ -1064,7 +1064,7 @@ void Executor<state_t>::measure_sampler(InputIterator first_meas,
 
   // Generate the samples
   auto timer_start = myclock_t::now();
-  std::vector<BitVector> all_samples;
+  std::vector<SampleVector> all_samples;
   all_samples = state.sample_measure(meas_qubits, shots, rng);
   auto time_taken =
       std::chrono::duration<double>(myclock_t::now() - timer_start).count();
diff --git a/src/simulators/density_matrix/densitymatrix_executor.hpp b/src/simulators/density_matrix/densitymatrix_executor.hpp
index 041521e242..b6bdb67146 100644
--- a/src/simulators/density_matrix/densitymatrix_executor.hpp
+++ b/src/simulators/density_matrix/densitymatrix_executor.hpp
@@ -168,8 +168,8 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
-                                        RngEngine &rng) const override;
+  std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                           RngEngine &rng) const override;
 
   rvector_t sample_measure_with_prob(CircuitExecutor::Branch &root,
                                      const reg_t &qubits);
@@ -180,7 +180,7 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
                      const reg_t &cmemory, const reg_t &cregister);
 
-  std::vector<BitVector>
+  std::vector<SampleVector>
   sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
                  std::vector<RngEngine> &rng) const override;
 
@@ -1215,7 +1215,7 @@ void Executor<densmat_t>::measure_reset_update(const reg_t &qubits,
 }
 
 template <class densmat_t>
-std::vector<BitVector>
+std::vector<SampleVector>
 Executor<densmat_t>::sample_measure(const reg_t &qubits, uint_t shots,
                                     RngEngine &rng) const {
   // Generate flat register for storing
@@ -1322,11 +1322,11 @@ Executor<densmat_t>::sample_measure(const reg_t &qubits, uint_t shots,
   BasePar::reduce_sum(local_samples);
 #endif
 
-  // Convert to BitVector format
+  // Convert to SampleVector format
   int_t npar = Base::parallel_state_update_;
   if (npar > local_samples.size())
     npar = local_samples.size();
-  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
 
   auto convert_to_bit_lambda = [this, &local_samples, &all_samples, shots,
                                 qubits, npar](int_t i) {
@@ -1334,8 +1334,8 @@ Executor<densmat_t>::sample_measure(const reg_t &qubits, uint_t shots,
     ishot = local_samples.size() * i / npar;
     iend = local_samples.size() * (i + 1) / npar;
     for (; ishot < iend; ishot++) {
-      BitVector allbit_sample;
-      allbit_sample.from_uint(qubits.size(), local_samples[ishot]);
+      SampleVector allbit_sample;
+      allbit_sample.from_uint(local_samples[ishot], qubits.size());
       all_samples[ishot].map(allbit_sample, qubits);
     }
   };
@@ -1445,7 +1445,7 @@ void Executor<state_t>::apply_measure(CircuitExecutor::Branch &root,
 }
 
 template <class state_t>
-std::vector<BitVector>
+std::vector<SampleVector>
 Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
                                   uint_t shots,
                                   std::vector<RngEngine> &rng) const {
@@ -1461,11 +1461,11 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
   state.qreg().enable_batch(flg);
 
   // Convert to bit format
-  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
   i = 0;
   for (int_t val : allbit_samples) {
-    BitVector allbit_sample;
-    allbit_sample.from_uint(qubits.size(), val);
+    SampleVector allbit_sample;
+    allbit_sample.from_uint(val, qubits.size());
     all_samples[i++].map(allbit_sample, qubits);
   }
   return all_samples;
diff --git a/src/simulators/density_matrix/densitymatrix_state.hpp b/src/simulators/density_matrix/densitymatrix_state.hpp
index 41fd836f1e..5ce6889d49 100644
--- a/src/simulators/density_matrix/densitymatrix_state.hpp
+++ b/src/simulators/density_matrix/densitymatrix_state.hpp
@@ -130,8 +130,8 @@ class State : public QuantumState::State<densmat_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
-                                        RngEngine &rng) override;
+  std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                           RngEngine &rng) override;
 
   // Helper function for computing expectation value
   double expval_pauli(const reg_t &qubits, const std::string &pauli) override;
@@ -987,9 +987,9 @@ void State<densmat_t>::measure_reset_update(const reg_t &qubits,
 }
 
 template <class densmat_t>
-std::vector<BitVector> State<densmat_t>::sample_measure(const reg_t &qubits,
-                                                        uint_t shots,
-                                                        RngEngine &rng) {
+std::vector<SampleVector> State<densmat_t>::sample_measure(const reg_t &qubits,
+                                                           uint_t shots,
+                                                           RngEngine &rng) {
   // Generate flat register for storing
   std::vector<double> rnds;
   rnds.reserve(shots);
@@ -1003,7 +1003,7 @@ std::vector<BitVector> State<densmat_t>::sample_measure(const reg_t &qubits,
   int_t npar = BaseState::threads_;
   if (npar > shots)
     npar = shots;
-  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
 
   auto convert_to_bit_lambda = [this, &allbit_samples, &all_samples, shots,
                                 qubits, npar](int_t i) {
@@ -1011,8 +1011,8 @@ std::vector<BitVector> State<densmat_t>::sample_measure(const reg_t &qubits,
     ishot = shots * i / npar;
     iend = shots * (i + 1) / npar;
     for (; ishot < iend; ishot++) {
-      BitVector allbit_sample;
-      allbit_sample.from_uint(qubits.size(), allbit_samples[ishot]);
+      SampleVector allbit_sample;
+      allbit_sample.from_uint(allbit_samples[ishot], qubits.size());
       all_samples[ishot].map(allbit_sample, qubits);
     }
   };
diff --git a/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp b/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp
index 2c4c00e7e3..cc8c91adf1 100644
--- a/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp
+++ b/src/simulators/extended_stabilizer/extended_stabilizer_state.hpp
@@ -86,8 +86,8 @@ class State : public QuantumState::State<chstate_t> {
 
   void set_config(const Config &config) override;
 
-  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
-                                        RngEngine &rng) override;
+  std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                           RngEngine &rng) override;
 
 protected:
   // Alongside the sample measure optimisaiton, we can parallelise
@@ -415,8 +415,8 @@ void State::apply_ops(InputIterator first, InputIterator last,
   }
 }
 
-std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
-                                             RngEngine &rng) {
+std::vector<SampleVector> State::sample_measure(const reg_t &qubits,
+                                                uint_t shots, RngEngine &rng) {
   std::vector<uint_t> output_samples;
   if (BaseState::qreg_.get_num_states() == 1) {
     output_samples = BaseState::qreg_.stabilizer_sampler(shots, rng);
@@ -439,10 +439,10 @@ std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
       }
     }
   }
-  std::vector<BitVector> all_samples;
+  std::vector<SampleVector> all_samples;
   all_samples.reserve(shots);
   for (uint_t sample : output_samples) {
-    BitVector sample_bits(qubits.size());
+    SampleVector sample_bits(qubits.size());
     for (size_t i = 0; i < qubits.size(); i++) {
       if ((sample >> qubits[i]) & 1ULL) {
         sample_bits.set(i, true);
diff --git a/src/simulators/matrix_product_state/matrix_product_state.hpp b/src/simulators/matrix_product_state/matrix_product_state.hpp
index b415c9cfa8..60cba8195f 100644
--- a/src/simulators/matrix_product_state/matrix_product_state.hpp
+++ b/src/simulators/matrix_product_state/matrix_product_state.hpp
@@ -131,16 +131,16 @@ class State : public QuantumState::State<matrixproductstate_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<BitVector>
+  virtual std::vector<SampleVector>
   sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override;
 
   // Computes sample_measure by copying the MPS to a temporary structure, and
   // applying a measurement on the temporary MPS. This is done for every shot,
   // so is not efficient for a large number of shots
-  std::vector<BitVector> sample_measure_using_apply_measure(const reg_t &qubits,
-                                                            uint_t shots,
-                                                            RngEngine &rng);
-  std::vector<BitVector> sample_measure_all(uint_t shots, RngEngine &rng);
+  std::vector<SampleVector>
+  sample_measure_using_apply_measure(const reg_t &qubits, uint_t shots,
+                                     RngEngine &rng);
+  std::vector<SampleVector> sample_measure_all(uint_t shots, RngEngine &rng);
   //-----------------------------------------------------------------------
   // Additional methods
   //-----------------------------------------------------------------------
@@ -759,8 +759,8 @@ rvector_t State::measure_probs(const reg_t &qubits) const {
   return probvector;
 }
 
-std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
-                                             RngEngine &rng) {
+std::vector<SampleVector> State::sample_measure(const reg_t &qubits,
+                                                uint_t shots, RngEngine &rng) {
   // There are two alternative algorithms for sample measure
   // We choose the one that is optimal relative to the total number
   // of qubits,and the number of shots.
@@ -774,10 +774,10 @@ std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
   return sample_measure_using_apply_measure(qubits, shots, rng);
 }
 
-std::vector<BitVector>
+std::vector<SampleVector>
 State::sample_measure_using_apply_measure(const reg_t &qubits, uint_t shots,
                                           RngEngine &rng) {
-  std::vector<BitVector> all_samples;
+  std::vector<SampleVector> all_samples;
   all_samples.resize(shots);
   std::vector<rvector_t> rnds_list;
   rnds_list.reserve(shots);
@@ -803,8 +803,9 @@ State::sample_measure_using_apply_measure(const reg_t &qubits, uint_t shots,
   return all_samples;
 }
 
-std::vector<BitVector> State::sample_measure_all(uint_t shots, RngEngine &rng) {
-  std::vector<BitVector> all_samples;
+std::vector<SampleVector> State::sample_measure_all(uint_t shots,
+                                                    RngEngine &rng) {
+  std::vector<SampleVector> all_samples;
   all_samples.resize(shots);
 
 #pragma omp parallel for if (getenv("PRL_PROB_MEAS"))
diff --git a/src/simulators/multi_state_executor.hpp b/src/simulators/multi_state_executor.hpp
index 903e9b4bdf..3f180bea1b 100644
--- a/src/simulators/multi_state_executor.hpp
+++ b/src/simulators/multi_state_executor.hpp
@@ -827,7 +827,7 @@ void MultiStateExecutor<state_t>::measure_sampler(InputIterator first_meas,
                     meas_qubits.end());
 
   // Generate the samples
-  std::vector<BitVector> all_samples;
+  std::vector<SampleVector> all_samples;
   all_samples = this->sample_measure(state, meas_qubits, shots, rng);
 
   // Make qubit map of position in vector of measured qubits
diff --git a/src/simulators/sample_vector.hpp b/src/simulators/sample_vector.hpp
new file mode 100644
index 0000000000..36717bfc4f
--- /dev/null
+++ b/src/simulators/sample_vector.hpp
@@ -0,0 +1,219 @@
+/**
+ * This code is part of Qiskit.
+ *
+ * (C) Copyright IBM 2018, 2019, 2024.
+ *
+ * This code is licensed under the Apache License, Version 2.0. You may
+ * obtain a copy of this license in the LICENSE.txt file in the root directory
+ * of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+ *
+ * Any modifications or derivative works of this code must retain this
+ * copyright notice, and modified files need to carry a notice indicating
+ * that they have been altered from the originals.
+ */
+
+#ifndef _aer_simulator_sample_vector_hpp_
+#define _aer_simulator_sample_vector_hpp_
+
+#include "framework/types.hpp"
+
+namespace AER {
+
+//============================================================================
+// storage for sampling measure results
+//============================================================================
+
+// Fixed-length sequence of small unsigned values (bits for base 2), packed
+// into the words of bits_. Element i is read from word
+// (i >> vec_shift_bits_) at bit offset ((i & vec_mask_) << elem_shift_bits_).
+class SampleVector {
+protected:
+  reg_t bits_; // packed storage words
+  // The defaults below describe an empty base-2 vector, so a default
+  // constructed object (and any copy of it) never carries indeterminate
+  // values. allocate() recomputes the size and layout fields.
+  uint_t size_ = 0;            // number of elements stored
+  uint_t base_ = 2;            // radix of the stored elements
+  uint_t elem_shift_bits_ = 0; // an element slot is (1 << this) bits wide
+  uint_t elem_mask_ = 1;       // mask applied to a single element value
+  uint_t vec_shift_bits_ = 6;  // idx >> this selects the storage word
+  uint_t vec_mask_ = 63;       // idx & this selects the slot in the word
+  const static size_t REG_SIZE = 64; // word width assumed when packing
+
+public:
+  // Empty vector: size() and length() are both 0.
+  SampleVector() = default;
+  // Vector of nbits elements in the given base, sized by allocate().
+  SampleVector(uint_t nbits, uint_t base = 2) : base_(base) {
+    allocate(nbits, base);
+  }
+  // Copying and moving use the implicit member-wise operations; the
+  // hand-written copy constructor/assignment copied exactly these members
+  // and, being user-declared, suppressed the implicit move operations.
+
+  // Number of elements stored.
+  uint_t size() const { return size_; }
+  // Number of storage words in bits_.
+  uint_t length() const { return bits_.size(); }
+
+  // Resize to n elements of the given base and recompute the layout fields.
+  void allocate(uint_t n, uint_t base);
+
+  // Replace the contents from a digit string (see from_string).
+  SampleVector &operator=(const std::string &src) {
+    from_string(src);
+    return *this;
+  }
+  // Replace the contents from a vector of element values (see from_vector).
+  SampleVector &operator=(const reg_t &src) {
+    from_vector(src);
+    return *this;
+  }
+
+  // Rebuild as map.size() elements; element i is copied from src[map[i]].
+  void map(const SampleVector &src, const reg_t map);
+
+  // Element access; the index is not range checked.
+  inline uint_t get(const uint_t idx) const {
+    uint_t vpos = idx >> vec_shift_bits_;
+    uint_t bpos = (idx & vec_mask_) << elem_shift_bits_;
+    return ((bits_[vpos] >> bpos) & elem_mask_);
+  }
+  inline uint_t operator[](const uint_t idx) const { return get(idx); }
+  // operator() addresses a whole storage word, not an element.
+  inline uint_t &operator()(const uint_t pos) { return bits_[pos]; }
+  inline uint_t operator()(const uint_t pos) const { return bits_[pos]; }
+
+  // Overwrite element idx with val, masked to the element width.
+  inline void set(const uint_t idx, const uint_t val) {
+    uint_t vpos = idx >> vec_shift_bits_;
+    uint_t bpos = (idx & vec_mask_) << elem_shift_bits_;
+
+    uint_t mask = ~(elem_mask_ << bpos); // clears the slot before writing
+    bits_[vpos] &= mask;
+    bits_[vpos] |= ((val & elem_mask_) << bpos);
+  }
+
+  // convert from other data
+  // (from_uint stores src unchanged as the first storage word)
+  void from_uint(const uint_t src, const uint_t n, const uint_t base = 2);
+  void from_string(const std::string &src, const uint_t base = 2);
+  void from_vector(const reg_t &src, const uint_t base = 2);
+  void from_vector_with_map(const reg_t &src, const reg_t &map,
+                            const uint_t base = 2);
+
+  // convert to other data types
+  // (to_string writes the highest index first)
+  std::string to_string();
+  reg_t to_vector();
+};
+
+void SampleVector::allocate(uint_t n, uint_t base) {
+  // Layout: 64 one-bit slots per word; slot width doubles while t < base.
+  vec_shift_bits_ = 6;
+  uint_t t = 1;
+  elem_shift_bits_ = 0;
+  for (uint_t i = 0; i < 6; i++) {
+    t <<= 1;
+    if (t >= base) {
+      break;
+    }
+    vec_shift_bits_--;
+    elem_shift_bits_++;
+  }
+  elem_mask_ = (1ull << (elem_shift_bits_ + 1)) - 1;
+  vec_mask_ = (1ull << vec_shift_bits_) - 1;
+  // Round the word count up (plain n >> shift dropped a partial last word).
+  const uint_t words = (n + vec_mask_) >> vec_shift_bits_;
+  bits_.resize(words == 0 ? 1 : words, 0ull); // never fewer than one word
+  size_ = n;
+  base_ = base; // map() reads src.base_, so the radix must be recorded
+}
+
+void SampleVector::map(const SampleVector &src, const reg_t map) {
+  // Gather: resize to map.size() and copy src element map[k] into slot k.
+  const uint_t count = map.size();
+  allocate(count, src.base_);
+  for (uint_t k = 0; k < count; ++k)
+    set(k, src.get(map[k]));
+}
+
+void SampleVector::from_uint(const uint_t src, const uint_t n,
+                             const uint_t base) {
+  allocate(n, base);    // lay out n elements of the given base
+  bits_.front() = src;  // src becomes the first storage word, unchanged
+}
+
+void SampleVector::from_string(const std::string &src, const uint_t base) {
+  allocate(src.size(), base);
+  // Characters are consumed from the end of src, so the last character
+  // becomes element 0. Each storage word is built locally, then stored.
+  const uint_t per_word = REG_SIZE >> elem_shift_bits_;
+  uint_t next = 0; // index of the next element to fill
+  for (uint_t w = 0; w < bits_.size(); ++w) {
+    const uint_t left = size_ - next;
+    const uint_t count = (per_word > left) ? left : per_word;
+    uint_t word = 0;
+    for (uint_t slot = 0; slot < count; ++slot, ++next) {
+      const uint_t digit = static_cast<uint_t>(src[size_ - 1 - next] - '0');
+      word |= (digit & elem_mask_) << (slot << elem_shift_bits_);
+    }
+    bits_[w] = word;
+  }
+}
+
+void SampleVector::from_vector(const reg_t &src, const uint_t base) {
+  allocate(src.size(), base);
+  // Pack src in order (src[0] is element 0), one storage word at a time.
+  const uint_t per_word = REG_SIZE >> elem_shift_bits_;
+  uint_t next = 0; // index of the next element of src to pack
+  for (uint_t w = 0; w < bits_.size(); ++w) {
+    const uint_t left = size_ - next;
+    const uint_t count = (per_word > left) ? left : per_word;
+    uint_t word = 0;
+    for (uint_t slot = 0; slot < count; ++slot, ++next) {
+      word |= (src[next] & elem_mask_) << (slot << elem_shift_bits_);
+    }
+    bits_[w] = word;
+  }
+}
+
+void SampleVector::from_vector_with_map(const reg_t &src, const reg_t &map,
+                                        const uint_t base) {
+  // NOTE(review): sized by src (map() sizes by map) - confirm intent.
+  allocate(src.size(), base);
+  // Slots per word; a shadowing `n = REG_SIZE` here was only right for base 2.
+  const uint_t per_word = REG_SIZE >> elem_shift_bits_;
+  uint_t pos = 0;
+  for (uint_t i = 0; i < bits_.size(); i++) {
+    const uint_t left = size_ - pos;
+    const uint_t n = (per_word > left) ? left : per_word;
+    uint_t val = 0;
+    for (uint_t j = 0; j < n; j++) {
+      val |= ((src[map[pos++]] & elem_mask_) << (j << elem_shift_bits_));
+    }
+    bits_[i] = val;
+  }
+}
+
+std::string SampleVector::to_string(void) {
+  // Highest index first, so element 0 ends up as the last character.
+  std::string text;
+  for (uint_t k = size_; k > 0; --k) {
+    text += std::to_string(get(k - 1));
+  }
+  return text;
+}
+
+reg_t SampleVector::to_vector(void) {
+  reg_t values; // one entry per element, in index order
+  values.reserve(size_);
+  for (uint_t k = 0; k < size_; ++k)
+    values.push_back(get(k));
+  return values;
+}
+
+//------------------------------------------------------------------------------
+} // end namespace AER
+//------------------------------------------------------------------------------
+#endif // _aer_simulator_sample_vector_hpp_
diff --git a/src/simulators/stabilizer/stabilizer_state.hpp b/src/simulators/stabilizer/stabilizer_state.hpp
index a51359faa6..e695411b61 100644
--- a/src/simulators/stabilizer/stabilizer_state.hpp
+++ b/src/simulators/stabilizer/stabilizer_state.hpp
@@ -101,7 +101,7 @@ class State : public QuantumState::State<Clifford::Clifford> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<BitVector>
+  virtual std::vector<SampleVector>
   sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override;
 
   bool
@@ -512,12 +512,12 @@ reg_t State::apply_measure_and_update(const reg_t &qubits, RngEngine &rng) {
   return outcome;
 }
 
-std::vector<BitVector> State::sample_measure(const reg_t &qubits, uint_t shots,
-                                             RngEngine &rng) {
+std::vector<SampleVector> State::sample_measure(const reg_t &qubits,
+                                                uint_t shots, RngEngine &rng) {
   // TODO: see if we can improve efficiency by directly sampling from Clifford
   // table
   auto qreg_cache = BaseState::qreg_;
-  std::vector<BitVector> samples(shots);
+  std::vector<SampleVector> samples(shots);
   for (int_t ishot = 0; ishot < shots; ishot++) {
     samples[ishot] = apply_measure_and_update(qubits, rng);
     BaseState::qreg_ = qreg_cache; // restore pre-measurement data from cache
diff --git a/src/simulators/state.hpp b/src/simulators/state.hpp
index 3b6790f270..e0aba472b2 100644
--- a/src/simulators/state.hpp
+++ b/src/simulators/state.hpp
@@ -15,7 +15,6 @@
 #ifndef _aer_base_state_hpp_
 #define _aer_base_state_hpp_
 
-#include "framework/bitvector.hpp"
 #include "framework/config.hpp"
 #include "framework/creg.hpp"
 #include "framework/json.hpp"
@@ -25,6 +24,8 @@
 
 #include "noise/noise_model.hpp"
 
+#include "simulators/sample_vector.hpp"
+
 namespace AER {
 
 namespace QuantumState {
@@ -195,8 +196,8 @@ class Base {
   // to the system state. Even though this method is not marked as const
   // at the end of sample the system should be left in the same state
   // as before sampling
-  virtual std::vector<BitVector> sample_measure(const reg_t &qubits,
-                                                uint_t shots, RngEngine &rng);
+  virtual std::vector<SampleVector>
+  sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng);
 
   //-----------------------------------------------------------------------
   // Config Settings
@@ -284,11 +285,11 @@ void Base::set_config(const Config &config) {
   }
 }
 
-std::vector<BitVector> Base::sample_measure(const reg_t &qubits, uint_t shots,
-                                            RngEngine &rng) {
+std::vector<SampleVector> Base::sample_measure(const reg_t &qubits,
+                                               uint_t shots, RngEngine &rng) {
   (ignore_argument) qubits;
   (ignore_argument) shots;
-  return std::vector<BitVector>();
+  return std::vector<SampleVector>();
 }
 
 void Base::apply_ops(const OpItr first, const OpItr last,
diff --git a/src/simulators/statevector/statevector_executor.hpp b/src/simulators/statevector/statevector_executor.hpp
index a000ee5310..826642fc6a 100644
--- a/src/simulators/statevector/statevector_executor.hpp
+++ b/src/simulators/statevector/statevector_executor.hpp
@@ -194,7 +194,7 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
   void apply_measure(CircuitExecutor::Branch &root, const reg_t &qubits,
                      const reg_t &cmemory, const reg_t &cregister);
 
-  std::vector<BitVector>
+  std::vector<SampleVector>
   sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
                  std::vector<RngEngine> &rng) const override;
 
@@ -203,8 +203,8 @@ class Executor : public CircuitExecutor::ParallelStateExecutor<state_t>,
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
-                                        RngEngine &rng) const override;
+  std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                           RngEngine &rng) const override;
 };
 
 template <class state_t>
@@ -1145,9 +1145,9 @@ void Executor<state_t>::measure_reset_update(const std::vector<uint_t> &qubits,
 }
 
 template <class state_t>
-std::vector<BitVector> Executor<state_t>::sample_measure(const reg_t &qubits,
-                                                         uint_t shots,
-                                                         RngEngine &rng) const {
+std::vector<SampleVector>
+Executor<state_t>::sample_measure(const reg_t &qubits, uint_t shots,
+                                  RngEngine &rng) const {
   uint_t i, j;
   // Generate flat register for storing
   std::vector<double> rnds;
@@ -1240,11 +1240,11 @@ std::vector<BitVector> Executor<state_t>::sample_measure(const reg_t &qubits,
   BasePar::reduce_sum(local_samples);
 #endif
 
-  // Convert to BitVector format
+  // Convert to SampleVector format
   int_t npar = Base::parallel_state_update_;
   if (npar > local_samples.size())
     npar = local_samples.size();
-  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
 
   auto convert_to_bit_lambda = [this, &local_samples, &all_samples, shots,
                                 qubits, npar](int_t i) {
@@ -1252,8 +1252,8 @@ std::vector<BitVector> Executor<state_t>::sample_measure(const reg_t &qubits,
     ishot = local_samples.size() * i / npar;
     iend = local_samples.size() * (i + 1) / npar;
     for (; ishot < iend; ishot++) {
-      BitVector allbit_sample;
-      allbit_sample.from_uint(qubits.size(), local_samples[ishot]);
+      SampleVector allbit_sample;
+      allbit_sample.from_uint(local_samples[ishot], qubits.size());
       all_samples[ishot].map(allbit_sample, qubits);
     }
   };
@@ -1897,7 +1897,7 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
 }
 
 template <class state_t>
-std::vector<BitVector>
+std::vector<SampleVector>
 Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
                                   uint_t shots,
                                   std::vector<RngEngine> &rng) const {
@@ -1913,11 +1913,11 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
   state.qreg().enable_batch(flg);
 
   // Convert to bit format
-  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
   i = 0;
   for (int_t val : allbit_samples) {
-    BitVector allbit_sample;
-    allbit_sample.from_uint(qubits.size(), val);
+    SampleVector allbit_sample;
+    allbit_sample.from_uint(val, qubits.size());
     all_samples[i++].map(allbit_sample, qubits);
   }
   return all_samples;
diff --git a/src/simulators/statevector/statevector_state.hpp b/src/simulators/statevector/statevector_state.hpp
index 160db21ce5..32da868010 100755
--- a/src/simulators/statevector/statevector_state.hpp
+++ b/src/simulators/statevector/statevector_state.hpp
@@ -153,8 +153,8 @@ class State : public QuantumState::State<statevec_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  std::vector<BitVector> sample_measure(const reg_t &qubits, uint_t shots,
-                                        RngEngine &rng) override;
+  std::vector<SampleVector> sample_measure(const reg_t &qubits, uint_t shots,
+                                           RngEngine &rng) override;
 
   // Helper function for computing expectation value
   virtual double expval_pauli(const reg_t &qubits,
@@ -1020,9 +1020,9 @@ void State<statevec_t>::measure_reset_update(const std::vector<uint_t> &qubits,
 }
 
 template <class statevec_t>
-std::vector<BitVector> State<statevec_t>::sample_measure(const reg_t &qubits,
-                                                         uint_t shots,
-                                                         RngEngine &rng) {
+std::vector<SampleVector> State<statevec_t>::sample_measure(const reg_t &qubits,
+                                                            uint_t shots,
+                                                            RngEngine &rng) {
   uint_t i;
   // Generate flat register for storing
   std::vector<double> rnds;
@@ -1034,11 +1034,11 @@ std::vector<BitVector> State<statevec_t>::sample_measure(const reg_t &qubits,
 
   allbit_samples = BaseState::qreg_.sample_measure(rnds);
 
-  // Convert to BitVector format
+  // Convert to SampleVector format
   int_t npar = BaseState::threads_;
   if (npar > shots)
     npar = shots;
-  std::vector<BitVector> all_samples(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> all_samples(shots, SampleVector(qubits.size()));
 
   auto convert_to_bit_lambda = [this, &allbit_samples, &all_samples, shots,
                                 qubits, npar](int_t i) {
@@ -1046,8 +1046,8 @@ std::vector<BitVector> State<statevec_t>::sample_measure(const reg_t &qubits,
     ishot = shots * i / npar;
     iend = shots * (i + 1) / npar;
     for (; ishot < iend; ishot++) {
-      BitVector allbit_sample;
-      allbit_sample.from_uint(qubits.size(), allbit_samples[ishot]);
+      SampleVector allbit_sample;
+      allbit_sample.from_uint(allbit_samples[ishot], qubits.size());
       all_samples[ishot].map(allbit_sample, qubits);
     }
   };
diff --git a/src/simulators/tensor_network/tensor_net.hpp b/src/simulators/tensor_network/tensor_net.hpp
index 7effeb2145..f2a9ba115d 100644
--- a/src/simulators/tensor_network/tensor_net.hpp
+++ b/src/simulators/tensor_network/tensor_net.hpp
@@ -248,7 +248,8 @@ class TensorNet {
   // Return M sampled outcomes for Z-basis measurement of all qubits
   // The input is a length M list of random reals between [0, 1) used for
   // generating samples.
-  std::vector<BitVector> sample_measure(const std::vector<double> &rnds) const;
+  std::vector<SampleVector>
+  sample_measure(const std::vector<double> &rnds) const;
 
   void apply_reset(const reg_t &qubits);
 
@@ -320,7 +321,7 @@ class TensorNet {
 
   void buffer_statevector(void) const;
 
-  void sample_measure_branch(std::vector<BitVector> &samples,
+  void sample_measure_branch(std::vector<SampleVector> &samples,
                              const std::vector<double> &rnds,
                              const reg_t &input_sample_index,
                              const reg_t &input_shot_index,
@@ -1175,10 +1176,10 @@ void TensorNet<data_t>::apply_reset(const reg_t &qubits) {
 // Sample measure outcomes
 //------------------------------------------------------------------------------
 template <typename data_t>
-std::vector<BitVector>
+std::vector<SampleVector>
 TensorNet<data_t>::sample_measure(const std::vector<double> &rnds) const {
   const int_t SHOTS = rnds.size();
-  std::vector<BitVector> samples(SHOTS);
+  std::vector<SampleVector> samples(SHOTS);
   reg_t sample_index(SHOTS);
   reg_t shot_index(SHOTS);
   reg_t probs(num_qubits_, 0);
@@ -1193,12 +1194,10 @@ TensorNet<data_t>::sample_measure(const std::vector<double> &rnds) const {
 }
 
 template <typename data_t>
-void TensorNet<data_t>::sample_measure_branch(std::vector<BitVector> &samples,
-                                              const std::vector<double> &rnds,
-                                              const reg_t &input_sample_index,
-                                              const reg_t &input_shot_index,
-                                              const reg_t &input_measured_probs,
-                                              const uint_t pos_measured) const {
+void TensorNet<data_t>::sample_measure_branch(
+    std::vector<SampleVector> &samples, const std::vector<double> &rnds,
+    const reg_t &input_sample_index, const reg_t &input_shot_index,
+    const reg_t &input_measured_probs, const uint_t pos_measured) const {
   const uint_t SHOTS = rnds.size();
 
   /*---------------------------------------------------------------------------
diff --git a/src/simulators/tensor_network/tensor_net_executor.hpp b/src/simulators/tensor_network/tensor_net_executor.hpp
index 32a040b1d4..5bcc47532f 100644
--- a/src/simulators/tensor_network/tensor_net_executor.hpp
+++ b/src/simulators/tensor_network/tensor_net_executor.hpp
@@ -67,7 +67,7 @@ class Executor : public CircuitExecutor::MultiStateExecutor<state_t> {
   void apply_kraus(CircuitExecutor::Branch &root, const reg_t &qubits,
                    const std::vector<cmatrix_t> &kmats);
 
-  std::vector<BitVector>
+  std::vector<SampleVector>
   sample_measure(state_t &state, const reg_t &qubits, uint_t shots,
                  std::vector<RngEngine> &rng) const override;
 
@@ -529,7 +529,7 @@ void Executor<state_t>::apply_save_amplitudes(CircuitExecutor::Branch &root,
 }
 
 template <class state_t>
-std::vector<BitVector>
+std::vector<SampleVector>
 Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
                                   uint_t shots,
                                   std::vector<RngEngine> &rng) const {
@@ -540,8 +540,8 @@ Executor<state_t>::sample_measure(state_t &state, const reg_t &qubits,
   for (i = 0; i < (int_t)shots; ++i)
     rnds.push_back(rng[i].rand(0, 1));
 
-  std::vector<BitVector> samples = state.qreg().sample_measure(rnds);
-  std::vector<BitVector> ret(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> samples = state.qreg().sample_measure(rnds);
+  std::vector<SampleVector> ret(shots, SampleVector(qubits.size()));
 
   if (omp_get_num_threads() > 1) {
     for (i = 0; i < (int_t)shots; ++i) {
diff --git a/src/simulators/tensor_network/tensor_net_state.hpp b/src/simulators/tensor_network/tensor_net_state.hpp
index 4d50230298..b7e5c36368 100644
--- a/src/simulators/tensor_network/tensor_net_state.hpp
+++ b/src/simulators/tensor_network/tensor_net_state.hpp
@@ -139,7 +139,7 @@ class State : public QuantumState::State<tensor_net_t> {
 
   // Sample n-measurement outcomes without applying the measure operation
   // to the system state
-  virtual std::vector<BitVector>
+  virtual std::vector<SampleVector>
   sample_measure(const reg_t &qubits, uint_t shots, RngEngine &rng) override;
 
   // Load the threshold for applying OpenMP parallelization
@@ -896,17 +896,17 @@ void State<tensor_net_t>::measure_reset_update(
 }
 
 template <class tensor_net_t>
-std::vector<BitVector> State<tensor_net_t>::sample_measure(const reg_t &qubits,
-                                                           uint_t shots,
-                                                           RngEngine &rng) {
+std::vector<SampleVector>
+State<tensor_net_t>::sample_measure(const reg_t &qubits, uint_t shots,
+                                    RngEngine &rng) {
   // Generate flat register for storing
   std::vector<double> rnds(shots);
 
   for (uint_t i = 0; i < shots; ++i)
     rnds[i] = rng.rand(0, 1);
 
-  std::vector<BitVector> samples = BaseState::qreg_.sample_measure(rnds);
-  std::vector<BitVector> ret(shots, BitVector(qubits.size()));
+  std::vector<SampleVector> samples = BaseState::qreg_.sample_measure(rnds);
+  std::vector<SampleVector> ret(shots, SampleVector(qubits.size()));
 
   if (omp_get_num_threads() > 1) {
     for (uint_t i = 0; i < shots; ++i) {

From 8467863a37611f86b2e7ee4dd49ecf367a803833 Mon Sep 17 00:00:00 2001
From: Jun Doi <doichan@jp.ibm.com>
Date: Fri, 1 Mar 2024 17:19:53 +0900
Subject: [PATCH 3/3] format

---
 src/controllers/state_controller.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/controllers/state_controller.hpp b/src/controllers/state_controller.hpp
index 0335f4c493..7cf4843cb2 100644
--- a/src/controllers/state_controller.hpp
+++ b/src/controllers/state_controller.hpp
@@ -1458,7 +1458,8 @@ std::vector<std::string> AerState::sample_memory(const reg_t &qubits,
 
   std::vector<std::string> ret;
   ret.reserve(shots);
-  std::vector<SampleVector> samples = state_->sample_measure(qubits, shots, rng_);
+  std::vector<SampleVector> samples =
+      state_->sample_measure(qubits, shots, rng_);
   for (auto &sample : samples) {
     ret.push_back(sample.to_string());
   }
@@ -1471,7 +1472,8 @@ std::unordered_map<uint_t, uint_t> AerState::sample_counts(const reg_t &qubits,
 
   flush_ops();
 
-  std::vector<SampleVector> samples = state_->sample_measure(qubits, shots, rng_);
+  std::vector<SampleVector> samples =
+      state_->sample_measure(qubits, shots, rng_);
   std::unordered_map<uint_t, uint_t> ret;
   for (const auto &sample : samples) {
     uint_t sample_u = sample(0); // only the first 64bits is used