From cf49fd902596310d4aad8b74fe5f25501172cbb9 Mon Sep 17 00:00:00 2001 From: Jiakun Yan Date: Mon, 29 Sep 2025 11:24:36 -0500 Subject: [PATCH] fix the clang-format ci and clang-format all files --- .clang-format | 39 +- .github/workflows/clang-format.yaml | 19 +- include/converse.h | 71 +- src/barrier.h | 24 +- src/cldb.none.cpp | 8 +- src/cldb.rand.cpp | 5 +- src/cmirdmautils.cpp | 96 +- src/collectives.cpp | 366 +- src/comm_backend/comm_backend.h | 6 +- src/comm_backend/comm_backend_internal.cpp | 13 +- src/comm_backend/comm_backend_internal.h | 5 +- src/comm_backend/lci2/comm_backend_lci2.cpp | 5 +- src/comm_backend/lci2/comm_backend_lci2.h | 11 +- src/concurrentqueue.h | 7400 ++++++++++--------- src/conv-conds.cpp | 23 +- src/conv-rdma.cpp | 97 +- src/conv-topology.cpp | 433 +- src/conv-topology.h | 5 +- src/convcore.cpp | 21 +- src/converse_internal.h | 10 +- src/cpuaffinity.cpp | 387 +- src/msgmgr.cpp | 123 +- src/queue.h | 163 +- src/scheduler.cpp | 21 +- src/threads.cpp | 21 +- tests/ping_ack/ping.cpp | 184 +- tests/rdma_pingpong/pingpong.cpp | 31 +- tests/self_send/self_send.cpp | 12 +- 28 files changed, 5024 insertions(+), 4575 deletions(-) diff --git a/.clang-format b/.clang-format index 1934577..b68024f 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,6 @@ --- Language: Cpp -BasedOnStyle: LLVM +# BasedOnStyle: LLVM AccessModifierOffset: -2 AlignAfterOpenBracket: Align AlignArrayOfStructures: None @@ -36,29 +36,7 @@ AlignConsecutiveShortCaseStatements: Enabled: false AcrossEmptyLines: false AcrossComments: false - AlignCaseArrows: false AlignCaseColons: false -AlignConsecutiveTableGenBreakingDAGArgColons: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - AlignFunctionPointers: false - PadOperators: false -AlignConsecutiveTableGenCondOperatorColons: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - AlignFunctionPointers: false - PadOperators: false -AlignConsecutiveTableGenDefinitionColons: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - AlignFunctionPointers: false - PadOperators: false AlignEscapedNewlines: Right AlignOperands: Align AlignTrailingComments: @@ -68,7 +46,6 @@ AllowAllArgumentsOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowBreakBeforeNoexceptSpecifier: Never AllowShortBlocksOnASingleLine: Never -AllowShortCaseExpressionOnASingleLine: true AllowShortCaseLabelsOnASingleLine: false AllowShortCompoundRequirementOnASingleLine: true AllowShortEnumsOnASingleLine: true @@ -77,7 +54,9 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: All AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine AttributeMacros: - __capability BinPackArguments: true @@ -105,7 +84,6 @@ BraceWrapping: BreakAdjacentStringLiterals: true BreakAfterAttributes: Leave BreakAfterJavaFieldAnnotations: false -BreakAfterReturnType: None BreakArrays: true BreakBeforeBinaryOperators: None BreakBeforeConceptDeclarations: Always @@ -113,10 +91,8 @@ BreakBeforeBraces: Attach BreakBeforeInlineASMColon: OnlyMultiline BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon -BreakFunctionDefinitionParameters: false BreakInheritanceList: BeforeColon BreakStringLiterals: true -BreakTemplateDeclarations: MultiLine ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' 
CompactNamespaces: false @@ -172,15 +148,12 @@ IntegerLiteralSeparator: HexMinDigits: 0 JavaScriptQuotes: Leave JavaScriptWrapImports: true -KeepEmptyLines: - AtEndOfFile: false - AtStartOfBlock: true - AtStartOfFile: true +KeepEmptyLinesAtTheStartOfBlocks: true +KeepEmptyLinesAtEOF: false LambdaBodyIndentation: Signature LineEnding: DeriveLF MacroBlockBegin: '' MacroBlockEnd: '' -MainIncludeChar: Quote MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Auto @@ -249,7 +222,6 @@ SpacesInLineCommentPrefix: Maximum: -1 SpacesInParens: Never SpacesInParensOptions: - ExceptDoubleParentheses: false InCStyleCasts: false InConditionalStatements: false InEmptyParentheses: false @@ -261,7 +233,6 @@ StatementAttributeLikeMacros: StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION -TableGenBreakInsideDAGArg: DontBreak TabWidth: 8 UseTab: Never VerilogBreakBetweenInstancePorts: true diff --git a/.github/workflows/clang-format.yaml b/.github/workflows/clang-format.yaml index 710accd..a53e3f9 100644 --- a/.github/workflows/clang-format.yaml +++ b/.github/workflows/clang-format.yaml @@ -21,10 +21,19 @@ jobs: sudo apt-get update sudo apt-get install clang-format - - name: Run clang-format + - name: Verify clang-format installation run: | - git diff --exit-code --ignore-submodules -- '*.cpp' '*.h' '*.c' '*.hpp' - if [ $? -ne 0 ]; then - echo "Clang-format failed. Please format your code." + clang-format --version + + - name: Run clang-format on all files + run: | + # From anywhere inside the repo: + repo_root=$(git rev-parse --show-toplevel) && cd "$repo_root" + git ls-files -z -- '*.c' '*.cc' '*.cpp' '*.cxx' '*.h' '*.hh' '*.hpp' '*.hxx' \ + | xargs -0 -r clang-format -i + + # Fail if formatting made changes + git diff --exit-code || { + echo "clang-format changed files above. Please commit the formatting." exit 1 - fi \ No newline at end of file + } \ No newline at end of file diff --git a/include/converse.h b/include/converse.h index 85bb459..bf74caa 100644 --- a/include/converse.h +++ b/include/converse.h @@ -302,7 +302,7 @@ int CmiMyRank(); int CmiNumPes(); int CmiNumNodes(); // FIXME -//#define CmiPhysicalNodeID(node) (node) +// #define CmiPhysicalNodeID(node) (node) extern int CmiPhysicalNodeID(int pe); int CmiNodeOf(int pe); int CmiRankOf(int pe); @@ -836,12 +836,12 @@ enum ncpyFreeNcpyOpInfoMode { #define CMK_SPANTREE_MAXSPAN 4 #define CST_W (CMK_SPANTREE_MAXSPAN) #define CST_NN (CmiNumNodes()) -#define CmiNodeSpanTreeParent(n) ((n) ? (((n)-1) / CST_W) : (-1)) +#define CmiNodeSpanTreeParent(n) ((n) ? (((n) - 1) / CST_W) : (-1)) #define CmiNodeSpanTreeChildren(n, c) \ do { \ int _i; \ for (_i = 0; _i < CST_W; _i++) { \ - int _x = (n)*CST_W + _i + 1; \ + int _x = (n) * CST_W + _i + 1; \ if (_x < CST_NN) \ (c)[_i] = _x; \ } \ @@ -849,7 +849,7 @@ enum ncpyFreeNcpyOpInfoMode { #define CmiNumNodeSpanTreeChildren(n) \ ((((n) + 1) * CST_W < CST_NN) \ ? CST_W \ - : ((((n)*CST_W + 1) >= CST_NN) ? 0 : ((CST_NN - 1) - (n)*CST_W))) + : ((((n) * CST_W + 1) >= CST_NN) ? 
0 : ((CST_NN - 1) - (n) * CST_W))) #define CST_R(p) (CmiRankOf(p)) #define CST_NF(n) (CmiNodeFirst(n)) #define CST_SP(n) (CmiNodeSpanTreeParent(n)) @@ -920,32 +920,42 @@ void registerTraceInit(void (*fn)(char **argv)); int CmiDeliverMsgs(int maxmsgs); -#define CmiMemoryReadFence() std::atomic_thread_fence(std::memory_order_seq_cst) -#define CmiMemoryWriteFence() std::atomic_thread_fence(std::memory_order_seq_cst) +#define CmiMemoryReadFence() std::atomic_thread_fence(std::memory_order_seq_cst) +#define CmiMemoryWriteFence() \ + std::atomic_thread_fence(std::memory_order_seq_cst) extern CmiNodeLock CmiMemLock_lock; -#define CmiMemLock() do{if (CmiMemLock_lock) CmiLock(CmiMemLock_lock);} while (0) +#define CmiMemLock() \ + do { \ + if (CmiMemLock_lock) \ + CmiLock(CmiMemLock_lock); \ + } while (0) -#define CmiMemUnlock() do{if (CmiMemLock_lock) CmiUnlock(CmiMemLock_lock);} while (0) +#define CmiMemUnlock() \ + do { \ + if (CmiMemLock_lock) \ + CmiUnlock(CmiMemLock_lock); \ + } while (0) template struct CmiIsAtomic : std::false_type {}; template struct CmiIsAtomic> : std::true_type {}; template typename std::enable_if::value, typename T::value_type>::type -CmiAtomicFetchAndIncImpl(T& input) { - return std::atomic_fetch_add(&input, typename T::value_type(1)); +CmiAtomicFetchAndIncImpl(T &input) { + return std::atomic_fetch_add(&input, typename T::value_type(1)); } template typename std::enable_if::value, T>::type -CmiAtomicFetchAndIncImpl(T& input) { - T old = input; - ++input; - return old; +CmiAtomicFetchAndIncImpl(T &input) { + T old = input; + ++input; + return old; } -#define CmiMemoryAtomicFetchAndInc(input, output) ((output) = CmiAtomicFetchAndIncImpl(input)) +#define CmiMemoryAtomicFetchAndInc(input, output) \ + ((output) = CmiAtomicFetchAndIncImpl(input)) #define CmiEnableUrgentSend(yn) /* intentionally left empty */ @@ -953,30 +963,29 @@ typedef struct CmmTableStruct *CmmTable; #define CmmWildCard (-1) -//typedef void (*CmmPupMessageFn)(pup_er p,void **msg); -//CmmTable CmmPup(pup_er p, CmmTable t, CmmPupMessageFn msgpup); - -CmmTable CmmNew(void); -void CmmFree(CmmTable t); -void CmmFreeAll(CmmTable t); -void CmmPut(CmmTable t, int ntags, int *tags, void *msg); -void *CmmFind(CmmTable t, int ntags, int *tags, int *returntags, int del); -int CmmEntries(CmmTable t); -int CmmGetLastTag(CmmTable t, int ntags, int *tags); -#define CmmGet(t,nt,tg,rt) (CmmFind((t),(nt),(tg),(rt),1)) -#define CmmProbe(t,nt,tg,rt) (CmmFind((t),(nt),(tg),(rt),0)) +// typedef void (*CmmPupMessageFn)(pup_er p,void **msg); +// CmmTable CmmPup(pup_er p, CmmTable t, CmmPupMessageFn msgpup); +CmmTable CmmNew(void); +void CmmFree(CmmTable t); +void CmmFreeAll(CmmTable t); +void CmmPut(CmmTable t, int ntags, int *tags, void *msg); +void *CmmFind(CmmTable t, int ntags, int *tags, int *returntags, int del); +int CmmEntries(CmmTable t); +int CmmGetLastTag(CmmTable t, int ntags, int *tags); +#define CmmGet(t, nt, tg, rt) (CmmFind((t), (nt), (tg), (rt), 1)) +#define CmmProbe(t, nt, tg, rt) (CmmFind((t), (nt), (tg), (rt), 0)) #ifndef CMI_CACHE_LINE_SIZE #ifdef __cpp_lib_hardware_interference_size -# define CMI_CACHE_LINE_SIZE std::hardware_destructive_interference_size +#define CMI_CACHE_LINE_SIZE std::hardware_destructive_interference_size #elif CMK_PPC64 || (defined __APPLE__ && defined __arm64__) -# define CMI_CACHE_LINE_SIZE 128 +#define CMI_CACHE_LINE_SIZE 128 #else -# define CMI_CACHE_LINE_SIZE 64 +#define CMI_CACHE_LINE_SIZE 64 #endif #endif -//partitions +// partitions typedef enum Partition_Type { 
PARTITION_SINGLETON, diff --git a/src/barrier.h b/src/barrier.h index a77a804..51f6f9f 100644 --- a/src/barrier.h +++ b/src/barrier.h @@ -1,22 +1,21 @@ -// Acknowledgement: adopted from https://github.com/uiuc-hpc/lci/blob/master/lct/tbarrier/tbarrier.cpp +// Acknowledgement: adopted from +// https://github.com/uiuc-hpc/lci/blob/master/lct/tbarrier/tbarrier.cpp -#include #include +#include #include class Barrier { - private: +private: alignas(64) std::atomic waiting; alignas(64) std::atomic step; alignas(64) int thread_num_; - public: +public: explicit Barrier(int thread_num) - : waiting(0), step(0), thread_num_(thread_num) - {} + : waiting(0), step(0), thread_num_(thread_num) {} - int64_t arrive() - { + int64_t arrive() { int64_t mstep = step.load(); if (++waiting == thread_num_) { waiting = 0; @@ -27,13 +26,12 @@ class Barrier { bool test_ticket(int64_t ticket) { return ticket != step; } - void wait_ticket(int64_t ticket) - { - while (!test_ticket(ticket)) continue; + void wait_ticket(int64_t ticket) { + while (!test_ticket(ticket)) + continue; } - void wait() - { + void wait() { int64_t ticket = arrive(); wait_ticket(ticket); } diff --git a/src/cldb.none.cpp b/src/cldb.none.cpp index 39d4aaa..3e625e8 100644 --- a/src/cldb.none.cpp +++ b/src/cldb.none.cpp @@ -25,9 +25,9 @@ void CldHandler(char *msg) { msg); // use priority queue when we add priority queue } -void CldEnqueueGroup(CmiGroup grp, void *msg, int infofn) -{ - int len, queueing, priobits; unsigned int *prioptr; +void CldEnqueueGroup(CmiGroup grp, void *msg, int infofn) { + int len, queueing, priobits; + unsigned int *prioptr; CldInfoFn ifn = (CldInfoFn)CmiHandlerToFunction(infofn); CldPackFn pfn; ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); @@ -36,7 +36,7 @@ void CldEnqueueGroup(CmiGroup grp, void *msg, int infofn) ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); } CldSwitchHandler((char *)msg, CldHandlerIndex); - CmiSetInfo(msg,infofn); + CmiSetInfo(msg, infofn); CmiSyncMulticastAndFree(grp, len, msg); } diff --git a/src/cldb.rand.cpp b/src/cldb.rand.cpp index ccda7d4..e936339 100644 --- a/src/cldb.rand.cpp +++ b/src/cldb.rand.cpp @@ -103,9 +103,8 @@ void CldEnqueue(int pe, void *msg, int infofn) { /* CsdEnqueueGeneral is not thread or SIGIO safe */ // CmiPrintf(" myself processor %d ==> %d, length=%d Timer:%f , priori=%d // \n", CmiMyPe(), pe, len, CmiWallTimer(), *prioptr); - //CsdEnqueueGeneral(msg, queueing, priobits, prioptr); - CmiPushPE(CmiMyPe(), len, - msg); + // CsdEnqueueGeneral(msg, queueing, priobits, prioptr); + CmiPushPE(CmiMyPe(), len, msg); } else { ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); if (pfn && CmiNodeOf(pe) != CmiMyNode()) { diff --git a/src/cmirdmautils.cpp b/src/cmirdmautils.cpp index 990c4b5..ea77245 100644 --- a/src/cmirdmautils.cpp +++ b/src/cmirdmautils.cpp @@ -1,43 +1,25 @@ #include "cmirdmautils.h" #include "converse.h" // for CmiAbort usage to avoid undeclared warning +#include #include #include -#include -int getNcpyOpInfoTotalSize( - int srcLayerSize, - int srcAckSize, - int destLayerSize, - int destAckSize) { - return sizeof(NcpyOperationInfo) + srcLayerSize + destLayerSize + srcAckSize + destAckSize; +int getNcpyOpInfoTotalSize(int srcLayerSize, int srcAckSize, int destLayerSize, + int destAckSize) { + return sizeof(NcpyOperationInfo) + srcLayerSize + destLayerSize + srcAckSize + + destAckSize; } void setNcpyOpInfo( - const void *srcPtr, - char *srcLayerInfo, - int srcLayerSize, - char *srcAck, - int srcAckSize, - size_t srcSize, - unsigned short int srcRegMode, - 
unsigned short int srcDeregMode, - unsigned short int isSrcRegistered, - int srcPe, - const void *srcRef, - const void *destPtr, - char *destLayerInfo, - int destLayerSize, - char *destAck, - int destAckSize, - size_t destSize, - unsigned short int destRegMode, - unsigned short int destDeregMode, - unsigned short int isDestRegistered, - int destPe, - const void *destRef, - int rootNode, - NcpyOperationInfo *ncpyOpInfo) { + const void *srcPtr, char *srcLayerInfo, int srcLayerSize, char *srcAck, + int srcAckSize, size_t srcSize, unsigned short int srcRegMode, + unsigned short int srcDeregMode, unsigned short int isSrcRegistered, + int srcPe, const void *srcRef, const void *destPtr, char *destLayerInfo, + int destLayerSize, char *destAck, int destAckSize, size_t destSize, + unsigned short int destRegMode, unsigned short int destDeregMode, + unsigned short int isDestRegistered, int destPe, const void *destRef, + int rootNode, NcpyOperationInfo *ncpyOpInfo) { char *base = (char *)ncpyOpInfo + sizeof(NcpyOperationInfo); @@ -47,28 +29,28 @@ void setNcpyOpInfo( ncpyOpInfo->destAck = NULL; // memcpy srcLayerInfo - if(srcLayerInfo != NULL && srcLayerSize != 0) { + if (srcLayerInfo != NULL && srcLayerSize != 0) { memcpy(base, srcLayerInfo, srcLayerSize); ncpyOpInfo->srcLayerInfo = base; base = base + srcLayerSize; } // memcpy srcAckInfo - if(srcAck != NULL && srcAckSize != 0) { + if (srcAck != NULL && srcAckSize != 0) { memcpy(base, srcAck, srcAckSize); ncpyOpInfo->srcAck = base; base = base + srcAckSize; } // memcpy destLayerInfo - if(srcLayerInfo != NULL && destLayerSize != 0) { + if (srcLayerInfo != NULL && destLayerSize != 0) { memcpy(base, destLayerInfo, destLayerSize); ncpyOpInfo->destLayerInfo = base; base = base + destLayerSize; } // memcpy destAck Info - if(destAck != NULL && destAckSize != 0) { + if (destAck != NULL && destAckSize != 0) { memcpy(base, destAck, destAckSize); ncpyOpInfo->destAck = base; } @@ -103,50 +85,56 @@ void setNcpyOpInfo( CmiAssert(isDestRegistered <= std::numeric_limits::max()); ncpyOpInfo->isDestRegistered = (unsigned char)isDestRegistered; - ncpyOpInfo->opMode = CMK_DIRECT_API; // default operation mode is CMK_DIRECT_API - ncpyOpInfo->ackMode = CMK_SRC_DEST_ACK; // default ack mode is CMK_SRC_DEST_ACK - ncpyOpInfo->freeMe = CMK_FREE_NCPYOPINFO; // default freeMe mode is CMK_FREE_NCPYOPINFO + ncpyOpInfo->opMode = + CMK_DIRECT_API; // default operation mode is CMK_DIRECT_API + ncpyOpInfo->ackMode = + CMK_SRC_DEST_ACK; // default ack mode is CMK_SRC_DEST_ACK + ncpyOpInfo->freeMe = + CMK_FREE_NCPYOPINFO; // default freeMe mode is CMK_FREE_NCPYOPINFO ncpyOpInfo->rootNode = rootNode; - ncpyOpInfo->ncpyOpInfoSize = (unsigned short int)(sizeof(NcpyOperationInfo) + srcLayerSize + destLayerSize + srcAckSize + destAckSize); + ncpyOpInfo->ncpyOpInfoSize = + (unsigned short int)(sizeof(NcpyOperationInfo) + srcLayerSize + + destLayerSize + srcAckSize + destAckSize); } - void resetNcpyOpInfoPointers(NcpyOperationInfo *ncpyOpInfo) { char *base = (char *)ncpyOpInfo + sizeof(NcpyOperationInfo); - if(ncpyOpInfo->srcLayerInfo) { + if (ncpyOpInfo->srcLayerInfo) { ncpyOpInfo->srcLayerInfo = base; base = base + ncpyOpInfo->srcLayerSize; } - if(ncpyOpInfo->srcAck) { + if (ncpyOpInfo->srcAck) { ncpyOpInfo->srcAck = base; base = base + ncpyOpInfo->srcAckSize; } - if(ncpyOpInfo->destLayerInfo) { + if (ncpyOpInfo->destLayerInfo) { ncpyOpInfo->destLayerInfo = base; base = base + ncpyOpInfo->destLayerSize; } - if(ncpyOpInfo->destAck) { + if (ncpyOpInfo->destAck) { ncpyOpInfo->destAck = base; } - 
} void setReverseModeForNcpyOpInfo(NcpyOperationInfo *ncpyOpInfo) { - switch(ncpyOpInfo->opMode) { - case CMK_EM_API : ncpyOpInfo->opMode = CMK_EM_API_REVERSE; - break; - case CMK_DIRECT_API : // Do nothing - break; - case CMK_BCAST_EM_API : ncpyOpInfo->opMode = CMK_BCAST_EM_API_REVERSE; - break; - default : CmiAbort("Unknown opcode"); - break; + switch (ncpyOpInfo->opMode) { + case CMK_EM_API: + ncpyOpInfo->opMode = CMK_EM_API_REVERSE; + break; + case CMK_DIRECT_API: // Do nothing + break; + case CMK_BCAST_EM_API: + ncpyOpInfo->opMode = CMK_BCAST_EM_API_REVERSE; + break; + default: + CmiAbort("Unknown opcode"); + break; } } diff --git a/src/collectives.cpp b/src/collectives.cpp index 94f4b85..e0037b5 100644 --- a/src/collectives.cpp +++ b/src/collectives.cpp @@ -27,187 +27,187 @@ void collectiveInit(void) { /* Broadcast to everyone but the source pe. Source does not free. */ void CmiSyncBroadcast(int size, void *msg) { - DEBUGF("[%d] CmiSyncBroadcast\n", CmiMyPe()); - int pe = CmiMyPe(); - - CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - DEBUGF("[%d] Spanning tree option\n", CmiMyPe()); - CmiSetBcastSource(msg, pe); // used to skip the source - header->swapHandlerId = header->handlerId; - header->handlerId = Cmi_bcastHandler; - CmiSyncSend(0, size, msg); - #else - for (int i = pe + 1; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - - for (int i = 0; i < pe; i++) - CmiSyncSend(i, size, msg); - #endif - #else - - for (int i = pe + 1; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - - for (int i = 0; i < pe; i++) - CmiSyncSend(i, size, msg); - #endif - } - - void CmiSyncBroadcastAndFree(int size, void *msg) { - CmiSyncBroadcast(size, msg); - CmiFree(msg); - } - - void CmiSyncBroadcastAll(int size, void *msg) { - DEBUGF("[%d] CmiSyncBroadcastAll\n", CmiMyPe()); - CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - CmiSetBcastSource(msg, -1); // don't skip the source - header->swapHandlerId = header->handlerId; - - header->handlerId = Cmi_bcastHandler; - CmiSyncSend(0, size, msg); - #else - for (int i = 0; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - #endif - #else - for (int i = 0; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - #endif - } - - void CmiSyncBroadcastAllAndFree(int size, void *msg) { - CmiSyncBroadcastAll(size, msg); - CmiFree(msg); - } - - void CmiWithinNodeBroadcast(int size, void *msg) { - for (int i = 0; i < CmiMyNodeSize(); i++) { - int destPe = CmiMyNode() * CmiMyNodeSize() + i; - CmiSyncSend(destPe, size, msg); - } - } - - void CmiSyncNodeBroadcast(unsigned int size, void *msg) { - - int node = CmiMyNode(); - - CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - CmiSetBcastSource(msg, node); // used to skip the source - header->swapHandlerId = header->handlerId; - header->handlerId = Cmi_nodeBcastHandler; - CmiSyncNodeSend(0, size, msg); - #else - - for (int i = node + 1; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - - for (int i = 0; i < node; i++) - CmiSyncNodeSend(i, size, msg); - #endif - #else - - for (int i = node + 1; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - - for (int i = 0; i < node; i++) - CmiSyncNodeSend(i, size, msg); - #endif - } - - void CmiSyncNodeBroadcastAndFree(unsigned int size, void *msg) { - CmiSyncNodeBroadcast(size, msg); - CmiFree(msg); - } - - void CmiSyncNodeBroadcastAll(unsigned int size, void *msg) { - 
CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - CmiSetBcastSource(msg, -1); // don't skip the source - header->swapHandlerId = header->handlerId; - header->handlerId = Cmi_nodeBcastHandler; - CmiSyncNodeSend(0, size, msg); - #else - - for (int i = 0; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - #endif - #else - - for (int i = 0; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - #endif - } - - void CmiSyncNodeBroadcastAllAndFree(unsigned int size, void *msg) { - CmiSyncNodeBroadcastAll(size, msg); - CmiFree(msg); + DEBUGF("[%d] CmiSyncBroadcast\n", CmiMyPe()); + int pe = CmiMyPe(); + + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + DEBUGF("[%d] Spanning tree option\n", CmiMyPe()); + CmiSetBcastSource(msg, pe); // used to skip the source + header->swapHandlerId = header->handlerId; + header->handlerId = Cmi_bcastHandler; + CmiSyncSend(0, size, msg); +#else + for (int i = pe + 1; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); + + for (int i = 0; i < pe; i++) + CmiSyncSend(i, size, msg); +#endif +#else + + for (int i = pe + 1; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); + + for (int i = 0; i < pe; i++) + CmiSyncSend(i, size, msg); +#endif +} + +void CmiSyncBroadcastAndFree(int size, void *msg) { + CmiSyncBroadcast(size, msg); + CmiFree(msg); +} + +void CmiSyncBroadcastAll(int size, void *msg) { + DEBUGF("[%d] CmiSyncBroadcastAll\n", CmiMyPe()); + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + CmiSetBcastSource(msg, -1); // don't skip the source + header->swapHandlerId = header->handlerId; + + header->handlerId = Cmi_bcastHandler; + CmiSyncSend(0, size, msg); +#else + for (int i = 0; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); +#endif +#else + for (int i = 0; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); +#endif +} + +void CmiSyncBroadcastAllAndFree(int size, void *msg) { + CmiSyncBroadcastAll(size, msg); + CmiFree(msg); +} + +void CmiWithinNodeBroadcast(int size, void *msg) { + for (int i = 0; i < CmiMyNodeSize(); i++) { + int destPe = CmiMyNode() * CmiMyNodeSize() + i; + CmiSyncSend(destPe, size, msg); } - - /* Handler for broadcast via the spanning tree. 
*/ - void CmiBcastHandler(void *msg) { - int mype = CmiMyPe(); - int numChildren = CmiNumSpanTreeChildren(mype); - int children[numChildren]; - CmiSpanTreeChildren(mype, children); - - CmiMessageHeader *header = static_cast(msg); - - // send broadcast to all children - for (int i = 0; i < numChildren; i++) { - CmiSyncSend(children[i], header->messageSize, msg); - } - - // call handler locally (unless I am source of broadcast, and bcast is - // exclusive) - if (CmiGetBcastSource(msg) != mype) { - CmiCallHandler(header->swapHandlerId, msg); - } +} + +void CmiSyncNodeBroadcast(unsigned int size, void *msg) { + + int node = CmiMyNode(); + + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + CmiSetBcastSource(msg, node); // used to skip the source + header->swapHandlerId = header->handlerId; + header->handlerId = Cmi_nodeBcastHandler; + CmiSyncNodeSend(0, size, msg); +#else + + for (int i = node + 1; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); + + for (int i = 0; i < node; i++) + CmiSyncNodeSend(i, size, msg); +#endif +#else + + for (int i = node + 1; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); + + for (int i = 0; i < node; i++) + CmiSyncNodeSend(i, size, msg); +#endif +} + +void CmiSyncNodeBroadcastAndFree(unsigned int size, void *msg) { + CmiSyncNodeBroadcast(size, msg); + CmiFree(msg); +} + +void CmiSyncNodeBroadcastAll(unsigned int size, void *msg) { + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + CmiSetBcastSource(msg, -1); // don't skip the source + header->swapHandlerId = header->handlerId; + header->handlerId = Cmi_nodeBcastHandler; + CmiSyncNodeSend(0, size, msg); +#else + + for (int i = 0; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); +#endif +#else + + for (int i = 0; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); +#endif +} + +void CmiSyncNodeBroadcastAllAndFree(unsigned int size, void *msg) { + CmiSyncNodeBroadcastAll(size, msg); + CmiFree(msg); +} + +/* Handler for broadcast via the spanning tree. */ +void CmiBcastHandler(void *msg) { + int mype = CmiMyPe(); + int numChildren = CmiNumSpanTreeChildren(mype); + int children[numChildren]; + CmiSpanTreeChildren(mype, children); + + CmiMessageHeader *header = static_cast(msg); + + // send broadcast to all children + for (int i = 0; i < numChildren; i++) { + CmiSyncSend(children[i], header->messageSize, msg); } - - /* Handler for node broadcast via the spanning tree. */ - void CmiNodeBcastHandler(void *msg) { - int mynode = CmiMyNode(); - int numChildren = CmiNumNodeSpanTreeChildren(mynode); - int children[numChildren]; - CmiNodeSpanTreeChildren(mynode, children); - - CmiMessageHeader *header = static_cast(msg); - - // send broadcast to node children - for (int i = 0; i < numChildren; i++) { - CmiSyncNodeSend(children[i], header->messageSize, msg); - } - - if (CmiGetBcastSource(msg) != mynode) { - CmiCallHandler(header->swapHandlerId, msg); - } + + // call handler locally (unless I am source of broadcast, and bcast is + // exclusive) + if (CmiGetBcastSource(msg) != mype) { + CmiCallHandler(header->swapHandlerId, msg); } - - void CmiSetBcastSource(void *msg, CmiBroadcastSource source) { - CmiMessageHeader *header = static_cast(msg); - header->collectiveMetaInfo = source; +} + +/* Handler for node broadcast via the spanning tree. 
*/ +void CmiNodeBcastHandler(void *msg) { + int mynode = CmiMyNode(); + int numChildren = CmiNumNodeSpanTreeChildren(mynode); + int children[numChildren]; + CmiNodeSpanTreeChildren(mynode, children); + + CmiMessageHeader *header = static_cast(msg); + + // send broadcast to node children + for (int i = 0; i < numChildren; i++) { + CmiSyncNodeSend(children[i], header->messageSize, msg); } - - CmiBroadcastSource CmiGetBcastSource(void *msg) { - CmiMessageHeader *header = static_cast(msg); - return header->collectiveMetaInfo; + + if (CmiGetBcastSource(msg) != mynode) { + CmiCallHandler(header->swapHandlerId, msg); } +} + +void CmiSetBcastSource(void *msg, CmiBroadcastSource source) { + CmiMessageHeader *header = static_cast(msg); + header->collectiveMetaInfo = source; +} + +CmiBroadcastSource CmiGetBcastSource(void *msg) { + CmiMessageHeader *header = static_cast(msg); + return header->collectiveMetaInfo; +} /************* Reductions ***************/ @@ -239,7 +239,6 @@ void CmiReductionsInit(void) { // node reduction must be initialized with a valid lock nodered.lock = CmiCreateLock(); // in non-smp this would just be a nullptr - } CsvAccess(_node_reduction_info) = noderedinfo; CsvAccess(_node_reduction_counter) = 0; @@ -281,7 +280,6 @@ static inline CmiReductionID getNextID(std::atomic &ctr) { return old; } - unsigned CmiGetReductionIndex(CmiReductionID id) { // treating the id as the index into the reduction table // utilized in getCreateReduction and clearReduction to find the reduction @@ -292,7 +290,6 @@ unsigned CmiGetReductionIndex(CmiReductionID id) { return id; } - // PROCESS REDUCTIONS static void CmiClearReduction(CmiReductionID id) { auto &reduction_ref = CpvAccess(_reduction_info)[CmiGetReductionIndex(id)]; @@ -310,9 +307,7 @@ CmiReductionID CmiGetNextReductionID() { return getNextID(CpvAccess(_reduction_counter)); } -void CmiResetGlobalReduceSeqID(void) { - CpvAccess(_reduction_counter) = 0; -} +void CmiResetGlobalReduceSeqID(void) { CpvAccess(_reduction_counter) = 0; } static CmiReduction *CmiGetCreateReduction(CmiReductionID id) { // should handle the 2 cases: @@ -437,7 +432,6 @@ void CmiNodeReduce(void *msg, int size, CmiReduceMergeFn mergeFn) { CmiReduction *red = CmiGetCreateNodeReduction(id); CmiInternalNodeReduce(msg, size, mergeFn, red); - CmiUnlock(nodeRed.lock); } @@ -445,9 +439,7 @@ CmiReductionID CmiGetNextNodeReductionID() { return getNextID(CsvAccess(_node_reduction_counter)); } -void CmiResetGlobalNodeReduceSeqID(){ - CsvAccess(_node_reduction_counter) = 0; -} +void CmiResetGlobalNodeReduceSeqID() { CsvAccess(_node_reduction_counter) = 0; } static CmiReduction *CmiGetCreateNodeReduction(CmiReductionID id) { // should handle the 2 cases: @@ -544,9 +536,7 @@ void CmiNodeReduceHandler(void *msg) { reduction->messagesReceived++; CmiSendNodeReduce(reduction); - CmiUnlock(nodeRed.lock); - } /************* Groups ***************/ diff --git a/src/comm_backend/comm_backend.h b/src/comm_backend/comm_backend.h index d9939be..70562ff 100644 --- a/src/comm_backend/comm_backend.h +++ b/src/comm_backend/comm_backend.h @@ -57,12 +57,14 @@ void issueAm(int rank, const void *local_buf, size_t size, mr_t mr, * @brief Issue a remote get operation. Thread-safe. */ void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context); + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context); /** * @brief Issue a remote put operation. Thread-safe. 
*/ void issueRput(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context); + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context); /** * @brief Make progress on the communication backend. Thread-safe. */ diff --git a/src/comm_backend/comm_backend_internal.cpp b/src/comm_backend/comm_backend_internal.cpp index 0cd14a6..43cc460 100644 --- a/src/comm_backend/comm_backend_internal.cpp +++ b/src/comm_backend/comm_backend_internal.cpp @@ -89,16 +89,18 @@ AmHandler registerAmHandler(CompHandler handler) { return gCommBackend->registerAmHandler(handler); } -void issueAm(int rank, const void *msg, size_t size, mr_t mr, CompHandler localComp, - AmHandler remoteComp, void *user_context) { +void issueAm(int rank, const void *msg, size_t size, mr_t mr, + CompHandler localComp, AmHandler remoteComp, void *user_context) { if (gCommBackend == nullptr) { return; } - gCommBackend->issueAm(rank, msg, size, mr, localComp, remoteComp, user_context); + gCommBackend->issueAm(rank, msg, size, mr, localComp, remoteComp, + user_context); } void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context) { + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) { if (gCommBackend == nullptr) { return; } @@ -107,7 +109,8 @@ void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, } void issueRput(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context) { + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) { if (gCommBackend == nullptr) { return; } diff --git a/src/comm_backend/comm_backend_internal.h b/src/comm_backend/comm_backend_internal.h index 8df6cbf..0cfde8d 100644 --- a/src/comm_backend/comm_backend_internal.h +++ b/src/comm_backend/comm_backend_internal.h @@ -20,7 +20,8 @@ class CommBackendBase { virtual bool isRMACapable() { return false; } virtual AmHandler registerAmHandler(CompHandler handler) = 0; virtual void issueAm(int rank, const void *local_buf, size_t size, mr_t mr, - CompHandler localComp, AmHandler remoteComp, void *user_context) = 0; + CompHandler localComp, AmHandler remoteComp, + void *user_context) = 0; virtual void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context) { @@ -39,7 +40,7 @@ class CommBackendBase { virtual mr_t registerMemory(void *addr, size_t size) { return MR_NULL; } virtual size_t getRMR(mr_t mr, void *addr, size_t size) { return 0; } virtual void deregisterMemory(mr_t mr) {} - virtual ~CommBackendBase() {}; + virtual ~CommBackendBase() = default; }; } // namespace comm_backend diff --git a/src/comm_backend/lci2/comm_backend_lci2.cpp b/src/comm_backend/lci2/comm_backend_lci2.cpp index 0349d1c..5de5be0 100644 --- a/src/comm_backend/lci2/comm_backend_lci2.cpp +++ b/src/comm_backend/lci2/comm_backend_lci2.cpp @@ -87,8 +87,9 @@ AmHandler CommBackendLCI2::registerAmHandler(CompHandler handler) { return g_handlers.size() - 1; } -void CommBackendLCI2::issueAm(int rank, const void *local_buf, size_t size, mr_t mr, - CompHandler localComp, AmHandler remoteComp, void *user_context) { +void CommBackendLCI2::issueAm(int rank, const void *local_buf, size_t size, + mr_t mr, CompHandler localComp, + AmHandler remoteComp, void *user_context) { auto 
args = new localCallbackArgs{localComp, user_context}; lci::status_t status; do { diff --git a/src/comm_backend/lci2/comm_backend_lci2.h b/src/comm_backend/lci2/comm_backend_lci2.h index e1149ca..051afdf 100644 --- a/src/comm_backend/lci2/comm_backend_lci2.h +++ b/src/comm_backend/lci2/comm_backend_lci2.h @@ -24,13 +24,14 @@ class CommBackendLCI2 : public CommBackendBase { bool isRMACapable() override { return true; } AmHandler registerAmHandler(CompHandler handler) override; void issueAm(int rank, const void *local_buf, size_t size, mr_t mr, - CompHandler localComp, AmHandler remoteComp, void *user_context) override; + CompHandler localComp, AmHandler remoteComp, + void *user_context) override; void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, - CompHandler localComp, void *user_context) override; + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) override; void issueRput(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, - CompHandler localComp, void *user_context) override; + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) override; bool progress(void) override; void barrier(void) override; mr_t registerMemory(void *addr, size_t size) override; diff --git a/src/concurrentqueue.h b/src/concurrentqueue.h index db4835b..593680b 100644 --- a/src/concurrentqueue.h +++ b/src/concurrentqueue.h @@ -1,5 +1,5 @@ -// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. -// An overview, including benchmark results, is provided here: +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free +// queue. An overview, including benchmark results, is provided here: // http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ // The full design is also described in excruciating detail at: // http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue @@ -8,24 +8,26 @@ // Copyright (c) 2013-2020, Cameron Desrochers. // All rights reserved. // -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: // -// - Redistributions of source code must retain the above copyright notice, this list of -// conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this list of -// conditions and the following disclaimer in the documentation and/or other materials -// provided with the distribution. +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. // -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR -// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. // Also dual-licensed under the Boost Software License (see LICENSE.md) @@ -33,8 +35,8 @@ #if defined(__GNUC__) && !defined(__INTEL_COMPILER) // Disable -Wconversion warnings (spuriously triggered when Traits::size_t and -// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings -// upon assigning any computed values) +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings upon assigning any computed values) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" @@ -44,10 +46,11 @@ #endif #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) -// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher -// does not support `if constexpr`, so we have no choice but to simply disable the warning +// VS2019 with /W4 warns about constant conditional expressions but unless +// /std=c++17 or higher does not support `if constexpr`, so we have no choice +// but to simply disable the warning #pragma warning(push) -#pragma warning(disable: 4127) // conditional expression is constant +#pragma warning(disable : 4127) // conditional expression is constant #endif #if defined(__APPLE__) @@ -57,92 +60,117 @@ #ifdef MCDBGQ_USE_RELACY #include "relacy/relacy_std.hpp" #include "relacy_shims.h" -// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. -// We'll override the default trait malloc ourselves without a macro. +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. We'll override the default trait malloc ourselves +// without a macro. #undef new #undef delete #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. 
#include #endif -#include // for max_align_t +#include +#include +#include // for CHAR_BIT +#include // for max_align_t #include #include +#include +#include // used for thread exit synchronization +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include -#include #include -#include -#include // for CHAR_BIT -#include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading -#include // used for thread exit synchronization - -// Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { namespace details { - template struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } - }; -} } + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel { +namespace details { +template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const &x) { return x; } +}; +} // namespace details +} // namespace moodycamel #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { namespace details { - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; - static inline thread_id_t thread_id() { return rl::thread_index(); } -} } +namespace moodycamel { +namespace details { +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() { return rl::thread_index(); } +} // namespace details +} // namespace moodycamel #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) -// No sense pulling in windows.h in a header, we'll manually declare the function -// we use and rely on backwards-compatibility for this not to break -extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { namespace details { - static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. - static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } -} } -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) -namespace moodycamel { namespace details { - static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); - - typedef std::thread::id thread_id_t; - static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - - // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's - // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't - // be. 
- static inline thread_id_t thread_id() { return std::this_thread::get_id(); } - - template struct thread_id_size { }; - template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; - template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; - - template<> struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +// No sense pulling in windows.h in a header, we'll manually declare the +// function we use and rely on backwards-compatibility for this not to break +extern "C" + __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { +namespace details { +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. +static inline thread_id_t thread_id() { + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || \ + defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { +namespace details { +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined +// anyway, which it won't be. 
+static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + +template struct thread_id_size {}; +template <> struct thread_id_size<4> { + typedef std::uint32_t numeric_t; +}; +template <> struct thread_id_size<8> { + typedef std::uint64_t numeric_t; +}; + +template <> struct thread_id_converter { + typedef thread_id_size::numeric_t + thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash(thread_id_t const &x) { #ifndef __APPLE__ - return std::hash()(x); + return std::hash()(x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); #endif - } - }; -} } + } +}; +} +} #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 -// In order to get a numeric thread ID in a platform-independent way, we use a thread-local -// static variable's address as a thread identifier :-) +// In order to get a numeric thread ID in a platform-independent way, we use a +// thread-local static variable's address as a thread identifier :-) #if defined(__GNUC__) || defined(__INTEL_COMPILER) #define MOODYCAMEL_THREADLOCAL __thread #elif defined(_MSC_VER) @@ -151,17 +179,25 @@ namespace moodycamel { namespace details { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { namespace details { - typedef std::uintptr_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr - static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. - inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } -} } +namespace moodycamel { +namespace details { +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus + // it's not aligned. +inline thread_id_t thread_id() { + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} #endif // Constexpr if #ifndef MOODYCAMEL_CONSTEXPR_IF -#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ + __cplusplus > 201402L #define MOODYCAMEL_CONSTEXPR_IF if constexpr #define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] #else @@ -172,18 +208,20 @@ namespace moodycamel { namespace details { // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ + (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED #define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) #define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw (expr) +#define MOODYCAMEL_THROW(expr) throw(expr) #else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) -#define MOODYCAMEL_CATCH(...) 
else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -194,15 +232,40 @@ namespace moodycamel { namespace details { #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 -// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( -// We have to assume *all* non-trivial constructors may throw on VS2012! +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when +// it shouldn't :-( We have to assume *all* non-trivial constructors may throw +// on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value \ + : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value || \ + std::is_nothrow_move_constructible::value \ + : std::is_trivially_copy_constructible::value || \ + std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? 
std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else #define MOODYCAMEL_NOEXCEPT noexcept #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) @@ -214,18 +277,31 @@ namespace moodycamel { namespace details { #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 -// g++ <=4.7 doesn't support thread_local either. -// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) -// Assume `thread_local` is fully supported in all other C++11 compilers/platforms -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a +// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't +// support thread_local either. Finally, iOS/ARM doesn't have support for it +// either, and g++/ARM allows it to compile but it's unconfirmed to actually +// work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || \ + !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ + !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +// Assume `thread_local` is fully supported in all other C++11 +// compilers/platforms +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; + // years ago several users + // report having problems with + // it on #endif #endif #endif -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link +// error will be generated if the function is called. 
#ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION @@ -234,54 +310,82 @@ namespace moodycamel { namespace details { #endif #endif -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #ifndef MOODYCAMEL_ALIGNAS -// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +// VS2013 doesn't support alignas or alignof, and align() requires a constant +// literal #if defined(_MSC_VER) && _MSC_VER <= 1800 #define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) #define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type - template struct Vs2013Aligned { }; // default, unsupported alignment - template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; - template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; - template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; - template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; - template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; - template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; - template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; - template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; - template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + typename details::Vs2013Aligned::value, T>::type +template +struct Vs2013Aligned {}; // default, unsupported alignment +template struct Vs2013Aligned<1, T> { + typedef __declspec(align(1)) T type; +}; +template struct Vs2013Aligned<2, T> { + typedef __declspec(align(2)) T type; +}; +template struct Vs2013Aligned<4, T> { + typedef __declspec(align(4)) T type; +}; +template struct Vs2013Aligned<8, T> { + typedef __declspec(align(8)) T type; +}; +template struct Vs2013Aligned<16, T> { + typedef __declspec(align(16)) T type; +}; +template struct Vs2013Aligned<32, T> { + typedef __declspec(align(32)) T type; +}; +template struct Vs2013Aligned<64, T> { + typedef __declspec(align(64)) T type; +}; +template struct Vs2013Aligned<128, T> { + typedef __declspec(align(128)) T type; +}; +template struct Vs2013Aligned<256, T> { + typedef __declspec(align(256)) T type; +}; #else - template struct identity { typedef T type; }; +template struct identity { + typedef T type; +}; #define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) #define MOODYCAMEL_ALIGNOF(obj) alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + alignas(alignof(obj)) typename details::identity::type #endif #endif -} } +} // namespace details +} // namespace moodycamel - -// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, -// we can apply per-function compile-time suppression. -// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +// TSAN can false report races in lock-free code. To enable TSAN to be used +// from projects that use this one, we can apply per-function compile-time +// suppression. 
See +// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) - #if __has_feature(thread_sanitizer) - #undef MOODYCAMEL_NO_TSAN - #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) - #endif // TSAN +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN #endif // TSAN // Compiler-specific likely/unlikely hints -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #if defined(__GNUC__) - static inline bool (likely)(bool x) { return __builtin_expect((x), true); } - static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +static inline bool(likely)(bool x) { return __builtin_expect((x), true); } +static inline bool(unlikely)(bool x) { return __builtin_expect((x), false); } #else - static inline bool (likely)(bool x) { return x; } - static inline bool (unlikely)(bool x) { return x; } +static inline bool(likely)(bool x) { return x; } +static inline bool(unlikely)(bool x) { return x; } #endif -} } +} // namespace details +} // namespace moodycamel #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" @@ -289,28 +393,33 @@ namespace moodycamel { namespace details { namespace moodycamel { namespace details { - template - struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed - ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) - : static_cast(-1); - }; +template struct const_numeric_max { + static_assert(std::is_integral::value, + "const_numeric_max can only be used with integers"); + static const T value = + std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - + static_cast(1) + : static_cast(-1); +}; #if defined(__GLIBCXX__) - typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +typedef ::max_align_t + std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else - typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can + // *only* be accessed via std:: #endif - // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting - // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. - typedef union { - std_max_align_t x; - long long y; - void* z; - } max_align_t; -} +// Some platforms have incorrectly set max_align_t to a type with <8 bytes +// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit +// iOS). Work around this with our own union. See issue #64. +typedef union { + std_max_align_t x; + long long y; + void *z; +} max_align_t; +} // namespace details // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this @@ -318,99 +427,103 @@ namespace details { // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. -struct ConcurrentQueueDefaultTraits -{ - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. 
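// Editor's illustration (sketch only, not part of the patched file): what
// details::const_numeric_max evaluates to for two concrete types. Assumes the
// header is reachable as "concurrentqueue.h".
#include "concurrentqueue.h"
#include <cstdint>

static_assert(moodycamel::details::const_numeric_max<std::uint16_t>::value ==
                  0xFFFFu,
              "unsigned: all bits set, i.e. static_cast<T>(-1)");
static_assert(moodycamel::details::const_numeric_max<std::int16_t>::value ==
                  32767,
              "signed: (1 << (N - 1)) - 1");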
Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try 0-100). - // Only affects instances of the BlockingConcurrentQueue. - static const int MAX_SEMA_SPINS = 10000; - - // Whether to recycle dynamically-allocated blocks into an internal free list or - // not. 
If false, only pre-allocated blocks (controlled by the constructor - // arguments) will be recycled, and all others will be `free`d back to the heap. - // Note that blocks consumed by explicit producers are only freed on destruction - // of the queue (not following destruction of the token) regardless of this trait. - static const bool RECYCLE_ALLOCATED_BLOCKS = false; - - +struct ConcurrentQueueDefaultTraits { + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few + // producers and/or many elements, a larger block size is preferred. A sane + // default is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per + // element. For large block sizes, this is too inefficient, and switching to + // an atomic counter-based approach is faster. The switch is made for block + // sizes strictly larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit + // producers. Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit + // production (using the enqueue methods without an explicit producer token) + // is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a + // token) must consume before it causes all consumers to rotate and move on to + // the next internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + 256; + + // The maximum number of elements (inclusive) that can be enqueued to a + // sub-queue. Enqueue operations that would cause this limit to be surpassed + // will fail. Note that this limit is enforced at the block level (for + // performance reasons), i.e. it's rounded up to the nearest block size. 
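// Editor's illustration (sketch only, not part of the patched file): the
// intended way to change any of the knobs documented above is to inherit from
// ConcurrentQueueDefaultTraits, override only what you need, and pass the
// result as the queue's second template parameter. The values chosen here are
// arbitrary examples; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"
#include <cstdint>

struct BigBlockTraits : public moodycamel::ConcurrentQueueDefaultTraits {
  static const size_t BLOCK_SIZE = 256; // must remain a power of 2
  typedef std::uint64_t index_t;        // wide indices avoid wrap-around races
};

moodycamel::ConcurrentQueue<int, BigBlockTraits> tuned_queue;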
+ static const size_t MAX_SUBQUEUE_SIZE = + details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try + // 0-100). Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list + // or not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the + // heap. Note that blocks consumed by explicit producers are only freed on + // destruction of the queue (not following destruction of the token) + // regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like + // std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void *WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void *ptr) { return free(ptr); } + static inline void *(malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void(free)(void *ptr) { return WORKAROUND_free(ptr); } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void *malloc(size_t size) { return std::malloc(size); } + static inline void free(void *ptr) { return std::free(ptr); } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void *malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void *ptr) { return rl::rl_free(ptr, $); } #endif }; - // When producing or consuming many elements, the most efficient way is to: // 1) Use one of the bulk-operation methods of the queue with a token // 2) Failing that, use the bulk-operation methods without a token @@ -421,3322 +534,3729 @@ struct ConcurrentQueueDefaultTraits struct ProducerToken; struct ConsumerToken; -template class ConcurrentQueue; -template class BlockingConcurrentQueue; +template class ConcurrentQueue; +template class BlockingConcurrentQueue; class ConcurrentQueueTests; +namespace details { +struct ConcurrentQueueProducerTypelessBase { + ConcurrentQueueProducerTypelessBase *next; + std::atomic inactive; + ProducerToken *token; -namespace 
details -{ - struct ConcurrentQueueProducerTypelessBase - { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase() - : next(nullptr), inactive(false), token(nullptr) - { - } - }; - - template struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } - }; - template<> struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } - }; - template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; - - static inline size_t hash_thread_id(thread_id_t id) - { - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); - } - - template - static inline bool circular_less_than(T a, T b) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); - // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 - // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. 
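// Editor's illustration (sketch only, not part of the patched file): what the
// circular comparison used for queue indices means in practice. The helper
// below fixes the index type to 8 bits purely so the numbers are easy to
// follow; it applies the same formula as details::circular_less_than.
#include <cstdint>

static inline bool circ_lt_u8(std::uint8_t a, std::uint8_t b) {
  return static_cast<std::uint8_t>(a - b) > static_cast<std::uint8_t>(1u << 7);
}
// circ_lt_u8(250, 2) == true: index 2 has wrapped around and is only 8 steps
// ahead of 250 on the ring, so 250 counts as "older" (less).
// circ_lt_u8(2, 250) == false for the same reason.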
- } - - template - static inline char* align_for(char* ptr) - { - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; - } - - template - static inline T ceil_to_pow_2(T x) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; - } - - template - static inline void swap_relaxed(std::atomic& left, std::atomic& right) - { - T temp = left.load(std::memory_order_relaxed); - left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); - right.store(temp, std::memory_order_relaxed); - } - - template - static inline T const& nomove(T const& x) - { - return x; - } - - template - struct nomove_if - { - template - static inline T const& eval(T const& x) - { - return x; - } - }; - - template<> - struct nomove_if - { - template - static inline auto eval(U&& x) - -> decltype(std::forward(x)) - { - return std::forward(x); - } - }; - - template - static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) - { - return *it; - } - -#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) - template struct is_trivially_destructible : std::is_trivially_destructible { }; + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) {} +}; + +template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) { + // MurmurHash3 finalizer -- see + // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is + // propagate that uniqueness evenly across all the bits, so that we can use + // a subset of the bits while reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {}; + +static inline size_t hash_thread_id(thread_id_t id) { + static_assert( + sizeof(thread_id_t) <= 8, + "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast( + hash_32_or_64::thread_id_hash_t)>:: + hash(thread_id_converter::prehash(id))); +} + +template static inline bool circular_less_than(T a, T b) { + static_assert(std::is_integral::value && + !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned " + "integer types"); + return static_cast(a - b) > + static_cast(static_cast(1) + << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: + // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the + // calling code and has no effect when done here. 
+} + +template static inline char *align_for(char *ptr) { + const std::size_t alignment = std::alignment_of::value; + return ptr + + (alignment - (reinterpret_cast(ptr) % alignment)) % + alignment; +} + +template static inline T ceil_to_pow_2(T x) { + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic &left, std::atomic &right) { + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); +} + +template static inline T const &nomove(T const &x) { return x; } + +template struct nomove_if { + template static inline T const &eval(T const &x) { return x; } +}; + +template <> struct nomove_if { + template + static inline auto eval(U &&x) -> decltype(std::forward(x)) { + return std::forward(x); + } +}; + +template +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { + return *it; +} + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +template +struct is_trivially_destructible : std::is_trivially_destructible {}; #else - template struct is_trivially_destructible : std::has_trivial_destructor { }; +template +struct is_trivially_destructible : std::has_trivial_destructor {}; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY - typedef RelacyThreadExitListener ThreadExitListener; - typedef RelacyThreadExitNotifier ThreadExitNotifier; +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; #else - class ThreadExitNotifier; - - struct ThreadExitListener - { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier - }; - - class ThreadExitNotifier - { - public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - std::lock_guard guard(mutex()); - listener->next = tlsInst.tail; - listener->chain = &tlsInst; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - std::lock_guard guard(mutex()); - if (!listener->chain) { - return; // race with ~ThreadExitNotifier - } - auto& tlsInst = *listener->chain; - listener->chain = nullptr; - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - - private: - ThreadExitNotifier() : tail(nullptr) { } - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - std::lock_guard guard(mutex()); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->chain = nullptr; - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - static inline std::mutex& mutex() - { - // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called - static std::mutex mutex; - return mutex; - } - - private: - ThreadExitListener* tail; - }; -#endif -#endif - - template struct static_is_lock_free_num { enum { value = 0 }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; - template struct static_is_lock_free : static_is_lock_free_num::type> { }; - template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; - template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; -} +class ThreadExitNotifier; + +struct ThreadExitListener { + typedef void (*callback_t)(void *); + callback_t callback; + void *userData; + ThreadExitListener *next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier *chain; // reserved for use by the ThreadExitNotifier +}; + +class ThreadExitNotifier { +public: + static void subscribe(ThreadExitListener *listener) { + auto &tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener *listener) { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto &tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener **prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } -struct ProducerToken -{ - template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - : producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
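// Editor's illustration (sketch only, not part of the patched file): typical
// token usage. A token is created once per thread against a specific queue and
// then passed to the enqueue/dequeue calls. Function and variable names are
// illustrative; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"

inline void token_usage_sketch() {
  moodycamel::ConcurrentQueue<int> q;
  moodycamel::ProducerToken ptok(q); // fast path for this producer thread
  moodycamel::ConsumerToken ctok(q); // fast path for this consumer thread
  q.enqueue(ptok, 42);
  int item;
  if (q.try_dequeue(ctok, item)) {
    // item == 42 here
  }
}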
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - + ThreadExitNotifier() : tail(nullptr) {} + ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier & + operator=(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() { + // This thread is about to exit, let everyone know! + assert(this == &instance() && + "If this assert fails, you likely have a buggy compiler! Change the " + "preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier &instance() { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex &mutex() { + // Must be static because the ThreadExitNotifier could be destroyed while + // unsubscribe is called + static std::mutex mutex; + return mutex; + } + +private: + ThreadExitListener *tail; +}; +#endif +#endif + +template struct static_is_lock_free_num { + enum { value = 0 }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_CHAR_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_SHORT_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_INT_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_LONG_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_LLONG_LOCK_FREE }; +}; +template +struct static_is_lock_free + : static_is_lock_free_num::type> {}; +template <> struct static_is_lock_free { + enum { value = ATOMIC_BOOL_LOCK_FREE }; +}; +template struct static_is_lock_free { + enum { value = ATOMIC_POINTER_LOCK_FREE }; +}; +} // namespace details + +struct ProducerToken { + template + explicit ProducerToken(ConcurrentQueue &queue); + + template + explicit ProducerToken(BlockingConcurrentQueue &queue); + + ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + void swap(ProducerToken &other) MOODYCAMEL_NOEXCEPT { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + protected: - details::ConcurrentQueueProducerTypelessBase* producer; + details::ConcurrentQueueProducerTypelessBase *producer; }; +struct ConsumerToken { + template + explicit ConsumerToken(ConcurrentQueue &q); + + template + explicit ConsumerToken(BlockingConcurrentQueue &q); + + ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) {} + + inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } -struct ConsumerToken -{ - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase *currentProducer; + details::ConcurrentQueueProducerTypelessBase *desiredProducer; }; // Need to forward-declare this swap 
because it's in a namespace. -// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; +// See +// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT; - -template -class ConcurrentQueue -{ +template +class ConcurrentQueue { public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + static_cast( + Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) -#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#pragma warning(disable : 4307) // + integral constant overflow (that's what the + // ternary expression is for!) +#pragma warning(disable : 4309) // static_cast: Truncation of constant value #endif - static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); + static const size_t MAX_SUBQUEUE_SIZE = + (details::const_numeric_max::value - + static_cast(Traits::MAX_SUBQUEUE_SIZE) < + BLOCK_SIZE) + ? 
details::const_numeric_max::value + : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + + (BLOCK_SIZE - 1)) / + BLOCK_SIZE * BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); - static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); - static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), + "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), + "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && + !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & + (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), + "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " + "power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && + !(EXPLICIT_INITIAL_INDEX_SIZE & + (EXPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && + !(IMPLICIT_INITIAL_INDEX_SIZE & + (IMPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert( + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || + !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || + INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least " + "1 (or 0 to disable implicit enqueueing)"); public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of elements that can be inserted without additional 
memory - // allocation depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated up-front, which means only a single producer will be able to + // enqueue elements without an extra allocation -- blocks aren't shared + // between producers). This method is not thread safe -- it is up to the user + // to ensure that the queue is fully constructed before it starts being used + // by other threads (this includes making the memory effects of construction + // visible, possibly with a memory barrier). + explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), + nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. 
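// Editor's illustration (sketch only, not part of the patched file): the two
// ways of sizing a queue up front that the comments above describe. Numbers
// are arbitrary; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"

// At least 1024 element slots, block pool derived from the default BLOCK_SIZE:
moodycamel::ConcurrentQueue<int> capacity_queue(1024);

// Enough pre-allocated blocks for 10000 elements shared by up to 4 explicit
// (token-using) producers and 8 implicit producers:
moodycamel::ConcurrentQueue<int> producer_sized_queue(10000, 4, 8);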
- ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), + nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. - ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. 
- // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was + // not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue & + operator=(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - { - return swap_internal(other); - } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT - { - swap_internal(other); - } - + explicitProducers.store( + other.explicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store( + other.implicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue & + operator=(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). 
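// Editor's illustration (sketch only, not part of the patched file): what
// "tokens move with the queue" means for the move constructor described above.
// Names are illustrative; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"
#include <utility>

inline void move_semantics_sketch() {
  moodycamel::ConcurrentQueue<int> a;
  moodycamel::ProducerToken tok(a);
  a.enqueue(tok, 1);

  // Not thread-safe: no other thread may touch `a` while it is being moved.
  moodycamel::ConcurrentQueue<int> b(std::move(a));
  b.enqueue(tok, 2); // tok is now tied to `b`, not to the moved-from `a`

  int x;
  while (b.try_dequeue(x)) {
    // drains 1 and 2
  }
}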
+ inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT { + swap_internal(other); + } + private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - + ConcurrentQueue &swap_internal(ConcurrentQueue &other) { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, + other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); #endif - - return *this; - } - + + return *this; + } + public: - // Enqueues a single item (by copying it). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. 
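// Editor's illustration (sketch only, not part of the patched file): the
// allocation contract spelled out in the comments above. Names and numbers are
// illustrative; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"

inline void enqueue_contract_sketch() {
  moodycamel::ConcurrentQueue<int> q(64); // at least 64 slots pre-allocated
  bool ok = q.enqueue(7);      // may allocate a new block; fails only on OOM
  bool fit = q.try_enqueue(8); // does not allocate new blocks; fails when full
  (void)ok;
  (void)fit;
}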
- inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. 
- template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. - size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. - // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less - // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place - // If there's no items where you're supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it - - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
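The enqueue/try_enqueue overloads and the try_dequeue family in this hunk are only being reformatted; their semantics are unchanged. A minimal sketch of the common calls, with the moodycamel namespace and ProducerToken construction assumed from the upstream API:

void basic_usage() {
  moodycamel::ConcurrentQueue<int> q;

  q.enqueue(1);                  // may allocate; fails only if allocation fails
  q.try_enqueue(2);              // never allocates (beyond the one-time implicit producer)

  moodycamel::ProducerToken ptok(q);
  q.enqueue(ptok, 3);            // explicit-producer path, no implicit-producer lookup

  int v;
  while (q.try_dequeue(v)) {
    // consume v; try_dequeue returns false once every producer stream looked empty
  }
}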
- static constexpr bool is_lock_free() - { - return - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; - } + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Allocates memory if required. Only fails if memory + // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would + // be surpassed). Thread-safe. + inline bool enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: + // Use std::make_move_iterator if the elements should be moved instead of + // copied. Thread-safe. + template bool enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. 
+ inline bool try_enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Does not allocate memory. Fails if not enough room to + // enqueue. Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template bool try_enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template bool try_dequeue(U &item) { + // Instead of simply trying each producer in turn (which could cause + // needless contention on the first producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase *best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. 
+ // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall + // throughput under contention, but will give more predictable results in + // single-threaded consumer scenarios. This is mostly only useful for internal + // unit tests. Never allocates. Thread-safe. + template bool try_dequeue_non_interleaved(U &item) { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template bool try_dequeue(consumer_token_t &token, U &item) { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the + // global offset) -> this means the highest efficiency consumer dictates the + // rotation speed of everyone else, more or less If you see that the global + // offset has changed, you must reset your consumption counter and move to + // your designated place If there's no items where you're supposed to be, + // keep moving until you find a producer with some items If the global + // offset has not changed but you've run out of items to consume, move over + // from your current position until you find an producer with something in + // it + + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
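The token-based try_dequeue above pins a consumer to one producer and rotates it after EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE items, as its comment describes. A dedicated-consumer loop would look roughly like this (sketch; ConsumerToken construction is assumed from the upstream API):

void drain(moodycamel::ConcurrentQueue<int> &q) {
  moodycamel::ConsumerToken ctok(q);  // tracks desiredProducer and lastKnownGlobalOffset
  int v;
  while (q.try_dequeue(ctok, v)) {
    // consume v; the token stays on one producer until the rotation quota is hit
  }
}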
+ template size_t try_dequeue_bulk(It itemFirst, size_t max) { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. Returns the number of items actually dequeued. Returns 0 if + // all producer streams appeared empty at the time they were checked (so, the + // queue is likely but not guaranteed to be empty). Never allocates. + // Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const &producer, + U &item) { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. Returns the number of items actually dequeued. If you happen to know + // which producer you want to dequeue from, this is significantly faster than + // using the general-case try_dequeue methods. Returns 0 if the producer's + // queue appeared empty at the time it was checked (so, the queue is likely + // but not guaranteed to be empty). Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const &producer, + It itemFirst, size_t max) { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the queue. + // This estimate is only accurate if the queue has completely stabilized + // before it is called (i.e. all enqueue and dequeue operations have completed + // and their memory effects are visible on the calling thread, and no further + // operations start while this method is being called). Thread-safe. 
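The bulk variants above take an iterator plus a count, and the *_from_producer variants take that producer's own token. A short sketch, under the same assumptions as the earlier examples:

void bulk_usage(moodycamel::ConcurrentQueue<int> &q,
                moodycamel::ProducerToken &ptok) {
  int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  q.enqueue_bulk(in, 8);                  // copies; wrap with std::make_move_iterator to move

  int out[8];
  size_t n = q.try_dequeue_bulk(out, 8);  // number actually dequeued, possibly fewer than 8
  size_t m = q.try_dequeue_bulk_from_producer(ptok, out, 8);  // single-producer fast path

  // size_approx() is exact only once all operations have quiesced
  size_t approx = q.size_approx();
  (void)n; (void)m; (void)approx;
}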
+ size_t size_approx() const { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static constexpr bool is_lock_free() { + return details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode { CanAlloc, CannotAlloc }; - - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue(producer_token_t const& token, U&& element) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue(U&& element) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); - } - - template - inline bool inner_enqueue_bulk(It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); - } - - inline bool update_current_producer_after_rotation(consumer_token_t& token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load(std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) { - return false; - } - auto prodCount = producerCount.load(std::memory_order_relaxed); - auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); - if ((details::unlikely)(token.desiredProducer == nullptr)) { - // Aha, first time we're dequeueing anything. 
- // Figure out our local position - // Note: offset is from start, not end, but we're traversing from end -- subtract from count first - std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - - /////////////////////////// - // Free list - /////////////////////////// - - template - struct FreeListNode - { - FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList - { - FreeList() : freeListHead(nullptr) { } - FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const &token, U &&element) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue( + std::forward(element)); + } + + template + inline bool inner_enqueue(U &&element) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer::template enqueue< + canAlloc>(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk( + itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t &token) { + // Ah, there's been a rotation, figure out where we should be! 
+ auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = + globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- + // subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + /////////////////////////// + // Free list + /////////////////////////// + + template struct FreeListNode { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) {} + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world + // under heavy contention, but simple and correct (assuming nodes are never + // freed until after the free list is destroyed), and fairly speedy under low + // contention. + template // N must inherit FreeListNode or have the same fields + // (and initialization of them) + struct FreeList { + FreeList() : freeListHead(nullptr) {} + FreeList(FreeList &&other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList &other) { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N *node) { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's + // safe to set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
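// Illustration only, not part of the header: freeListRefs packs a 31-bit
// reference count (REFS_MASK = 0x7FFFFFFF) together with the high
// SHOULD_BE_ON_FREELIST bit (0x80000000), so the fetch_add above returning 0
// means no outstanding references remained and the node can be linked in.
static_assert((0x80000000u & 0x7FFFFFFFu) == 0,
              "flag bit lies outside the refcount mask");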
+ add_knowing_refcount_is_zero(node); + } + } + + inline N *try_get() { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
- auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which + // means we can read the next and not worry about it changing between + // now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means + // shouldBeOnFreeList must be false no matter the refcount (because + // nobody else knows it's been taken off yet, it can't have been put + // back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's + // ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease + // the refcount we increased. Note that we don't need to release any + // memory effects, but we do need to ensure that the reference count + // decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N *head_unsafe() const { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N *node) { + // Since the refcount is zero, and nobody can increase it once it's zero + // (except us, and we run only one copy of this method per node at a time, + // i.e. 
the single thread case), then we know we can safely change the + // next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy + // contention, when the refcount goes to zero in between a load and a + // refcount increment of a node in try_get, then back up to something + // non-zero, then the refcount increment is done by the other thread) -- + // so, if the CAS to add the node to the actual list fails, decrease the + // refcount and leave the add operation to the next thread who puts the + // refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, + std::memory_order_release, + std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount + // goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, + std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are + // inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; - - struct Block - { - Block() - : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) - { -#ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). 
- // Returns true if the block is now empty (does not apply in explicit context). - template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - - private: - static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; - public: - std::atomic freeListRefs; - std::atomic freeListNext; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), + freeListNext(nullptr), dynamicallyAllocated(true) { #ifdef MCDBGQ_TRACKMEM - void* owner; + owner = nullptr; #endif - }; - static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + } + + template inline bool is_empty() const { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened + // before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == + BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= + BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = + elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping + // and count > 0). Returns true if the block is now empty (does not apply in + // explicit context). 
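The flag-based branch above (and set_many_empty just below) indexes elements with idx & (BLOCK_SIZE - 1) and stores the matching empty flags in reverse order. A tiny worked example, assuming the usual default of BLOCK_SIZE = 32 (an assumption; the traits are defined outside this hunk):

static_assert((37 & (32 - 1)) == 5,
              "index 37 lands in element slot 5 of a 32-entry block");
static_assert((32 - 1 - (37 & (32 - 1))) == 26,
              "and its empty flag lives in slot 26 (reversed order)");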
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, + size_t count) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template inline void set_all_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template inline void reset_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert(std::alignment_of::value <= sizeof(T), + "The queue does not support types with an alignment greater " + "than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block *next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags + [BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void *owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); #ifdef MCDBGQ_TRACKMEM public: - struct MemStats; + struct MemStats; + private: #endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : - tailIndex(0), - headIndex(0), - dequeueOptimisticCount(0), - dequeueOvercommit(0), - tailBlock(nullptr), - isExplicit(isExplicit_), - parent(parent_) - { - } - - virtual ~ProducerBase() { } - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } - else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase { + ProducerBase(ConcurrentQueue *parent_, bool isExplicit_) + : tailIndex(0), headIndex(0), dequeueOptimisticCount(0), + dequeueOvercommit(0), tailBlock(nullptr), isExplicit(isExplicit_), + parent(parent_) {} + + virtual ~ProducerBase() {} + + template inline bool dequeue(U &element) { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It &itemFirst, size_t max) { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } else { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } + } + + inline ProducerBase *next_prod() const { + return static_cast(next); + } + + inline size_t size_approx() const { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? 
static_cast(tail - head) + : 0; + } + + inline index_t getTail() const { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block *tailBlock; + + public: + bool isExplicit; + ConcurrentQueue *parent; + + protected: #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, true), - blockIndex(nullptr), - pr_blockIndexSlotsUsed(0), - pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront(0), - pr_blockIndexEntries(nullptr), - pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). - if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index - auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - this->parent->add_block_to_free_list(block); - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the - // last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. - } - else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. 
- - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase { + explicit ExplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, true), blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of + // current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). + if (this->tailBlock != + nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block *halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is + // partially dequeued (or the head block is the tail block and was + // fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the + // head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, + // we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? 
BLOCK_SIZE + : static_cast( + this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + + // We'll put the block on the block index (guaranteed to be room since + // we're conceptually removing the last block from it first -- except + // instead of removing then adding, we can just overwrite). Note that + // there must be a valid block index here, since even if allocation + // failed in the ctor, it would have been re-attempted when adding the + // first block to the queue; since there is such a block, a block + // index must have been successfully allocated. + } else { + // Whatever head value we see here is >= the last value we saw here + // (relatively), and <= its current value. Since we have the most + // recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough + // leeway -- the tail could surpass the head by the time the block + // fills up! (Or we'll exceed the size limit, if the second part of + // the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has + // room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be + // nullptr if the initial allocation failed in the constructor. 
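// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// comment above describes growing a full circular block index by allocating
// a larger one. A minimal standalone version of that idea, with hypothetical
// names, assuming the capacity is kept a power of two so wrap-around can use
// `& (capacity - 1)` instead of a modulo.
#include <cstddef>
#include <utility>
#include <vector>

struct ToyCircularIndex {
  std::size_t capacity = 4; // always a power of two
  std::size_t front = 0;    // next slot to write (like pr_blockIndexFront)
  std::size_t used = 0;     // live entries (like pr_blockIndexSlotsUsed)
  std::vector<int> entries; // stand-in for the real BlockIndexEntry records

  ToyCircularIndex() : entries(capacity) {}

  void push(int value) {
    if (used == capacity)
      grow(); // index is full -- double it before inserting
    entries[front] = value;
    front = (front + 1) & (capacity - 1);
    ++used;
  }

  void grow() {
    std::vector<int> bigger(capacity * 2);
    // The oldest live entry sits `used` slots behind `front`, modulo capacity.
    std::size_t i = (front - used) & (capacity - 1);
    for (std::size_t j = 0; j < used; ++j) {
      bigger[j] = entries[i];
      i = (i + 1) & (capacity - 1);
    }
    entries = std::move(bigger);
    capacity *= 2;
    front = used; // keep writing right after the copied entries
  }
};
// --------------------------------------------------------------------------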
+ + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. - // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in - // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). - // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all - // read-modify-write operations are guaranteed to work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only the C++11 standard. 
- // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever - // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now - // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon - // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. - // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) - // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, since - // this load is sequenced after (happens after) the earlier load above. This is supported by read-read - // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. 
- // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = { block, index }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the + // queue in that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) { + // Revert change to the current block, but leave the new block + // available for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed) + ->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template bool dequeue(U &element) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common + // case when the queue is empty and the values are eventually consistent + // -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are + // not going to change (unless we change them) and must be the same + // value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon + // incrementing dequeueOvercommit below. This ensures that whatever the + // value we got loaded into overcommit, the load of dequeueOptisticCount + // in the fetch_add below will result in a value at least as recent as + // that (and therefore at least as large). Note that I believe a + // compiler (signal) fence here would be sufficient due to the nature of + // fetch_add (all read-modify-write operations are guaranteed to work on + // the latest value in the modification order), but unfortunately that + // can't be shown to be correct using only the C++11 standard. See + // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount + // (because dequeueOvercommit is only ever incremented after + // dequeueOptimisticCount -- this is enforced in the `else` block + // below), and since we now have a version of dequeueOptimisticCount + // that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that + // synchronizes with it), overcommit <= myDequeueCount. However, we + // can't assert this since both dequeueOptimisticCount and + // dequeueOvercommit may (independently) overflow; in such a case, + // though, the logic still holds since the difference between the two is + // maintained. 
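// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// optimistic-count / overcommit protocol described in the comments above,
// reduced to a standalone gate. Names are hypothetical, and the wrap-around
// aware circular comparison is simplified to plain unsigned compares, so
// this only demonstrates the fence-and-counter choreography.
#include <atomic>
#include <cstdint>

struct ToyDequeueGate {
  std::atomic<std::uint64_t> tail{0};            // published by the producer
  std::atomic<std::uint64_t> head{0};            // slots actually consumed
  std::atomic<std::uint64_t> optimisticCount{0}; // tentative dequeue attempts
  std::atomic<std::uint64_t> overcommit{0};      // attempts that found nothing

  // Returns true and writes the claimed slot index if an element was there.
  bool try_claim(std::uint64_t &slot) {
    std::uint64_t t = tail.load(std::memory_order_relaxed);
    std::uint64_t over = overcommit.load(std::memory_order_relaxed);
    if (optimisticCount.load(std::memory_order_relaxed) - over >= t)
      return false; // looks empty -- cheap early out, may be stale

    // Pair with the release on `overcommit` below so the optimisticCount we
    // bump next is at least as recent as the `over` value we just read.
    std::atomic_thread_fence(std::memory_order_acquire);
    std::uint64_t my = optimisticCount.fetch_add(1, std::memory_order_relaxed);
    t = tail.load(std::memory_order_acquire); // reload: tail only grows
    if (my - over < t) {
      // Guaranteed at least one element: claim the next head slot.
      slot = head.fetch_add(1, std::memory_order_acq_rel);
      return true;
    }
    // Nothing there after all; make the effective count consistent again.
    overcommit.fetch_add(1, std::memory_order_release);
    return false;
  }
};
// --------------------------------------------------------------------------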
+ + // Note that we reload tail here in case it changed; it will be the same + // value as before or greater, since this load is sequenced after + // (happens after) the earlier load above. This is supported by + // read-read coherency (as defined in the standard), explained here: + // http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least + // one element, this will never exceed tail. We need to do an + // acquire-release fence here since it's possible that whatever + // condition got us to this point was for an earlier enqueued element + // (that we already see the memory effects for), but that by the time + // we increment somebody else has incremented it, and we need to see + // the memory effects for *that* element, which is in such a case is + // necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a + // tail that is at least as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because + // of index wrap-around. When an index wraps, we need to preserve the + // sign of the offset when dividing it by the block size (in order to + // get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + blockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto block = localBlockIndex + ->entries[(localBlockIndexHead + offset) & + (localBlockIndex->size - 1)] + .block; + + // Dequeue + auto &el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even + // if the assignment throws + struct Guard { + Block *block; + index_t index; + + ~Guard() { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty< + explicit_context>(index); + } + } guard = {block, index}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty( + index); + } + + return true; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); // Release so that the fetch_add on + // dequeueOptimisticCount is + // guaranteed to happen before this + // write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). 
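// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// pre-allocation step below sizes its work by the difference between block
// base indices before and after the `count` new items. A small worked
// version of that arithmetic (hypothetical name; the block size is assumed
// to be a power of two, as BLOCK_SIZE is).
#include <cstdint>

constexpr std::uint64_t blocks_needed(std::uint64_t startTail,
                                      std::uint64_t count,
                                      std::uint64_t blockSize) {
  // Block base just past the last new item, minus the block base of the slot
  // before the first new item, expressed in whole blocks.
  return (((startTail + count - 1) & ~(blockSize - 1)) -
          ((startTail - 1) & ~(blockSize - 1))) /
         blockSize;
}

// With 32-slot blocks: appending 5 items starting at index 30 crosses one
// block boundary; appending 40 items starting exactly at a boundary (index
// 32, whose block is not allocated yet) needs two fresh blocks.
static_assert(blocks_needed(30, 5, 32) == 1, "spills into one new block");
static_assert(blocks_needed(32, 40, 32) == 2, "fills one block, starts another");
// --------------------------------------------------------------------------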
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block *firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need + // to update our fallback value too (since we keep the new index + // even if we later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - if (firstAllocatedBlock != nullptr) - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block* block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness + // before we fill them up, and publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the + // only way to disable moving *at compile time*, which is + // important because a type may only define a (noexcept) move + // constructor, and so calls to the cctor will not compile, even + // if they are in an if branch that will never be executed + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll + // keep any allocated blocks in our linked list for later, though). 
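// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// rollback below follows the usual "destroy the already-constructed prefix"
// pattern for bulk placement-new. A minimal standalone version, assuming
// exceptions are enabled (the real code goes through the MOODYCAMEL_TRY /
// MOODYCAMEL_CATCH macros so it also builds with exceptions disabled).
#include <cstddef>
#include <new>

template <typename T>
void construct_n_or_rollback(void *raw, const T *src, std::size_t n) {
  T *dst = static_cast<T *>(raw);
  std::size_t built = 0;
  try {
    for (; built < n; ++built)
      new (dst + built) T(src[built]); // any of these may throw
  } catch (...) {
    while (built != 0)
      dst[--built].~T(); // unwind the constructed prefix, newest first
    throw;               // then let the caller see the original exception
  }
}
// --------------------------------------------------------------------------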
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. 
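// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// bulk variant of the optimistic-claim gate sketched earlier. It asks for up
// to `max` slots in one bump, keeps however many actually exist, and hands
// the unused part of the bump back through `overcommit`. Names are
// hypothetical and circular comparisons are again simplified, here to
// signed casts.
#include <atomic>
#include <cstddef>
#include <cstdint>

inline std::size_t try_claim_bulk(std::atomic<std::uint64_t> &tail,
                                  std::atomic<std::uint64_t> &head,
                                  std::atomic<std::uint64_t> &optimisticCount,
                                  std::atomic<std::uint64_t> &overcommit,
                                  std::size_t max, std::uint64_t &firstSlot) {
  std::uint64_t over = overcommit.load(std::memory_order_relaxed);
  std::uint64_t t = tail.load(std::memory_order_relaxed);
  std::uint64_t desired =
      t - (optimisticCount.load(std::memory_order_relaxed) - over);
  if (static_cast<std::int64_t>(desired) <= 0)
    return 0; // looks empty
  if (desired > max)
    desired = max; // never claim more than the caller asked for
  std::atomic_thread_fence(std::memory_order_acquire);

  std::uint64_t my =
      optimisticCount.fetch_add(desired, std::memory_order_relaxed);
  t = tail.load(std::memory_order_acquire);
  std::uint64_t actual = t - (my - over);
  if (static_cast<std::int64_t>(actual) <= 0) {
    // Nothing left after all: return the whole bump.
    overcommit.fetch_add(desired, std::memory_order_release);
    return 0;
  }
  if (actual > desired)
    actual = desired;
  if (actual < desired) // give back only the unused part of the bump
    overcommit.fetch_add(desired - actual, std::memory_order_release);
  firstSlot = head.fetch_add(actual, std::memory_order_acq_rel);
  return static_cast<std::size_t>(actual);
}
// --------------------------------------------------------------------------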
+ auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto indexIndex = + (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) { + // It's too late to revert the dequeue, but we can make sure + // that all the dequeued objects are properly destroyed and the + // block index (and empty count) are properly updated before we + // propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? 
firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry { + index_t base; + Block *block; + }; + + struct BlockIndexHeader { + size_t size; + std::atomic + front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry *entries; + void *prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = + (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one + // so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced + // by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry *pr_blockIndexEntries; + void *pr_blockIndexRaw; + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer* nextExplicitProducer; - private: + public: + ExplicitProducer *nextExplicitProducer; + + private: #endif - + #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer : public ProducerBase - { - ImplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, false), - nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), - blockIndex(nullptr) - { - new_block_index(); - } - - ~ImplicitProducer() - { - // Note that since we're in the destructor we can assume that all enqueue/dequeue operations - // completed already; this means that all undequeued elements are placed contiguously across - // contiguous blocks, and that only the first and last remaining blocks can be only partially - // empty (all other remaining blocks must be completely full). 
- + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase { + ImplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) { + new_block_index(); + } + + ~ImplicitProducer() { + // Note that since we're in the destructor we can assume that all + // enqueue/dequeue operations completed already; this means that all + // undequeued elements are placed contiguously across contiguous blocks, + // and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load(std::memory_order_relaxed)) { - details::ThreadExitNotifier::unsubscribe(&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto index = this->headIndex.load(std::memory_order_relaxed); - Block* block = nullptr; - assert(index == tail || details::circular_less_than(index, tail)); - bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed - while (index != tail) { - if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { - if (block != nullptr) { - // Free the old block - this->parent->add_block_to_free_list(block); - } - - block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); - } - - ((*block)[index])->~T(); - ++index; - } - // Even if the queue is empty, there's still one block that's not on the free list - // (unless the head index reached the end of it, in which case the tail will be poised - // to create a new block). - if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! 
+ auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block *block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) block + // will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the + // free list (unless the head index reached the end of it, in which case + // the tail will be poised to create a new block). + if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY { - new ((*newBlock)[currentTailIndex]) 
T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + newBlock->owner = this; +#endif + newBlock + ->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we + // have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template bool dequeue(U &element) { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto &el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = { block, index, entry, this->parent }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { + // Note: Acquiring the mutex with every dequeue instead of only when + // a block is released is very sub-optimal, but it is, after all, + // purely debug code. 
+ debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block *block; + index_t index; + BlockIndexEntry *entry; + ConcurrentQueue *parent; + + ~Guard() { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } - else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from + // block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. - - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any + // more; this happens if it was filled up exactly to the top (setting + // tailIndex to the first index of the next block which is not yet + // allocated), then dequeued completely (putting it on the free list) + // before we enqueue again. 
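
// The Guard object in the dequeue path above exists purely for exception
// safety: if the element's move-assignment throws, the slot must still be
// destroyed and the block bookkeeping must still run. The same idiom in
// isolation; ScopeGuard and the cleanup lambda are illustrative stand-ins,
// and `slot` is assumed to live in manually managed storage (as block slots
// do), so destroying it by hand is correct here.
#include <utility>

template <typename F>
struct ScopeGuard {
  F cleanup;
  ~ScopeGuard() { cleanup(); }   // runs on normal exit and during unwinding
};
template <typename F> ScopeGuard(F) -> ScopeGuard<F>;

template <typename T>
void move_out(T &dst, T &slot) {
  ScopeGuard guard{[&] {
    slot.~T();                   // always destroy the consumed slot
    // ...the real Guard also marks the slot empty and, if the whole block
    // just became empty, hands it back to the free list...
  }};
  dst = std::move(slot);         // may throw; the guard above still fires
}
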
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block *firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry = + nullptr; // initialization here unnecessary but compiler can't + // always tell + Block *newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>()) == nullptr) { + // Index allocation or block allocation failed; revert any other + // allocations and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + 
currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) 
{ - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations + // fail, and so that we can find the blocks when we do the actual + // enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = + firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) 
{ + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } #ifdef _MSC_VER #pragma warning(pop) #endif - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + + template size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader *localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires the block - // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index()) { - return false; - } - else { - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning + // that anybody who acquires the block we're about to free can + // use it safely since our writes (and reads!) will have + // happened-before then. 
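
// Nearly every index comparison above goes through details::circular_less_than
// rather than a plain `<`, because the tail/head indices are allowed to wrap.
// A sketch of the usual formulation of that helper (treat `a` as behind `b`
// when the wrapped difference a - b lands in the upper half of the range);
// shown for illustration, not copied from this header:
#include <climits>
#include <type_traits>

template <typename T>
bool circular_less_than(T a, T b) {
  static_assert(std::is_unsigned<T>::value, "meant for unsigned index types");
  return static_cast<T>(a - b) >
         static_cast<T>(T(1) << (sizeof(T) * CHAR_BIT - 1));
}
// e.g. with 8-bit indices, circular_less_than<unsigned char>(250, 3) is true:
// 3 is "ahead of" 250 once the counter has wrapped past 255.
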
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader { + size_t capacity; + std::atomic tail; + BlockIndexEntry *entries; + BlockIndexEntry **index; + BlockIndexHeader *prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry, + index_t blockStartIndex) { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only writer + // thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the + // constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; } + else if (!new_block_index()) { + return false; + } + else { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry * + get_block_index_entry_for_index(index_t index) const { + BlockIndexHeader *localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t + get_block_index_index_for_index(index_t index, + BlockIndexHeader *&localBlockIndex) const { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); - size_t idx = (tail + 
offset) & (localBlockIndex->capacity - 1); - assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast((Traits::malloc)( - sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = + localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap + // around, causing a negative offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>(index - + tailBase) / + static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == + index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? 
nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - private: + public: + details::ThreadExitListener threadExitListener; + + private: #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer* nextImplicitProducer; - private: + public: + ImplicitProducer *nextImplicitProducer; + + private: #endif #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; + mutable debug::DebugMutex mutex; #endif #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list(size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array(blockCount); - if (initialBlockPool == nullptr) { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block* try_get_block_from_initial_pool() - { - if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { - return nullptr; - } - - auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block *try_get_block_from_initial_pool() { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block *block) { #ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { - destroy(block); - } - else { - freeList.add(block); - } - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() - { - return freeList.try_get(); - } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { - return create(); - } - else { - return nullptr; - } - } - + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block *block) { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block *try_get_block_from_free_list() { return freeList.try_get(); } + + // Gets a free block from one of the memory pools, or allocates a new one (if + // applicable) + template Block *requisition_block() { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) { return create(); } + else { + return nullptr; + } + } #ifdef MCDBGQ_TRACKMEM - public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = { 0 }; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = 
q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - //auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } - else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. 
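
// requisition_block() a little further up tries three tiers in order: the
// pre-sized initial pool, then the lock-free free list of recycled blocks, and
// only then a fresh allocation when the allocation mode permits it. The same
// shape reduced to stubs; Block, pool_take, freelist_take and heap_new are
// placeholders for illustration, not names from this header.
struct Block {};                                    // stand-in for the real block type

inline Block *pool_take() { return nullptr; }       // stub: initial pool exhausted
inline Block *freelist_take() { return nullptr; }   // stub: nothing recycled yet
inline Block *heap_new() { return new Block{}; }    // stub: plain heap allocation

enum AllocationMode { CanAlloc, CannotAlloc };

template <AllocationMode canAlloc>
Block *requisition_block_sketch() {
  if (Block *b = pool_take()) return b;             // tier 1: bump-index initial pool
  if (Block *b = freelist_take()) return b;         // tier 2: lock-free free list
  if (canAlloc == CanAlloc) return heap_new();      // tier 3: allocate, if permitted
  return nullptr;                                   // callers treat this as "full"
}
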
- MemStats getMemStats() - { - return MemStats::getFor(this); - } - private: - friend struct MemStats; -#endif - - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { +public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue *q) { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load(std::memory_order_relaxed) != + nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer::BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry *); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) { + // auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty< + explicit_context>() || + wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += + sizeof(typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 
0 + : q->initialBlockPoolSize - + q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { return MemStats::getFor(this); } + +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase *recycle_or_create_producer(bool isExplicit) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - return ptr; - } - } - } - - return add_producer(isExplicit ? static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have + // it + return ptr; + } + } + } + + return add_producer( + isExplicit ? 
static_cast(create(this)) + : create(this)); + } + + ProducerBase *add_producer(ProducerBase *producer) { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak( + prevTail, producer, std::memory_order_release, + std::memory_order_relaxed)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } - else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP() : value(nullptr) { } - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - // Swap (assumes our implicit producer hash is 
initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = + prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = + prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
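
// add_producer() above links a new producer into a lock-free singly linked
// list with a compare_exchange_weak loop (and recycle_or_create_producer
// first tries to reclaim an inactive producer via compare_exchange_strong on
// its flag). The push pattern in isolation; Node and list_head are
// illustrative stand-ins for ProducerBase and producerListTail.
#include <atomic>

struct Node {
  Node *next = nullptr;
};

inline void push(std::atomic<Node *> &list_head, Node *node) {
  Node *prev = list_head.load(std::memory_order_relaxed);
  do {
    node->next = prev;   // re-point at the freshly observed head on every retry
  } while (!list_head.compare_exchange_weak(prev, node,
                                            std::memory_order_release,
                                            std::memory_order_relaxed));
}
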
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP { + std::atomic key; + ImplicitProducer *value; // No need for atomicity since it's only read by + // the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) {} + + ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP & + operator=(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void + moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP &, + typename ConcurrentQueue::ImplicitProducerKVP &) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash { + size_t capacity; + ImplicitProducerKVP *entries; + ImplicitProducerHash *prev; + }; + + inline void populate_initial_implicit_producer_hash() { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue &other) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer 
*get_or_add_implicit_producer() { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash + // tables. If it's not found, it must not be in there yet, since this same + // thread would have added it previously to one of the tables that we + // traversed. + + // Code and algorithm adapted from + // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1u; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). - auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert( + mainHash != + nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free + // in the hash table + index &= hash->capacity - 1u; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we + // should lazily add it to the current main hash table to avoid the + // extended search next time. Note there's guaranteed to be room in + // the current hash table since every subsequent table implicitly + // reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
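
// The lookup above is an open-addressed, linearly probed table keyed by thread
// id, with power-of-two capacity so probing is a single mask. A cut-down
// find-or-claim over one fixed table, assuming (as the real code guarantees by
// resizing) that a free slot always exists; Entry, kCapacity, the uintptr_t id
// type and the omitted id hashing are simplifications for illustration.
#include <atomic>
#include <cstddef>
#include <cstdint>

constexpr std::uintptr_t kInvalidId = 0;   // sentinel marking an empty slot
constexpr std::size_t kCapacity = 64;      // must stay a power of two

struct Entry {
  std::atomic<std::uintptr_t> key{kInvalidId};
  void *value = nullptr;                   // only touched by the claiming thread
};

inline Entry *find_or_claim(Entry (&table)[kCapacity], std::uintptr_t id) {
  for (std::size_t index = static_cast<std::size_t>(id);; ++index) {
    Entry &e = table[index & (kCapacity - 1)];
    auto probed = e.key.load(std::memory_order_relaxed);
    if (probed == id)
      return &e;                           // this thread already owns a slot
    if (probed == kInvalidId) {
      auto expected = kInvalidId;          // empty: try to claim it for ourselves
      if (e.key.compare_exchange_strong(expected, id, std::memory_order_seq_cst,
                                        std::memory_order_relaxed))
        return &e;
      // lost the race to another thread; fall through and keep probing
    }
  }
}
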
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || - mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { #else - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - size_t newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } - else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - auto producer = static_cast(recycle_or_create_producer(false)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = + 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set( + std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end + // of this block, and hence when we reload implicitProducerHash it must + // be the most recent version (it only gets changed within this locked + // block). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast( + (Traits::malloc)(sizeof(ImplicitProducerHash) + + std::alignment_of::value - + 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear( + std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for( + raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, + std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we + // don't have to wait for the next table to finish being allocated by + // another thread (and if we just finished allocating above, the condition + // will always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = + static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + producer->threadExitListener.callback = + &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot - mainHash->entries[index].value = producer; - break; - } -#endif - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
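A minimal sketch (not part of the patch) of the resize policy visible in this hunk: the table grows once the logical count reaches half the capacity, the new capacity is doubled until the count drops back below half, and inserts still go to the old table while it is under three-quarters full. `next_capacity` and `can_insert_without_waiting` are hypothetical helper names.

#include <cassert>
#include <cstddef>

// Pick the next power-of-two capacity so that `count` stays below half of it.
inline std::size_t next_capacity(std::size_t current, std::size_t count) {
  assert(current != 0); // expected to be a nonzero power of two
  std::size_t newCapacity = current << 1;
  while (count >= (newCapacity >> 1))
    newCapacity <<= 1;
  return newCapacity;
}

// True while the current table is below the three-quarters-full threshold,
// i.e. an insert does not have to wait for a pending resize to finish.
inline bool can_insert_without_waiting(std::size_t capacity,
                                       std::size_t count) {
  return count < (capacity >> 1) + (capacity >> 2);
}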
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a + // new one. We need to wait for the allocating thread to finish (if it + // succeeds, we add, if not, we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from hash + void implicit_producer_thread_exited(ImplicitProducer *producer) { + // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1u; - probedKey = id; - if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { - break; - } - ++index; - } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void* aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::malloc)(size); - else { - size_t alignment = std::alignment_of::value; - void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); - if (!raw) - return nullptr; - char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void* ptr) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::free)(ptr); - else - (Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); - } - - template - static inline U* create_array(size_t count) - { - assert(count > 0); - U* p = static_cast(aligned_malloc(sizeof(U) * count)); - if (p == nullptr) - return nullptr; - - for (size_t i = 0; i != count; ++i) - new (p + i) U(); - return p; - } - - template - static inline void destroy_array(U* p, size_t count) - { - if (p != nullptr) { - assert(count > 0); - for (size_t i = count; i != 0; ) - (p + --i)->~U(); - } - aligned_free(p); - } - - template - static inline U* create() - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U : nullptr; - } - - template - static inline U* create(A1&& a1) - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; - } - - template - static inline void destroy(U* p) - { - if (p != nullptr) - p->~U(); - aligned_free(p); - } + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if + // we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on + // the current one yet and are trying to add an entry thinking there's a + // free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1u; + probedKey = id; + if (hash->entries[index].key.compare_exchange_strong( + probedKey, details::invalid_thread_id2, + std::memory_order_seq_cst, std::memory_order_relaxed)) { + break; + } + ++index; + } while (probedKey != + details::invalid_thread_id); // Can happen if the hash has + // changed but we weren't put back + // in it yet, or if we weren't added + // to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void *userData) { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template static inline void *aligned_malloc(size_t size) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void *raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *)); + if (!raw) + return nullptr; + char *ptr = details::align_for(reinterpret_cast(raw) + + sizeof(void *)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template static inline void aligned_free(void *ptr) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); + } + + template static inline U *create_array(size_t count) { + assert(count > 0); + U *p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template static inline void destroy_array(U *p, size_t count) { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0;) + (p + --i)->~U(); + } + aligned_free(p); + } + + template static inline U *create() { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template static inline U *create(A1 &&a1) { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template static inline void destroy(U *p) { + if (p != nullptr) + p->~U(); + aligned_free(p); + } private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block* initialBlockPool; - size_t initialBlockPoolSize; - + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block *initialBlockPool; + size_t initialBlockPoolSize; + #ifndef MCDBGQ_USEDEBUGFREELIST - FreeList freeList; + FreeList freeList; #else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic + implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array + initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; + debug::DebugMutex implicitProdMutex; #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; + std::atomic explicitProducers; + std::atomic implicitProducers; #endif }; - -template -ProducerToken::ProducerToken(ConcurrentQueue& queue) - : producer(queue.recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } +template +ProducerToken::ProducerToken(ConcurrentQueue &queue) + : producer(queue.recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } } -template -ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) - : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } +template +ProducerToken::ProducerToken(BlockingConcurrentQueue &queue) + : producer(reinterpret_cast *>(&queue) + ->recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } } -template -ConsumerToken::ConsumerToken(ConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); +template +ConsumerToken::ConsumerToken(ConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), 
currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); } -template -ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + reinterpret_cast *>(&queue) + ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); } -template -inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +template +inline void swap(ConcurrentQueue &a, + ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); } -inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); } -inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +inline void swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); } -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT { + a.swap(b); } -} +} // namespace moodycamel #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) #pragma warning(pop) diff --git a/src/conv-conds.cpp b/src/conv-conds.cpp index 4c9521d..74e4b0a 100644 --- a/src/conv-conds.cpp +++ b/src/conv-conds.cpp @@ -25,11 +25,9 @@ struct ccd_cond_callback { struct ccd_periodic_callback { CcdVoidFn fn; void *arg; - int pe; /* the pe that sets the callback */ + int pe; /* the pe that sets the callback */ - ccd_periodic_callback(CcdVoidFn f, void *a, int p) - : fn{f}, arg{a}, pe{p} - { } + ccd_periodic_callback(CcdVoidFn f, void *a, int p) : fn{f}, arg{a}, pe{p} {} }; /** @@ -165,7 +163,8 @@ struct ccd_heap_elem { double time; ccd_periodic_callback cb; - ccd_heap_elem(double t, CcdVoidFn fn, void *arg, int pe) : time{t}, cb{fn, arg, pe} {} + ccd_heap_elem(double t, CcdVoidFn fn, void *arg, int pe) + : time{t}, cb{fn, arg, pe} {} bool operator>(const ccd_heap_elem &rhs) const { return this->time > rhs.time; @@ -187,7 +186,8 @@ int CcdNumTimerCBs(void) { return ccd_heap.size() + _ccd_num_timed_cond_cbs; } /** * Insert a new callback into the heap */ -static inline void ccd_heap_insert(double t, CcdVoidFn fnp, void *arg, int pe = CcdIGNOREPE) { +static inline void ccd_heap_insert(double t, CcdVoidFn fnp, void *arg, + int pe = CcdIGNOREPE) { auto &h = ccd_heap; h.emplace(t, fnp, arg, pe); } @@ -259,12 +259,11 @@ void CcdCancelCallOnConditionKeep(int condnum, int idx) { * Register a callback function that will be triggered on the specified PE * after a minimum delay of deltaT */ -void CcdCallFnAfterOnPE(CcdVoidFn fnp, void *arg, double deltaT, int pe) -{ - double ctime = CmiWallTimer(); - double tcall = ctime + deltaT * (1.0/1000.0); - ccd_heap_insert(tcall, fnp, arg, pe); -} +void CcdCallFnAfterOnPE(CcdVoidFn fnp, void *arg, double deltaT, int pe) { + double ctime = 
CmiWallTimer(); + double tcall = ctime + deltaT * (1.0 / 1000.0); + ccd_heap_insert(tcall, fnp, arg, pe); +} /** * Register a callback function that will be triggered after a minimum diff --git a/src/conv-rdma.cpp b/src/conv-rdma.cpp index 4cc2694..fbb38ae 100644 --- a/src/conv-rdma.cpp +++ b/src/conv-rdma.cpp @@ -146,8 +146,8 @@ void CmiIssueRputCopyBased(NcpyOperationInfo *ncpyOpInfo) { // Invoke the source ack ncpyOpInfo->ackMode = CMK_SRC_ACK; // only invoke the source ack - // We need to ensure consistent behavior no matter what ncpyDirectAckHandlerFn actually does - // so we cannot rely on the charm layer to free the ncpyOpInfo + // We need to ensure consistent behavior no matter what ncpyDirectAckHandlerFn + // actually does so we cannot rely on the charm layer to free the ncpyOpInfo auto realFreeMe = ncpyOpInfo->freeMe; ncpyOpInfo->freeMe = CMK_DONT_FREE_NCPYOPINFO; ncpyDirectAckHandlerFn(ncpyOpInfo); @@ -367,7 +367,7 @@ void CommRputLocalHandler(comm_backend::Status status) { ncpyOpInfo->freeMe = CMK_DONT_FREE_NCPYOPINFO; ncpyDirectAckHandlerFn(ncpyOpInfo); if (realFreeMe == CMK_FREE_NCPYOPINFO) - CmiFree(ncpyOpInfo); + CmiFree(ncpyOpInfo); } // Invoked by the local completion of the Rget operation @@ -384,12 +384,12 @@ void CommRgetLocalHandler(comm_backend::Status status) { /* Perform an RDMA Get operation into the local destination address from the * remote source address*/ void CmiIssueRget(NcpyOperationInfo *ncpyOpInfo) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// // Use network RDMA for a PE on a remote host -// LrtsIssueRget(ncpyOpInfo); -// #else -// CmiIssueRgetCopyBased(ncpyOpInfo); -// #endif + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // // Use network RDMA for a PE on a remote host + // LrtsIssueRget(ncpyOpInfo); + // #else + // CmiIssueRgetCopyBased(ncpyOpInfo); + // #endif int target_node = CmiNodeOf(ncpyOpInfo->srcPe); if (target_node == CmiMyNode()) { // loopback messages @@ -415,39 +415,40 @@ void CmiIssueRget(NcpyOperationInfo *ncpyOpInfo) { /* Perform an RDMA Put operation into the remote destination address from the * local source address */ void CmiIssueRput(NcpyOperationInfo *ncpyOpInfo) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// // Use network RDMA for a PE on a remote host -// LrtsIssueRput(ncpyOpInfo); -// #else -// CmiIssueRputCopyBased(ncpyOpInfo); -// #endif -int target_node = CmiNodeOf(ncpyOpInfo->destPe); -if (target_node == CmiMyNode()) { - // loopback messages - memcpy((void *)ncpyOpInfo->destPtr, ncpyOpInfo->srcPtr, ncpyOpInfo->srcSize); - comm_backend::Status status; - status.local_buf = ncpyOpInfo->srcPtr; - status.size = ncpyOpInfo->srcSize; - status.user_context = ncpyOpInfo; - CommRputLocalHandler(status); -} else if (!CmiUseCopyBasedRDMA) { - auto mr = *(comm_backend::mr_t *)ncpyOpInfo->srcLayerInfo; - void *rmr = ncpyOpInfo->destLayerInfo + sizeof(comm_backend::mr_t); - // FIXME: we assume the offset to the registered base address is 0 here - comm_backend::issueRput(CmiNodeOf(ncpyOpInfo->destPe), ncpyOpInfo->srcPtr, - ncpyOpInfo->srcSize, mr, 0, rmr, CommRputLocalHandler, - ncpyOpInfo); -} else { - CmiIssueRputCopyBased(ncpyOpInfo); -} + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // // Use network RDMA for a PE on a remote host + // LrtsIssueRput(ncpyOpInfo); + // #else + // CmiIssueRputCopyBased(ncpyOpInfo); + // #endif + int target_node = CmiNodeOf(ncpyOpInfo->destPe); + if (target_node == CmiMyNode()) { + // loopback messages + memcpy((void *)ncpyOpInfo->destPtr, ncpyOpInfo->srcPtr, + ncpyOpInfo->srcSize); + comm_backend::Status 
status; + status.local_buf = ncpyOpInfo->srcPtr; + status.size = ncpyOpInfo->srcSize; + status.user_context = ncpyOpInfo; + CommRputLocalHandler(status); + } else if (!CmiUseCopyBasedRDMA) { + auto mr = *(comm_backend::mr_t *)ncpyOpInfo->srcLayerInfo; + void *rmr = ncpyOpInfo->destLayerInfo + sizeof(comm_backend::mr_t); + // FIXME: we assume the offset to the registered base address is 0 here + comm_backend::issueRput(CmiNodeOf(ncpyOpInfo->destPe), ncpyOpInfo->srcPtr, + ncpyOpInfo->srcSize, mr, 0, rmr, + CommRputLocalHandler, ncpyOpInfo); + } else { + CmiIssueRputCopyBased(ncpyOpInfo); + } } /* De-register registered memory for pointer */ void CmiDeregisterMem(const void *ptr, void *info, int pe, unsigned short int mode) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// LrtsDeregisterMem(ptr, info, pe, mode); -// #endif + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // LrtsDeregisterMem(ptr, info, pe, mode); + // #endif if (!CmiUseCopyBasedRDMA) { comm_backend::deregisterMemory(*(comm_backend::mr_t *)info); } @@ -455,18 +456,19 @@ void CmiDeregisterMem(const void *ptr, void *info, int pe, // FIXME: This really should be implemented in the charm layer void CmiInvokeRemoteDeregAckHandler(int pe, NcpyOperationInfo *ncpyOpInfo) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// LrtsInvokeRemoteDeregAckHandler(pe, ncpyOpInfo); -// #endif - if(ncpyOpInfo->opMode == CMK_BCAST_EM_API) + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // LrtsInvokeRemoteDeregAckHandler(pe, ncpyOpInfo); + // #endif + if (ncpyOpInfo->opMode == CMK_BCAST_EM_API) return; bool freeInfo; - if(ncpyOpInfo->opMode == CMK_DIRECT_API) { + if (ncpyOpInfo->opMode == CMK_DIRECT_API) { freeInfo = true; - } else if(ncpyOpInfo->opMode == CMK_EM_API) { + } else if (ncpyOpInfo->opMode == CMK_EM_API) { freeInfo = false; } else { - CmiAbort("CmiInvokeRemoteDeregAckHandler: ncpyOpInfo->opMode is not valid for dereg\n"); + CmiAbort("CmiInvokeRemoteDeregAckHandler: ncpyOpInfo->opMode is not valid " + "for dereg\n"); } int ncpyOpInfoSize = ncpyOpInfo->ncpyOpInfoSize; @@ -479,8 +481,8 @@ void CmiInvokeRemoteDeregAckHandler(int pe, NcpyOperationInfo *ncpyOpInfo) { ncpyOpInfoSize); CmiSetHandler(remoteDeregMsg, remote_dereg_handler_idx); - CmiSyncSendAndFree(pe, - sizeof(ConverseRdmaMsg) + ncpyOpInfoSize, remoteDeregMsg); + CmiSyncSendAndFree(pe, sizeof(ConverseRdmaMsg) + ncpyOpInfoSize, + remoteDeregMsg); // free original ncpyOpinfo if (freeInfo) @@ -504,7 +506,8 @@ void CmiSetRdmaBufferInfo(void *info, const void *ptr, int size, info = (char *)info + sizeof(mr); size_t info_size_left = CMK_NOCOPY_DIRECT_BYTES - sizeof(mr); size_t rmr_size = comm_backend::getRMR(mr, info, info_size_left); - CmiAssertMsg(rmr_size <= info_size_left, "CMK_NOCOPY_DIRECT_BYTES is too small"); + CmiAssertMsg(rmr_size <= info_size_left, + "CMK_NOCOPY_DIRECT_BYTES is too small"); } } diff --git a/src/conv-topology.cpp b/src/conv-topology.cpp index be577fa..add33f3 100644 --- a/src/conv-topology.cpp +++ b/src/conv-topology.cpp @@ -3,12 +3,11 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif -#include -#include /* for sockaddr_in */ -#include /* for getifaddrs */ -#include /* for IFF_RUNNING */ #include - +#include /* for getifaddrs */ +#include /* for IFF_RUNNING */ +#include /* for sockaddr_in */ +#include #include #include @@ -35,59 +34,59 @@ #if 1 -# include -# include -# include #include #include - - +#include +#include +#include void CmiInitMemAffinity(char **argv) { - char *tmpstr = NULL; - int maffinity_flag = CmiGetArgFlagDesc(argv,"+maffinity", - "memory affinity"); - 
if (maffinity_flag && CmiMyPe()==0) - CmiPrintf("memory affinity is not supported, +maffinity flag disabled.\n"); - - /* consume the remaining possible arguments */ - CmiGetArgStringDesc(argv, "+memnodemap", &tmpstr, "define memory node mapping"); - CmiGetArgStringDesc(argv, "+mempol", &tmpstr, "define memory policy {bind, preferred or interleave} "); + char *tmpstr = NULL; + int maffinity_flag = CmiGetArgFlagDesc(argv, "+maffinity", "memory affinity"); + if (maffinity_flag && CmiMyPe() == 0) + CmiPrintf("memory affinity is not supported, +maffinity flag disabled.\n"); + + /* consume the remaining possible arguments */ + CmiGetArgStringDesc(argv, "+memnodemap", &tmpstr, + "define memory node mapping"); + CmiGetArgStringDesc(argv, "+mempol", &tmpstr, + "define memory policy {bind, preferred or interleave} "); } +skt_ip_t _skt_invalid_ip = {{0}}; - -skt_ip_t _skt_invalid_ip={{0}}; - -skt_ip_t skt_my_ip(void) -{ +skt_ip_t skt_my_ip(void) { char hostname[1000]; skt_ip_t ip = _skt_invalid_ip; int ifcount = 0; - /* Code snippet from Jens Alfke - * http://lists.apple.com/archives/macnetworkprog/2008/May/msg00013.html */ - struct ifaddrs *ifaces=0; - if( getifaddrs(&ifaces) == 0 ) { - struct ifaddrs *iface; - for( iface=ifaces; iface; iface=iface->ifa_next ) { - if( (iface->ifa_flags & IFF_UP) && ! (iface->ifa_flags & IFF_LOOPBACK) ) { - const struct sockaddr_in *addr = (const struct sockaddr_in*)iface->ifa_addr; - if( addr && addr->sin_family==AF_INET ) { - ifcount ++; - if ( ifcount==1 ) memcpy(&ip, &addr->sin_addr, sizeof(ip)); - } - } + /* Code snippet from Jens Alfke + * http://lists.apple.com/archives/macnetworkprog/2008/May/msg00013.html + */ + struct ifaddrs *ifaces = 0; + if (getifaddrs(&ifaces) == 0) { + struct ifaddrs *iface; + for (iface = ifaces; iface; iface = iface->ifa_next) { + if ((iface->ifa_flags & IFF_UP) && !(iface->ifa_flags & IFF_LOOPBACK)) { + const struct sockaddr_in *addr = + (const struct sockaddr_in *)iface->ifa_addr; + if (addr && addr->sin_family == AF_INET) { + ifcount++; + if (ifcount == 1) + memcpy(&ip, &addr->sin_addr, sizeof(ip)); } - freeifaddrs(ifaces); + } + } + freeifaddrs(ifaces); } - /* fprintf(stderr, "My IP is %d.%d.%d.%d\n", ip.data[0],ip.data[1],ip.data[2],ip.data[3]); */ - if (ifcount==1) return ip; + /* fprintf(stderr, "My IP is %d.%d.%d.%d\n", + * ip.data[0],ip.data[1],ip.data[2],ip.data[3]); */ + if (ifcount == 1) + return ip; return _skt_invalid_ip; } -struct _procInfo -{ +struct _procInfo { skt_ip_t ip; int pe; int ncores; @@ -95,40 +94,35 @@ struct _procInfo int nodeID; }; -typedef struct _hostnameMsg -{ +typedef struct _hostnameMsg { char core[CmiMsgHeaderSizeBytes]; int n; - _procInfo* procs; + _procInfo *procs; } hostnameMsg; -typedef struct _nodeTopoMsg -{ +typedef struct _nodeTopoMsg { char core[CmiMsgHeaderSizeBytes]; - int* nodes; + int *nodes; } nodeTopoMsg; // nodeIDs[pe] is the node number of processor pe -class CpuTopology -{ +class CpuTopology { public: - static int* nodeIDs; + static int *nodeIDs; static int numPes; static int numNodes; - static std::vector* bynodes; + static std::vector *bynodes; static int supported; - ~CpuTopology() - { + ~CpuTopology() { auto n = bynodes; bynodes = nullptr; delete[] n; } // return -1 when not supported - int numUniqNodes() - { -# if 0 + int numUniqNodes() { +#if 0 if (numNodes != 0) return numNodes; int n = 0; for (int i=0; i 0) - return numNodes; // already calculated + return numNodes; // already calculated std::vector unodes(numPes); int i; - for (i = 0; i < numPes; i++) unodes[i] = 
nodeIDs[i]; + for (i = 0; i < numPes; i++) + unodes[i] = nodeIDs[i]; std::sort(unodes.begin(), unodes.end()); int last = -1; - std::map nodemap; // nodeIDs can be out of range of [0,numNodes] - for (i = 0; i < numPes; i++) - { - if (unodes[i] != last) - { + std::map nodemap; // nodeIDs can be out of range of [0,numNodes] + for (i = 0; i < numPes; i++) { + if (unodes[i] != last) { last = unodes[i]; nodemap[unodes[i]] = numNodes; numNodes++; } } - if (numNodes == 0) - { + if (numNodes == 0) { numNodes = CmiNumNodes(); numPes = CmiNumPes(); - } - else - { + } else { // re-number nodeIDs, which may be necessary e.g. on BlueGene/P - for (i = 0; i < numPes; i++) nodeIDs[i] = nodemap[nodeIDs[i]]; + for (i = 0; i < numPes; i++) + nodeIDs[i] = nodemap[nodeIDs[i]]; CpuTopology::supported = 1; } return numNodes; -# endif +#endif } - void sort() - { + void sort() { int i; numUniqNodes(); bynodes = new std::vector[numNodes]; - if (supported) - { - for (i = 0; i < numPes; i++) - { - CmiAssert(nodeIDs[i] >= 0 && - nodeIDs[i] <= - numNodes); // Sanity check for bug that occurs on mpi-crayxt + if (supported) { + for (i = 0; i < numPes; i++) { + CmiAssert( + nodeIDs[i] >= 0 && + nodeIDs[i] <= + numNodes); // Sanity check for bug that occurs on mpi-crayxt bynodes[nodeIDs[i]].push_back(i); } - } - else - { /* not supported/enabled */ - for (i = 0; i < CmiNumPes(); i++) bynodes[CmiNodeOf(i)].push_back(i); + } else { /* not supported/enabled */ + for (i = 0; i < CmiNumPes(); i++) + bynodes[CmiNodeOf(i)].push_back(i); } } - void print() - { + void print() { int i; CmiPrintf("Charm++> Cpu topology info:\n"); CmiPrintf("PE to node map: "); - for (i = 0; i < CmiNumPes(); i++) CmiPrintf("%d ", nodeIDs[i]); + for (i = 0; i < CmiNumPes(); i++) + CmiPrintf("%d ", nodeIDs[i]); CmiPrintf("\n"); CmiPrintf("Node to PE map:\n"); - for (i = 0; i < numNodes; i++) - { + for (i = 0; i < numNodes; i++) { CmiPrintf("Chip #%d: ", i); - for (int j = 0; j < bynodes[i].size(); j++) CmiPrintf("%d ", bynodes[i][j]); + for (int j = 0; j < bynodes[i].size(); j++) + CmiPrintf("%d ", bynodes[i][j]); CmiPrintf("\n"); } } }; -int* CpuTopology::nodeIDs = NULL; +int *CpuTopology::nodeIDs = NULL; int CpuTopology::numPes = 0; int CpuTopology::numNodes = 0; -std::vector* CpuTopology::bynodes = NULL; +std::vector *CpuTopology::bynodes = NULL; int CpuTopology::supported = 0; -namespace CpuTopoDetails -{ +namespace CpuTopoDetails { -static nodeTopoMsg* topomsg = NULL; -static std::map hostTable; +static nodeTopoMsg *topomsg = NULL; +static std::map hostTable; static int cpuTopoHandlerIdx; static int cpuTopoRecvHandlerIdx; @@ -226,7 +213,7 @@ static CpuTopology cpuTopo; static int done = 0; static int _noip = 0; -} // namespace CpuTopoDetails +} // namespace CpuTopoDetails using namespace CpuTopoDetails; @@ -236,75 +223,72 @@ using namespace CpuTopoDetails; // const int ways = CmiHwlocTopologyLocal.num_pus; // if (ways > 1) // CmiPrintf( -// "Charm++> Running on %d hosts (%d sockets x %d cores x %d PUs = %d-way SMP)\n", -// numNodes, CmiHwlocTopologyLocal.num_sockets, +// "Charm++> Running on %d hosts (%d sockets x %d cores x %d PUs = +// %d-way SMP)\n", numNodes, CmiHwlocTopologyLocal.num_sockets, // CmiHwlocTopologyLocal.num_cores / CmiHwlocTopologyLocal.num_sockets, -// CmiHwlocTopologyLocal.num_pus / CmiHwlocTopologyLocal.num_cores, ways); +// CmiHwlocTopologyLocal.num_pus / CmiHwlocTopologyLocal.num_cores, +// ways); // else // CmiPrintf("Charm++> Running on %d hosts\n", numNodes); // } static std::atomic cpuTopoSyncHandlerDone{}; -# if CMK_SMP 
&& !CMK_SMP_NO_COMMTHD +#if CMK_SMP && !CMK_SMP_NO_COMMTHD extern void CommunicationServerThread(int sleepTime); static std::atomic cpuTopoSyncCommThreadDone{}; -# endif +#endif -# if CMK_SMP && !CMK_SMP_NO_COMMTHD -static void cpuTopoSyncWaitCommThread(std::atomic& done) -{ - do CommunicationServerThread(5); +#if CMK_SMP && !CMK_SMP_NO_COMMTHD +static void cpuTopoSyncWaitCommThread(std::atomic &done) { + do + CommunicationServerThread(5); while (!done.load()); CommunicationServerThread(5); } -# endif +#endif -static void cpuTopoSyncWait(std::atomic& done) -{ - do CsdSchedulePoll(); +static void cpuTopoSyncWait(std::atomic &done) { + do + CsdSchedulePoll(); while (!done.load()); CsdSchedulePoll(); } /* called on PE 0 */ -static void cpuTopoHandler(void* m) -{ - _procInfo* rec; - hostnameMsg* msg = (hostnameMsg*)m; +static void cpuTopoHandler(void *m) { + _procInfo *rec; + hostnameMsg *msg = (hostnameMsg *)m; int pe; - if (topomsg == NULL) - { + if (topomsg == NULL) { int i; - topomsg = (nodeTopoMsg*)CmiAlloc(sizeof(nodeTopoMsg) + CmiNumPes() * sizeof(int)); - CmiSetHandler((char*)topomsg, cpuTopoRecvHandlerIdx); - topomsg->nodes = (int*)((char*)topomsg + sizeof(nodeTopoMsg)); - for (i = 0; i < CmiNumPes(); i++) topomsg->nodes[i] = -1; + topomsg = (nodeTopoMsg *)CmiAlloc(sizeof(nodeTopoMsg) + + CmiNumPes() * sizeof(int)); + CmiSetHandler((char *)topomsg, cpuTopoRecvHandlerIdx); + topomsg->nodes = (int *)((char *)topomsg + sizeof(nodeTopoMsg)); + for (i = 0; i < CmiNumPes(); i++) + topomsg->nodes[i] = -1; } CmiAssert(topomsg != NULL); - msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg)); + msg->procs = (_procInfo *)((char *)msg + sizeof(hostnameMsg)); CmiAssert(msg->n == CmiNumPes()); - for (int i = 0; i < msg->n; i++) - { - _procInfo* proc = msg->procs + i; + for (int i = 0; i < msg->n; i++) { + _procInfo *proc = msg->procs + i; /* for debug skt_print_ip(str, msg->ip); printf("hostname: %d %s\n", msg->pe, str); */ - skt_ip_t& ip = proc->ip; + skt_ip_t &ip = proc->ip; pe = proc->pe; auto iter = hostTable.find(ip); - if (iter != hostTable.end()) - { + if (iter != hostTable.end()) { rec = iter->second; - } - else - { - proc->nodeID = pe; // we will compact the node ID later + } else { + proc->nodeID = pe; // we will compact the node ID later rec = proc; hostTable.emplace(ip, proc); } @@ -321,17 +305,14 @@ static void cpuTopoHandler(void* m) } /* called on each processor */ -static void cpuTopoRecvHandler(void* msg) -{ - nodeTopoMsg* m = (nodeTopoMsg*)msg; - m->nodes = (int*)((char*)m + sizeof(nodeTopoMsg)); +static void cpuTopoRecvHandler(void *msg) { + nodeTopoMsg *m = (nodeTopoMsg *)msg; + m->nodes = (int *)((char *)m + sizeof(nodeTopoMsg)); - if (cpuTopo.nodeIDs == NULL) - { + if (cpuTopo.nodeIDs == NULL) { cpuTopo.nodeIDs = m->nodes; cpuTopo.sort(); - } - else + } else CmiFree(m); done++; @@ -341,28 +322,29 @@ static void cpuTopoRecvHandler(void* msg) } // reduction function -static void* combineMessage(int* size, void* data, void** remote, int count) -{ +static void *combineMessage(int *size, void *data, void **remote, int count) { int i, j; - int nprocs = ((hostnameMsg*)data)->n; + int nprocs = ((hostnameMsg *)data)->n; if (count == 0) return data; - for (i = 0; i < count; i++) nprocs += ((hostnameMsg*)remote[i])->n; + for (i = 0; i < count; i++) + nprocs += ((hostnameMsg *)remote[i])->n; *size = sizeof(hostnameMsg) + sizeof(_procInfo) * nprocs; - hostnameMsg* msg = (hostnameMsg*)CmiAlloc(*size); - msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg)); + hostnameMsg *msg = 
(hostnameMsg *)CmiAlloc(*size); + msg->procs = (_procInfo *)((char *)msg + sizeof(hostnameMsg)); msg->n = nprocs; - CmiSetHandler((char*)msg, cpuTopoHandlerIdx); + CmiSetHandler((char *)msg, cpuTopoHandlerIdx); int n = 0; - hostnameMsg* m = (hostnameMsg*)data; - m->procs = (_procInfo*)((char*)m + sizeof(hostnameMsg)); - for (j = 0; j < m->n; j++) msg->procs[n++] = m->procs[j]; - for (i = 0; i < count; i++) - { - m = (hostnameMsg*)remote[i]; - m->procs = (_procInfo*)((char*)m + sizeof(hostnameMsg)); - for (j = 0; j < m->n; j++) msg->procs[n++] = m->procs[j]; + hostnameMsg *m = (hostnameMsg *)data; + m->procs = (_procInfo *)((char *)m + sizeof(hostnameMsg)); + for (j = 0; j < m->n; j++) + msg->procs[n++] = m->procs[j]; + for (i = 0; i < count; i++) { + m = (hostnameMsg *)remote[i]; + m->procs = (_procInfo *)((char *)m + sizeof(hostnameMsg)); + for (j = 0; j < m->n; j++) + msg->procs[n++] = m->procs[j]; } return msg; } @@ -371,9 +353,8 @@ static void* combineMessage(int* size, void* data, void** remote, int count) int LrtsCpuTopoEnabled() { return CpuTopology::supported; } -int LrtsPeOnSameNode(int pe1, int pe2) -{ - int* nodeIDs = cpuTopo.nodeIDs; +int LrtsPeOnSameNode(int pe1, int pe2) { + int *nodeIDs = cpuTopo.nodeIDs; if (!cpuTopo.supported || nodeIDs == NULL) return CmiNodeOf(pe1) == CmiNodeOf(pe2); else @@ -381,66 +362,62 @@ int LrtsPeOnSameNode(int pe1, int pe2) } // return -1 when not supported -int LrtsNumNodes() -{ +int LrtsNumNodes() { if (!cpuTopo.supported) return CmiNumNodes(); else return cpuTopo.numUniqNodes(); } -int LrtsNodeSize(int node) -{ - return !cpuTopo.supported ? CmiNodeSize(node) : (int)cpuTopo.bynodes[node].size(); +int LrtsNodeSize(int node) { + return !cpuTopo.supported ? CmiNodeSize(node) + : (int)cpuTopo.bynodes[node].size(); } // pelist points to system memory, user should not free it -void LrtsPeOnNode(int node, int** pelist, int* num) -{ +void LrtsPeOnNode(int node, int **pelist, int *num) { *num = cpuTopo.bynodes[node].size(); if (pelist != NULL && *num > 0) *pelist = cpuTopo.bynodes[node].data(); } -int LrtsRankOf(int pe) -{ +int LrtsRankOf(int pe) { if (!cpuTopo.supported) return CmiRankOf(pe); - const std::vector& v = cpuTopo.bynodes[cpuTopo.nodeIDs[pe]]; + const std::vector &v = cpuTopo.bynodes[cpuTopo.nodeIDs[pe]]; int rank = 0; int npes = v.size(); - while (rank < npes && v[rank] < pe) rank++; // already sorted + while (rank < npes && v[rank] < pe) + rank++; // already sorted CmiAssert(v[rank] == pe); return rank; } -int LrtsNodeOf(int pe) -{ +int LrtsNodeOf(int pe) { if (!cpuTopo.supported) return CmiNodeOf(pe); return cpuTopo.nodeIDs[pe]; } // the least number processor on the same physical node -int LrtsNodeFirst(int node) -{ +int LrtsNodeFirst(int node) { if (!cpuTopo.supported) return CmiNodeFirst(node); return cpuTopo.bynodes[node][0]; } -void LrtsInitCpuTopo(char** argv) -{ +void LrtsInitCpuTopo(char **argv) { static skt_ip_t myip; double startT; - int obtain_flag = 1; // default on - int show_flag = 0; // default not show topology + int obtain_flag = 1; // default on + int show_flag = 0; // default not show topology -# if __FAULT__ +#if __FAULT__ obtain_flag = 0; -# endif - if (CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", "obtain cpu topology info")) +#endif + if (CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", + "obtain cpu topology info")) obtain_flag = 1; if (CmiGetArgFlagDesc(argv, "+skip_cpu_topology", "skip the processof getting cpu topology info")) @@ -448,40 +425,37 @@ void LrtsInitCpuTopo(char** argv) if (CmiGetArgFlagDesc(argv, 
"+show_cpu_topology", "Show cpu topology info")) show_flag = 1; - CmiAssignOnce(&cpuTopoHandlerIdx, CmiRegisterHandler((CmiHandler)cpuTopoHandler)); + CmiAssignOnce(&cpuTopoHandlerIdx, + CmiRegisterHandler((CmiHandler)cpuTopoHandler)); CmiAssignOnce(&cpuTopoRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuTopoRecvHandler)); - if (!obtain_flag) - { + if (!obtain_flag) { if (CmiMyRank() == 0) cpuTopo.sort(); CmiNodeAllBarrier(); - CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks + CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks return; } - if (CmiMyPe() == 0) - { + if (CmiMyPe() == 0) { startT = CmiWallTimer(); } -# if 0 +#if 0 if (gethostname(hostname, 999)!=0) { strcpy(hostname, ""); } -# endif -# if CMK_CRAYXE || CMK_CRAYXC || CMK_CRAYEX - if (CmiMyRank() == 0) - { +#endif +#if CMK_CRAYXE || CMK_CRAYXC || CMK_CRAYEX + if (CmiMyRank() == 0) { int numPes = cpuTopo.numPes = CmiNumPes(); int numNodes = CmiNumNodes(); cpuTopo.nodeIDs = new int[numPes]; CpuTopology::supported = 1; int nid; - for (int i = 0; i < numPes; i++) - { + for (int i = 0; i < numPes; i++) { nid = getXTNodeID(CmiNodeOf(i), numNodes); cpuTopo.nodeIDs[i] = nid; } @@ -490,14 +464,11 @@ void LrtsInitCpuTopo(char** argv) // this assumes that all cores on a node have consecutive MPI rank IDs // and then changes nodeIDs to 0 to numNodes-1 - for (int i = 0; i < numPes; i++) - { - if (cpuTopo.nodeIDs[i] != prev) - { + for (int i = 0; i < numPes; i++) { + if (cpuTopo.nodeIDs[i] != prev) { prev = cpuTopo.nodeIDs[i]; cpuTopo.nodeIDs[i] = ++nid; - } - else + } else cpuTopo.nodeIDs[i] = nid; } cpuTopo.sort(); @@ -506,11 +477,10 @@ void LrtsInitCpuTopo(char** argv) } CmiNodeAllBarrier(); -# else +#else /* get my ip address */ - if (CmiMyRank() == 0) - { + if (CmiMyRank() == 0) { myip = skt_my_ip(); /* not thread safe, so only calls on rank 0 */ // fprintf(stderr, "[%d] IP is %d.%d.%d.%d\n", CmiMyPe(), // myip.data[0],myip.data[1],myip.data[2],myip.data[3]); @@ -518,27 +488,25 @@ void LrtsInitCpuTopo(char** argv) } CmiNodeAllBarrier(); - if (_noip) - { + if (_noip) { if (CmiMyRank() == 0) cpuTopo.sort(); - CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks + CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks return; } -# if CMK_SMP && !CMK_SMP_NO_COMMTHD - if (CmiInCommThread()) - { +#if CMK_SMP && !CMK_SMP_NO_COMMTHD + if (CmiInCommThread()) { cpuTopoSyncWaitCommThread(cpuTopoSyncCommThreadDone); - } - else -# endif + } else +#endif { /* prepare a msg to send */ - hostnameMsg* msg = (hostnameMsg*)CmiAlloc(sizeof(hostnameMsg) + sizeof(_procInfo)); + hostnameMsg *msg = + (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg) + sizeof(_procInfo)); msg->n = 1; - msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg)); - CmiSetHandler((char*)msg, cpuTopoHandlerIdx); + msg->procs = (_procInfo *)((char *)msg + sizeof(hostnameMsg)); + CmiSetHandler((char *)msg, cpuTopoHandlerIdx); auto proc = &msg->procs[0]; proc->pe = CmiMyPe(); proc->ip = myip; @@ -550,44 +518,40 @@ void LrtsInitCpuTopo(char** argv) cpuTopoSyncWait(cpuTopoSyncHandlerDone); - if (CmiMyRank() == 0) - { - if (CmiMyPe() == 0) - { - CmiSyncNodeBroadcastAllAndFree(sizeof(nodeTopoMsg) + CmiNumPes() * sizeof(int), - (char*)topomsg); + if (CmiMyRank() == 0) { + if (CmiMyPe() == 0) { + CmiSyncNodeBroadcastAllAndFree( + sizeof(nodeTopoMsg) + CmiNumPes() * sizeof(int), (char *)topomsg); CsdSchedulePoll(); } -# if CMK_SMP && !CMK_SMP_NO_COMMTHD +#if CMK_SMP && !CMK_SMP_NO_COMMTHD cpuTopoSyncCommThreadDone = true; -# endif +#endif } } CmiBarrier(); - if (CmiMyPe() == 0) 
- { + if (CmiMyPe() == 0) { CmiPrintf("Charm++> cpu topology info is gathered in %.3f seconds.\n", CmiWallTimer() - startT); } -# endif +#endif // now every one should have the node info - CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks + CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks if (CmiMyPe() == 0 && show_flag) cpuTopo.print(); } #else /* not supporting cpu topology */ -extern "C" void LrtsInitCpuTopo(char** argv) -{ +extern "C" void LrtsInitCpuTopo(char **argv) { /* do nothing */ - int obtain_flag = - CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", "obtain cpu topology info"); + int obtain_flag = CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", + "obtain cpu topology info"); CmiGetArgFlagDesc(argv, "+skip_cpu_topology", "skip the processof getting cpu topology info"); CmiGetArgFlagDesc(argv, "+show_cpu_topology", "Show cpu topology info"); @@ -596,14 +560,15 @@ extern "C" void LrtsInitCpuTopo(char** argv) #endif int CmiCpuTopologyEnabled() { return LrtsCpuTopoEnabled(); } -int CmiPeOnSamePhysicalNode(int pe1, int pe2) { return LrtsPeOnSameNode(pe1, pe2); } +int CmiPeOnSamePhysicalNode(int pe1, int pe2) { + return LrtsPeOnSameNode(pe1, pe2); +} int CmiNumPhysicalNodes() { return LrtsNumNodes(); } int CmiNumPesOnPhysicalNode(int node) { return LrtsNodeSize(node); } -void CmiGetPesOnPhysicalNode(int node, int** pelist, int* num) -{ +void CmiGetPesOnPhysicalNode(int node, int **pelist, int *num) { LrtsPeOnNode(node, pelist, num); } int CmiPhysicalRank(int pe) { return LrtsRankOf(pe); } int CmiPhysicalNodeID(int pe) { return LrtsNodeOf(pe); } int CmiGetFirstPeOnPhysicalNode(int node) { return LrtsNodeFirst(node); } -void CmiInitCPUTopology(char** argv) { LrtsInitCpuTopo(argv); } +void CmiInitCPUTopology(char **argv) { LrtsInitCpuTopo(argv); } diff --git a/src/conv-topology.h b/src/conv-topology.h index 6091937..12ac3c4 100644 --- a/src/conv-topology.h +++ b/src/conv-topology.h @@ -1,12 +1,11 @@ #include "converse.h" #include -typedef struct {/*IPv4 IP address*/ +typedef struct { /*IPv4 IP address*/ unsigned char data[4]; } skt_ip_t; extern skt_ip_t _skt_invalid_ip; -static inline bool operator< (const skt_ip_t & a, const skt_ip_t & b) -{ +static inline bool operator<(const skt_ip_t &a, const skt_ip_t &b) { return memcmp(&a, &b, sizeof(a)) < 0; } \ No newline at end of file diff --git a/src/convcore.cpp b/src/convcore.cpp index a040708..70f2f5b 100644 --- a/src/convcore.cpp +++ b/src/convcore.cpp @@ -10,9 +10,9 @@ #include #include #include +#include #include #include -#include // GLOBALS static char **Cmi_argv; @@ -47,7 +47,7 @@ int _replaySystem = 0; CmiNodeLock CmiMemLock_lock; CpvDeclare(int, isHelperOn); -//partition +// partition PartitionInfo _partitionInfo; int _Cmi_mype_global; int _Cmi_numpes_global; @@ -77,12 +77,10 @@ int Cmi_exitHandler; comm_backend::AmHandler g_amHandler; CpvStaticDeclare(double, clocktick); -CpvStaticDeclare(int,inittime_wallclock); -CpvStaticDeclare(int,inittime_virtual); +CpvStaticDeclare(int, inittime_wallclock); +CpvStaticDeclare(int, inittime_virtual); -void registerTraceInit(void (*fn)(char **argv)) { - CmiTraceFn = fn; -} +void registerTraceInit(void (*fn)(char **argv)) { CmiTraceFn = fn; } void CommLocalHandler(comm_backend::Status status) { CmiFree(const_cast(status.local_buf)); @@ -236,7 +234,7 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, #ifdef CMK_HAS_PARTITION CmiCreatePartitions(argv); - #else +#else _partitionInfo.type = PARTITION_SINGLETON; _partitionInfo.numPartitions = 1; 
_partitionInfo.myPartition = 0; @@ -244,7 +242,7 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, _Cmi_mynode_global = Cmi_mynode; _Cmi_numpes_global = Cmi_npes; Cmi_nodestartGlobal = _Cmi_mynode_global * Cmi_mynodesize; - #endif +#endif CmiStartThreads(); } @@ -1425,7 +1423,4 @@ void CmiCreatePartitions(char **argv) { // Since we are not implementing converse level seed balancers yet void LBTopoInit() {} -int CmiDeliverMsgs(int maxmsgs) -{ - return CsdScheduler(maxmsgs); -} +int CmiDeliverMsgs(int maxmsgs) { return CsdScheduler(maxmsgs); } diff --git a/src/converse_internal.h b/src/converse_internal.h index 32ce4c6..49b0db5 100644 --- a/src/converse_internal.h +++ b/src/converse_internal.h @@ -3,8 +3,8 @@ #ifndef CONVCORE_H #define CONVCORE_H -#include #include "converse.h" +#include #include "converse.h" #include "converse_config.h" @@ -23,8 +23,8 @@ typedef struct GroupDef_s { #define GROUPTAB_SIZE 101 -//debug -#define DEBUGF(...) //CmiPrintf(__VA_ARGS__) +// debug +#define DEBUGF(...) // CmiPrintf(__VA_ARGS__) void CmiStartThreads(char **argv); void converseRunPe(int rank); @@ -41,8 +41,8 @@ void CmiGroupHandler(void *msg); void CmiReduceHandler(void *msg); typedef struct HandlerInfo { - union{ - CmiHandler hdlr; // handler function + union { + CmiHandler hdlr; // handler function CmiHandlerEx exhdlr; // handler function with user pointer }; void *userPtr; // does this point to the mesage data itself diff --git a/src/cpuaffinity.cpp b/src/cpuaffinity.cpp index d4245c9..783368e 100644 --- a/src/cpuaffinity.cpp +++ b/src/cpuaffinity.cpp @@ -18,9 +18,10 @@ #define _GNU_SOURCE #endif -static int affMsgsRecvd = 1; // number of affinity messages received at PE0 +static int affMsgsRecvd = 1; // number of affinity messages received at PE0 #if defined(CPU_OR) -static cpu_set_t core_usage; // used to record union of CPUs used by every PE in physical node +static cpu_set_t core_usage; // used to record union of CPUs used by every PE in + // physical node #endif static int aff_is_set = 0; @@ -28,9 +29,9 @@ static std::atomic cpuPhyAffCheckDone{}; struct affMsg { char core[CmiMsgHeaderSizeBytes]; - #if defined(CPU_OR) +#if defined(CPU_OR) cpu_set_t affinity; - #endif +#endif }; CmiHwlocTopology CmiHwlocTopologyLocal; @@ -41,92 +42,101 @@ static int cpuPhyNodeAffinityRecvHandlerIdx; // CmiNumCores static hwloc_topology_t topology, legacy_topology; -int CmiNumCores(void) -{ +int CmiNumCores(void) { // PU count is the intended output here rather than literal cores return CmiHwlocTopologyLocal.total_num_pus; } -static int search_pemap(char *pecoremap, int pe) -{ - int *map = (int *)malloc(CmiNumPesGlobal()*sizeof(int)); +static int search_pemap(char *pecoremap, int pe) { + int *map = (int *)malloc(CmiNumPesGlobal() * sizeof(int)); char *ptr = NULL; int h, i, j, k, count; int plusarr[128]; char *str; - char *mapstr = (char*)malloc(strlen(pecoremap)+1); + char *mapstr = (char *)malloc(strlen(pecoremap) + 1); strcpy(mapstr, pecoremap); str = strtok_r(mapstr, ",", &ptr); count = 0; - while (str && count < CmiNumPesGlobal()) - { - int hasdash=0, hascolon=0, hasdot=0, hasstar1=0, hasstar2=0, numplus=0; - int start, end, stride=1, block=1; - int iter=1; - plusarr[0] = 0; - for (i=0; i stride) { - printf("Warning: invalid block size in \"%s\" ignored.\n", str); - block=1; + } + if (hasstar1 || hasstar2) { + if (hasstar1) + sscanf(str, "%dx", &iter); + if (hasstar2) + sscanf(str, "%dX", &iter); + while (*str != 'x' && *str != 'X') + str++; + str++; + } + if (hasdash) { + if (hascolon) { + 
if (hasdot) { + if (sscanf(str, "%d-%d:%d.%d", &start, &end, &stride, &block) != 4) + printf("Warning: Check the format of \"%s\".\n", str); + } else { + if (sscanf(str, "%d-%d:%d", &start, &end, &stride) != 3) + printf("Warning: Check the format of \"%s\".\n", str); + } + } else { + if (sscanf(str, "%d-%d", &start, &end) != 2) + printf("Warning: Check the format of \"%s\".\n", str); } - //if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: %d. plus %d \n", iter, start, end, stride, block, numplus); - for (k = 0; kend) break; - for (h=0; h<=numplus; h++) { - map[count++] = i+j+plusarr[h]; - if (count == CmiNumPesGlobal()) break; - } - if (count == CmiNumPesGlobal()) break; + } else { + sscanf(str, "%d", &start); + end = start; + } + if (block > stride) { + printf("Warning: invalid block size in \"%s\" ignored.\n", str); + block = 1; + } + // if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: + // %d. plus %d \n", iter, start, end, stride, block, numplus); + for (k = 0; k < iter; k++) { + for (i = start; i <= end; i += stride) { + for (j = 0; j < block; j++) { + if (i + j > end) + break; + for (h = 0; h <= numplus; h++) { + map[count++] = i + j + plusarr[h]; + if (count == CmiNumPesGlobal()) + break; } - if (count == CmiNumPesGlobal()) break; + if (count == CmiNumPesGlobal()) + break; } - if (count == CmiNumPesGlobal()) break; + if (count == CmiNumPesGlobal()) + break; } - str = strtok_r(NULL, ",", &ptr); + if (count == CmiNumPesGlobal()) + break; + } + str = strtok_r(NULL, ",", &ptr); } i = map[pe % count]; @@ -135,8 +145,7 @@ static int search_pemap(char *pecoremap, int pe) return i; } -static void cpuAffSyncWait(std::atomic & done) -{ +static void cpuAffSyncWait(std::atomic &done) { do CsdSchedulePoll(); while (!done.load()); @@ -144,8 +153,7 @@ static void cpuAffSyncWait(std::atomic & done) CsdSchedulePoll(); } -static void cpuPhyNodeAffinityRecvHandler(void *msg) -{ +static void cpuPhyNodeAffinityRecvHandler(void *msg) { static int count = 0; affMsg *m = (affMsg *)msg; @@ -162,7 +170,8 @@ static void cpuPhyNodeAffinityRecvHandler(void *msg) #if defined(CPU_OR) int get_thread_affinity(cpu_set_t *cpuset) { CPU_ZERO(cpuset); - if ((errno = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset))) { + if ((errno = + pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset))) { perror("pthread_getaffinity"); return -1; } @@ -171,9 +180,7 @@ int get_thread_affinity(cpu_set_t *cpuset) { #endif #if defined(CPU_OR) -int get_affinity(cpu_set_t *cpuset) { - return get_thread_affinity(cpuset); -} +int get_affinity(cpu_set_t *cpuset) { return get_thread_affinity(cpuset); } #endif void CmiInitHwlocTopology(void) { @@ -218,12 +225,11 @@ void CmiInitHwlocTopology(void) { : 1; } -static int set_process_affinity(hwloc_cpuset_t cpuset) -{ +static int set_process_affinity(hwloc_cpuset_t cpuset) { pid_t process = getpid(); - #define PRINTF_PROCESS "%d" - if (hwloc_set_proc_cpubind(topology, process, cpuset, HWLOC_CPUBIND_PROCESS|HWLOC_CPUBIND_STRICT)) - { +#define PRINTF_PROCESS "%d" + if (hwloc_set_proc_cpubind(topology, process, cpuset, + HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT)) { char *str; int error = errno; hwloc_bitmap_asprintf(&str, cpuset); @@ -232,15 +238,13 @@ static int set_process_affinity(hwloc_cpuset_t cpuset) return -1; } return 0; - #undef PRINTF_PROCESS +#undef PRINTF_PROCESS } - -static int set_thread_affinity(hwloc_cpuset_t cpuset) -{ +static int set_thread_affinity(hwloc_cpuset_t cpuset) { pthread_t thread = 
pthread_self(); - if (hwloc_set_thread_cpubind(topology, thread, cpuset, HWLOC_CPUBIND_THREAD|HWLOC_CPUBIND_STRICT)) - { + if (hwloc_set_thread_cpubind(topology, thread, cpuset, + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT)) { char *str; int error = errno; hwloc_bitmap_asprintf(&str, cpuset); @@ -251,61 +255,63 @@ static int set_thread_affinity(hwloc_cpuset_t cpuset) return 0; } -static void bind_process_only(hwloc_obj_type_t process_unit) -{ +static void bind_process_only(hwloc_obj_type_t process_unit) { hwloc_cpuset_t cpuset; int process_unitcount = hwloc_get_nbobjs_by_type(topology, process_unit); int process_assignment = CmiMyRank() % process_unitcount; - hwloc_obj_t process_obj = hwloc_get_obj_by_type(topology, process_unit, process_assignment); + hwloc_obj_t process_obj = + hwloc_get_obj_by_type(topology, process_unit, process_assignment); set_process_affinity(process_obj->cpuset); } -static void bind_threads_only(hwloc_obj_type_t thread_unit) -{ +static void bind_threads_only(hwloc_obj_type_t thread_unit) { hwloc_cpuset_t cpuset; int thread_unitcount = hwloc_get_nbobjs_by_type(topology, thread_unit); int thread_assignment = CmiMyRank() % thread_unitcount; - hwloc_obj_t thread_obj = hwloc_get_obj_by_type(topology, thread_unit, thread_assignment); + hwloc_obj_t thread_obj = + hwloc_get_obj_by_type(topology, thread_unit, thread_assignment); hwloc_cpuset_t thread_cpuset = hwloc_bitmap_dup(thread_obj->cpuset); hwloc_bitmap_singlify(thread_cpuset); set_thread_affinity(thread_cpuset); hwloc_bitmap_free(thread_cpuset); } -static void bind_process_and_threads(hwloc_obj_type_t process_unit, hwloc_obj_type_t thread_unit) -{ +static void bind_process_and_threads(hwloc_obj_type_t process_unit, + hwloc_obj_type_t thread_unit) { hwloc_cpuset_t cpuset; int process_unitcount = hwloc_get_nbobjs_by_type(topology, process_unit); int process_assignment = CmiMyRank() % process_unitcount; - hwloc_obj_t process_obj = hwloc_get_obj_by_type(topology, process_unit, process_assignment); + hwloc_obj_t process_obj = + hwloc_get_obj_by_type(topology, process_unit, process_assignment); set_process_affinity(process_obj->cpuset); - int thread_unitcount = hwloc_get_nbobjs_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit); + int thread_unitcount = hwloc_get_nbobjs_inside_cpuset_by_type( + topology, process_obj->cpuset, thread_unit); int thread_assignment = CmiMyRank() % thread_unitcount; - hwloc_obj_t thread_obj = hwloc_get_obj_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit, thread_assignment); + hwloc_obj_t thread_obj = hwloc_get_obj_inside_cpuset_by_type( + topology, process_obj->cpuset, thread_unit, thread_assignment); hwloc_cpuset_t thread_cpuset = hwloc_bitmap_dup(thread_obj->cpuset); hwloc_bitmap_singlify(thread_cpuset); set_thread_affinity(thread_cpuset); hwloc_bitmap_free(thread_cpuset); } -static int set_default_affinity(void){ +static int set_default_affinity(void) { char *s; int n = -1; - if ((s = getenv("CmiProcessPerSocket"))) - { + if ((s = getenv("CmiProcessPerSocket"))) { n = atoi(s); if (getenv("CmiOneWthPerCore")) bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE); @@ -313,34 +319,24 @@ static int set_default_affinity(void){ bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_PU); else bind_process_only(HWLOC_OBJ_PACKAGE); - } - else if ((s = getenv("CmiProcessPerCore"))) - { + } else if ((s = getenv("CmiProcessPerCore"))) { n = atoi(s); if (getenv("CmiOneWthPerPU")) bind_process_and_threads(HWLOC_OBJ_CORE, HWLOC_OBJ_PU); else 
bind_process_only(HWLOC_OBJ_CORE); - } - else if ((s = getenv("CmiProcessPerPU"))) - { + } else if ((s = getenv("CmiProcessPerPU"))) { n = atoi(s); bind_process_only(HWLOC_OBJ_PU); - } - else // if ((s = getenv("CmiProcessPerHost"))) + } else // if ((s = getenv("CmiProcessPerHost"))) { - if (getenv("CmiOneWthPerSocket")) - { + if (getenv("CmiOneWthPerSocket")) { n = 0; bind_threads_only(HWLOC_OBJ_PACKAGE); - } - else if (getenv("CmiOneWthPerCore")) - { + } else if (getenv("CmiOneWthPerCore")) { n = 0; bind_threads_only(HWLOC_OBJ_CORE); - } - else if (getenv("CmiOneWthPerPU")) - { + } else if (getenv("CmiOneWthPerPU")) { n = 0; bind_threads_only(HWLOC_OBJ_PU); } @@ -350,59 +346,65 @@ static int set_default_affinity(void){ } void CmiInitCPUAffinity(char **argv) { - #if defined(CPU_OR) - // check for flags - int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity", "set cpu affinity"); - char *pemap = NULL; - // 0 if OS-assigned, 1 if logical hwloc assigned - // for now, stick with os-assigned only - // also no commap, we have no commthreads - int pemap_logical_flag = 0; - CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping"); - if (pemap!=NULL) affinity_flag = 1; - CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuPhyNodeAffinityRecvHandler)); - // setting default affinity (always needed, not the same as setting cpu affinity) - int done = 0; - CmiNodeAllBarrier(); - /* must bind the rank 0 which is the main thread first */ - /* binding the main thread seems to change binding for all threads */ - if (CmiMyRank() == 0) { - done = set_default_affinity(); - } - CmiNodeAllBarrier(); - if (CmiMyRank() != 0) { - done = set_default_affinity(); - } - if (done) { - return; - } - //set cmi affinity - if (!affinity_flag) { - if (CmiMyPe() == 0) CmiPrintf("Charm++> cpu affinity NOT enabled.\n"); - return; - } - if (CmiMyPe() == 0) { - CmiPrintf("Charm++> cpu affinity enabled. \n"); - if (pemap!=NULL) - CmiPrintf("Charm++> cpuaffinity PE-core map (%s): %s\n", - pemap_logical_flag ? 
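As a quick reference for the default-binding policy in set_default_affinity() above: the environment variables choose the hwloc granularity of the process bind and, optionally, of the per-thread bind. The standalone sketch below is a paraphrase of that dispatch for documentation only, not the actual implementation; it merely prints which binding the chain above would select.

#include <cstdio>
#include <cstdlib>

// Hypothetical paraphrase of the env-var dispatch in set_default_affinity().
int main() {
  if (std::getenv("CmiProcessPerSocket"))
    std::puts(std::getenv("CmiOneWthPerCore") ? "process->socket, thread->core"
              : std::getenv("CmiOneWthPerPU") ? "process->socket, thread->PU"
                                              : "process->socket");
  else if (std::getenv("CmiProcessPerCore"))
    std::puts(std::getenv("CmiOneWthPerPU") ? "process->core, thread->PU"
                                            : "process->core");
  else if (std::getenv("CmiProcessPerPU"))
    std::puts("process->PU");
  else if (std::getenv("CmiOneWthPerSocket"))
    std::puts("thread->socket");
  else if (std::getenv("CmiOneWthPerCore"))
    std::puts("thread->core");
  else if (std::getenv("CmiOneWthPerPU"))
    std::puts("thread->PU");
  else
    std::puts("no default binding selected");
  return 0;
}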
"logical indices" : "OS indices", pemap); - } - // if a pemap is provided - if (pemap != NULL){ - int mycore = search_pemap(pemap, CmiMyPeGlobal()); - if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("CmiSetCPUAffinity failed!"); - } - // if we are just using +setcpuaffinity - else { - CmiPrintf("Charm++> +setcpuaffinity implementation in progress\n"); - } - #endif - CmiNodeAllBarrier(); +#if defined(CPU_OR) + // check for flags + int affinity_flag = + CmiGetArgFlagDesc(argv, "+setcpuaffinity", "set cpu affinity"); + char *pemap = NULL; + // 0 if OS-assigned, 1 if logical hwloc assigned + // for now, stick with os-assigned only + // also no commap, we have no commthreads + int pemap_logical_flag = 0; + CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping"); + if (pemap != NULL) + affinity_flag = 1; + CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx, + CmiRegisterHandler((CmiHandler)cpuPhyNodeAffinityRecvHandler)); + // setting default affinity (always needed, not the same as setting cpu + // affinity) + int done = 0; + CmiNodeAllBarrier(); + /* must bind the rank 0 which is the main thread first */ + /* binding the main thread seems to change binding for all threads */ + if (CmiMyRank() == 0) { + done = set_default_affinity(); + } + CmiNodeAllBarrier(); + if (CmiMyRank() != 0) { + done = set_default_affinity(); + } + if (done) { + return; + } + // set cmi affinity + if (!affinity_flag) { + if (CmiMyPe() == 0) + CmiPrintf("Charm++> cpu affinity NOT enabled.\n"); + return; + } + if (CmiMyPe() == 0) { + CmiPrintf("Charm++> cpu affinity enabled. \n"); + if (pemap != NULL) + CmiPrintf("Charm++> cpuaffinity PE-core map (%s): %s\n", + pemap_logical_flag ? "logical indices" : "OS indices", pemap); + } + // if a pemap is provided + if (pemap != NULL) { + int mycore = search_pemap(pemap, CmiMyPeGlobal()); + if (CmiSetCPUAffinity(mycore) == -1) + CmiAbort("CmiSetCPUAffinity failed!"); + } + // if we are just using +setcpuaffinity + else { + CmiPrintf("Charm++> +setcpuaffinity implementation in progress\n"); + } +#endif + CmiNodeAllBarrier(); } // Uses PU indices assigned by the OS int CmiSetCPUAffinity(int mycore) { - #if defined(CPU_OR) +#if defined(CPU_OR) int core = mycore; if (core < 0) { printf("Error with core number"); @@ -431,34 +433,34 @@ int CmiSetCPUAffinity(int mycore) { CmiMyPe(), mycore); return result; - #else +#else return -1; - #endif +#endif } -void CmiCheckAffinity(void) -{ - #if defined(CPU_OR) - if (!CmiCpuTopologyEnabled()) return; // only works if cpu topology enabled +void CmiCheckAffinity(void) { +#if defined(CPU_OR) + if (!CmiCpuTopologyEnabled()) + return; // only works if cpu topology enabled if (CmiNumPes() == 1) return; - if (CmiMyPe() == 0) - { - // wait for every PE affinity from my physical node (for now only done on phy node 0) + if (CmiMyPe() == 0) { + // wait for every PE affinity from my physical node (for now only done on + // phy node 0) cpu_set_t my_aff; - if (get_affinity(&my_aff) == -1) CmiAbort("get_affinity failed\n"); + if (get_affinity(&my_aff) == -1) + CmiAbort("get_affinity failed\n"); CPU_OR(&core_usage, &core_usage, &my_aff); // add my affinity (pe0) cpuAffSyncWait(cpuPhyAffCheckDone); - } - else if (CmiPhysicalNodeID(CmiMyPe()) == 0) - { - // send my affinity to first PE on physical node (only done on phy node 0 for now) - affMsg *m = (affMsg*)CmiAlloc(sizeof(affMsg)); + } else if (CmiPhysicalNodeID(CmiMyPe()) == 0) { + // send my affinity to first PE on physical node (only done on phy node 0 + // for now) + affMsg *m = (affMsg 
*)CmiAlloc(sizeof(affMsg)); CmiSetHandler((char *)m, cpuPhyNodeAffinityRecvHandlerIdx); if (get_affinity(&m->affinity) == -1) { // put my affinity in msg CmiFree(m); @@ -471,8 +473,7 @@ void CmiCheckAffinity(void) CmiBarrier(); - if (CmiMyPe() == 0) - { + if (CmiMyPe() == 0) { // NOTE this test is simple and may not detect every possible case of // oversubscription const int N = CmiNumPesOnPhysicalNode(0); @@ -480,15 +481,17 @@ void CmiCheckAffinity(void) // TODO suggest command line arguments? if (!aff_is_set) { CmiAbort("Multiple PEs assigned to same core. Set affinity " - "options to correct or lower the number of threads, or pass +setcpuaffinity to ignore.\n"); + "options to correct or lower the number of threads, or pass " + "+setcpuaffinity to ignore.\n"); } else { - CmiPrintf("WARNING: Multiple PEs assigned to same core, recommend " - "adjusting processor affinity or passing +CmiSleepOnIdle to reduce " - "interference.\n"); + CmiPrintf( + "WARNING: Multiple PEs assigned to same core, recommend " + "adjusting processor affinity or passing +CmiSleepOnIdle to reduce " + "interference.\n"); } } } - #endif +#endif } #else // Dummy function if RECONVERSE_ENABLE_CPU_AFFINITY not set diff --git a/src/msgmgr.cpp b/src/msgmgr.cpp index 22e8cb4..849fd8a 100644 --- a/src/msgmgr.cpp +++ b/src/msgmgr.cpp @@ -1,101 +1,112 @@ -#include #include +#include -#define CmiAlloc malloc -#define CmiFree free +#define CmiAlloc malloc +#define CmiFree free typedef struct CmmEntryStruct *CmmEntry; -struct CmmEntryStruct -{ +struct CmmEntryStruct { CmmEntry next; - void *msg; - int ntags; - int tags[1]; + void *msg; + int ntags; + int tags[1]; }; -struct CmmTableStruct -{ - CmmEntry first; +struct CmmTableStruct { + CmmEntry first; CmmEntry *lasth; }; - -CmmTable CmmNew(void) -{ +CmmTable CmmNew(void) { CmmTable result = (CmmTable)CmiAlloc(sizeof(struct CmmTableStruct)); result->first = 0; result->lasth = &(result->first); return result; } -void CmmFree(CmmTable t) -{ - if (t==NULL) return; - if (t->first!=NULL) CmiAbort("Cannot free a non-empty message table!"); +void CmmFree(CmmTable t) { + if (t == NULL) + return; + if (t->first != NULL) + CmiAbort("Cannot free a non-empty message table!"); CmiFree(t); } /* free all table entries but not the space pointed by "msg" */ -void CmmFreeAll(CmmTable t){ - CmmEntry cur; - if(t==NULL) return; - cur = t->first; - while(cur){ - CmmEntry toDel = cur; - cur = cur->next; - CmiFree(toDel); - } +void CmmFreeAll(CmmTable t) { + CmmEntry cur; + if (t == NULL) + return; + cur = t->first; + while (cur) { + CmmEntry toDel = cur; + cur = cur->next; + CmiFree(toDel); + } } -void CmmPut(CmmTable t, int ntags, int *tags, void *msg) -{ +void CmmPut(CmmTable t, int ntags, int *tags, void *msg) { int i; - CmmEntry e=(CmmEntry)CmiAlloc(sizeof(struct CmmEntryStruct)+(ntags*sizeof(int))); + CmmEntry e = + (CmmEntry)CmiAlloc(sizeof(struct CmmEntryStruct) + (ntags * sizeof(int))); e->next = 0; e->msg = msg; e->ntags = ntags; - for (i=0; itags[i] = tags[i]; + for (i = 0; i < ntags; i++) + e->tags[i] = tags[i]; *(t->lasth) = e; t->lasth = &(e->next); } -static int CmmTagsMatch(int ntags1, int *tags1, int ntags2, int *tags2) -{ +static int CmmTagsMatch(int ntags1, int *tags1, int ntags2, int *tags2) { int ntags = ntags1; - if (ntags1 != ntags2) return 0; + if (ntags1 != ntags2) + return 0; while (1) { int tag1, tag2; - if (ntags == 0) return 1; + if (ntags == 0) + return 1; ntags--; tag1 = *tags1++; tag2 = *tags2++; - if (tag1==tag2) continue; - if (tag1==CmmWildCard) continue; - if 
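For readers unfamiliar with the message-manager API being reformatted in src/msgmgr.cpp: CmmPut files a message under a tag vector and CmmFind (just below) retrieves it, with CmmWildCard matching any single tag on either side. A small usage sketch, assuming the declarations come from converse.h as in classic Converse:

#include "converse.h"

// Usage sketch only; CmmNew/CmmPut/CmmFind/CmmFree and CmmWildCard are the
// APIs shown in this file.
void cmm_example(void *msg) {
  CmmTable t = CmmNew();
  int tags[2] = {/*src*/ 3, /*seq*/ 7};
  CmmPut(t, 2, tags, msg);

  int want[2] = {3, CmmWildCard}; // any sequence number from source 3
  int got[2];
  void *found = CmmFind(t, 2, want, got, /*del=*/1); // got[] becomes {3, 7}
  // found == msg, and the entry was removed because del != 0.
  (void)found;
  CmmFree(t); // legal: the table is empty again
}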
(tag2==CmmWildCard) continue;
+    if (tag1 == tag2)
+      continue;
+    if (tag1 == CmmWildCard)
+      continue;
+    if (tag2 == CmmWildCard)
+      continue;
     return 0;
   }
 }
 
-void *CmmFind(CmmTable t, int ntags, int *tags, int *rtags, int del)
-{
-  CmmEntry *enth; CmmEntry ent; void *msg; int i;
-  /* Added by Chao Mei to handle the case where t is already freed, which happens in ~ampi() when doing out-of-core emulation for AMPI programs. */
-  if(t==NULL) return NULL;
+void *CmmFind(CmmTable t, int ntags, int *tags, int *rtags, int del) {
+  CmmEntry *enth;
+  CmmEntry ent;
+  void *msg;
+  int i;
+  /* Added by Chao Mei to handle the case where t is already freed, which
+   * happens in ~ampi() when doing out-of-core emulation for AMPI programs. */
+  if (t == NULL)
+    return NULL;
   enth = &(t->first);
   while (1) {
     ent = (*enth);
-    if (ent==0) return 0;
+    if (ent == 0)
+      return 0;
     if (CmmTagsMatch(ntags, tags, ent->ntags, ent->tags)) {
-      if (rtags) for (i=0; i<ntags; i++) rtags[i] = ent->tags[i];
+      if (rtags)
+        for (i = 0; i < ntags; i++)
+          rtags[i] = ent->tags[i];
       msg = ent->msg;
       if (del) {
-        CmmEntry next = ent->next;
-        (*enth) = next;
-        if (next == 0) t->lasth = enth;
-        CmiFree(ent);
+        CmmEntry next = ent->next;
+        (*enth) = next;
+        if (next == 0)
+          t->lasth = enth;
+        CmiFree(ent);
       }
       return msg;
     }
@@ -104,23 +115,23 @@ void *CmmFind(CmmTable t, int ntags, int *tags, int *rtags, int del)
 }
 
 /* match the first ntags tags and return the last tag */
-int CmmGetLastTag(CmmTable t, int ntags, int* tags)
-{
-  CmmEntry *enth; CmmEntry ent;
+int CmmGetLastTag(CmmTable t, int ntags, int *tags) {
+  CmmEntry *enth;
+  CmmEntry ent;
   enth = &(t->first);
   while (1) {
     ent = (*enth);
-    if (ent==0) return -1;
+    if (ent == 0)
+      return -1;
     if (CmmTagsMatch(ntags, tags, ntags, ent->tags)) {
-      return (ent->tags[ent->ntags-1]);
+      return (ent->tags[ent->ntags - 1]);
     }
     enth = &(ent->next);
   }
   return -1;
 }
 
-int CmmEntries(CmmTable t)
-{
+int CmmEntries(CmmTable t) {
   int n = 0;
   CmmEntry e = t->first;
   while (e) {
diff --git a/src/queue.h b/src/queue.h
index 2c185aa..f037ab9 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -1,144 +1,109 @@
 #ifndef QUEUE_H
 #define QUEUE_H
-#include
+#include "concurrentqueue.h"
 #include
-#include
 #include
-#include "concurrentqueue.h"
+#include
+#include
 
-template <typename MessageType>
-using QueueResult = std::optional<MessageType>;
+template <typename MessageType> using QueueResult = std::optional<MessageType>;
 
-template <typename ConcreteQ, typename MessageType>
-class MutexAccessControl {
-    ConcreteQ q;
-    std::mutex mtx;
+template <typename ConcreteQ, typename MessageType> class MutexAccessControl {
+  ConcreteQ q;
+  std::mutex mtx;
 
 public:
-    void push(MessageType message) {
-        std::lock_guard lock(mtx);
-        q.push(message);
-    }
-
-    QueueResult<MessageType> pop_result() {
-        std::lock_guard lock(mtx);
-        if (q.empty()) {
-            return std::nullopt;
-        } else {
-            MessageType val = q.front();
-            q.pop();
-            return QueueResult<MessageType>(val);
-        }
-    }
-
-
-    size_t size() {
-        std::lock_guard lock(mtx);
-        return q.size();
-    }
-
-    bool empty() {
-        std::lock_guard lock(mtx);
-        return q.empty();
-    }
+  void push(MessageType message) {
+    std::lock_guard lock(mtx);
+    q.push(message);
+  }
+
+  QueueResult<MessageType> pop_result() {
+    std::lock_guard lock(mtx);
+    if (q.empty()) {
+      return std::nullopt;
+    } else {
+      MessageType val = q.front();
+      q.pop();
+      return QueueResult<MessageType>(val);
+    }
+  }
+
+  size_t size() {
+    std::lock_guard lock(mtx);
+    return q.size();
+  }
+
+  bool empty() {
+    std::lock_guard lock(mtx);
+    return q.empty();
+  }
 };
 
-template <typename MessageType>
-class AtomicAccessControl {
-    // what default size?
-    moodycamel::ConcurrentQueue<MessageType> q{256};
+template <typename MessageType> class AtomicAccessControl {
+  // what default size?
+  moodycamel::ConcurrentQueue<MessageType> q{256};
 
-    public:
-    void push(MessageType message) {
-        q.enqueue(message);
-    }
+public:
+  void push(MessageType message) { q.enqueue(message); }
 
-    QueueResult<MessageType> pop_result() {
-        MessageType message;
-        bool success = q.try_dequeue(message);
-        return success ? QueueResult<MessageType>(message) : std::nullopt;
-    }
+  QueueResult<MessageType> pop_result() {
+    MessageType message;
+    bool success = q.try_dequeue(message);
+    return success ? QueueResult<MessageType>(message) : std::nullopt;
+  }
 
-    size_t size() {
-        return q.size_approx();
-    }
+  size_t size() { return q.size_approx(); }
 
-    bool empty() {
-        return q.size_approx() == 0;
-    }
+  bool empty() { return q.size_approx() == 0; }
 };
 
 // An MPSC queue that can be used to send messages between threads.
-template <typename MessageType, typename AccessControlPolicy>
-class MPSCQueue
-{
-    AccessControlPolicy policy;
+template <typename MessageType, typename AccessControlPolicy> class MPSCQueue {
+  AccessControlPolicy policy;
 
 public:
-    QueueResult<MessageType> pop()
-    {
-        return policy.pop_result();
-    }
+  QueueResult<MessageType> pop() { return policy.pop_result(); }
 
-    void push(MessageType message)
-    {
-        policy.push(message);
-    }
+  void push(MessageType message) { policy.push(message); }
 
-    bool empty()
-    {
-        return policy.empty();
-    }
+  bool empty() { return policy.empty(); }
 
-    size_t size()
-    {
-        return policy.size();
-    }
+  size_t size() { return policy.size(); }
 };
 
-template <typename MessageType, typename AccessControlPolicy>
-class MPMCQueue
-{
-    AccessControlPolicy policy;
+template <typename MessageType, typename AccessControlPolicy> class MPMCQueue {
+  AccessControlPolicy policy;
 
 public:
-    QueueResult<MessageType> pop()
-    {
-        return policy.pop_result();
-    }
+  QueueResult<MessageType> pop() { return policy.pop_result(); }
 
-    void push(MessageType message)
-    {
-        policy.push(message);
-    }
+  void push(MessageType message) { policy.push(message); }
 
-    bool empty()
-    {
-        return policy.empty();
-    }
+  bool empty() { return policy.empty(); }
 
-    size_t size()
-    {
-        return policy.size();
-    }
+  size_t size() { return policy.size(); }
 };
-
 #ifdef ATOMIC_QUEUE_ENABLED
 template <typename MessageType>
 using ConverseQueue = MPSCQueue<MessageType, AtomicAccessControl<MessageType>>;
 
 template <typename MessageType>
-using ConverseNodeQueue = MPMCQueue<MessageType, AtomicAccessControl<MessageType>>;
+using ConverseNodeQueue =
+    MPMCQueue<MessageType, AtomicAccessControl<MessageType>>;
 #else
 template <typename MessageType>
-using ConverseQueue = MPSCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
+using ConverseQueue =
+    MPSCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
 
 template <typename MessageType>
-using ConverseNodeQueue = MPMCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
+using ConverseNodeQueue =
+    MPMCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
 #endif
-
 #endif
\ No newline at end of file
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index 8368424..332425b 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -85,7 +85,7 @@ void CsdSchedulePoll() {
   // get node level queue
   ConverseNodeQueue<void *> *nodeQueue = CmiGetNodeQueue();
 
-  while(1){
+  while (1) {
 
     CcdCallBacks();
 
@@ -124,27 +124,26 @@ void CsdSchedulePoll() {
     else {
       comm_backend::progress();
-      break; //break when queues are empty
+      break; // break when queues are empty
     }
-
   }
-
 }
 
-int CsdScheduler(int maxmsgs){
+int CsdScheduler(int maxmsgs) {
   if (maxmsgs < 0) {
-    CsdScheduler(); //equivalent to CsdScheduleForever in old converse
-  }
-  else CsdSchedulePoll(); //not implementing CsdScheduleCount
+    CsdScheduler(); // equivalent to CsdScheduleForever in old converse
+  } else
+    CsdSchedulePoll(); // not implementing CsdScheduleCount
   return 0;
-
 }
 
-void CsdEnqueueGeneral(void *Message, int strategy, int priobits, int *prioptr){
+void CsdEnqueueGeneral(void *Message, int strategy, int priobits,
+                       int *prioptr) {
   CmiPushPE(CmiMyPe(), sizeof(Message), Message);
 }
 
-void CsdNodeEnqueueGeneral(void *Message, int strategy, int priobits, unsigned int *prioptr){
+void CsdNodeEnqueueGeneral(void *Message, int strategy, int priobits,
+                           unsigned int *prioptr) {
   CmiGetNodeQueue()->push(Message);
 }
diff --git 
a/src/threads.cpp b/src/threads.cpp index 44883e9..bbc5bb9 100644 --- a/src/threads.cpp +++ b/src/threads.cpp @@ -358,17 +358,17 @@ void CthAwaken(CthThread th) { awakenfn(token, strategy, 0, 0); // If this crashes, disable ASLR. } -void CthAwakenPrio(CthThread th, int s, int pb, unsigned int *prio) -{ +void CthAwakenPrio(CthThread th, int s, int pb, unsigned int *prio) { CthAwkFn awakenfn = B(th)->awakenfn; - if (awakenfn == 0) CthNoStrategy(); + if (awakenfn == 0) + CthNoStrategy(); #if CMK_TRACE_ENABLED -#if ! CMK_TRACE_IN_CHARM - if(CpvAccess(traceOn)) +#if !CMK_TRACE_IN_CHARM + if (CpvAccess(traceOn)) traceAwaken(th); #endif #endif - CthThreadToken * token = B(th)->token; + CthThreadToken *token = B(th)->token; awakenfn(token, s, pb, prio); // If this crashes, disable ASLR. B(th)->scheduled++; } @@ -497,15 +497,14 @@ void CthRegistered(size_t maxOffset) { /* possible hack? CW */ char *CthGetData(CthThread t) { return B(t)->data; } -void CthSetEventInfo(CthThread t, int event, int srcPE) -{ +void CthSetEventInfo(CthThread t, int event, int srcPE) { B(t)->eventID = event; B(t)->srcPE = srcPE; } -void CthFree(CthThread t) -{ - if (t==NULL) return; +void CthFree(CthThread t) { + if (t == NULL) + return; if (t != CthSelf()) { CthThreadFree(t); diff --git a/tests/ping_ack/ping.cpp b/tests/ping_ack/ping.cpp index 48b40f7..308a0a6 100644 --- a/tests/ping_ack/ping.cpp +++ b/tests/ping_ack/ping.cpp @@ -1,15 +1,16 @@ -#include +#include "converse.h" #include +#include #include -#include "converse.h" CpvDeclare(int, bigmsg_index); CpvDeclare(int, ackmsg_index); CpvDeclare(int, shortmsg_index); CpvDeclare(int, msg_size); -CpvDeclare(int, trial); // increments per trial, gets set to 0 at the start of a new msg size -CpvDeclare(int, round); // increments per msg size -CpvDeclare(int, warmup_flag); // 1 when in warmup round, 0 when not +CpvDeclare(int, trial); // increments per trial, gets set to 0 at the start of a + // new msg size +CpvDeclare(int, round); // increments per msg size +CpvDeclare(int, warmup_flag); // 1 when in warmup round, 0 when not CpvDeclare(int, recv_count); CpvDeclare(int, ack_count); CpvDeclare(double, total_time); @@ -17,21 +18,20 @@ CpvDeclare(double, process_time); CpvDeclare(double, send_time); int msg_count; -#define nMSG_SIZE 3 // if the msg_sizes are hard_coded, this should be the same as the length of the hard coded array +#define nMSG_SIZE \ + 3 // if the msg_sizes are hard_coded, this should be the same as the length of + // the hard coded array #define nTRIALS_PER_SIZE 10 -#define CALCULATION_PRECISION 0.0001 // the decimal place that the output data is rounded to +#define CALCULATION_PRECISION \ + 0.0001 // the decimal place that the output data is rounded to -double total_time[nTRIALS_PER_SIZE]; // times are stored in us +double total_time[nTRIALS_PER_SIZE]; // times are stored in us double process_time[nTRIALS_PER_SIZE]; double send_time[nTRIALS_PER_SIZE]; - int msg_sizes[nMSG_SIZE] = {56, 4096, 65536}; // hard coded msg_size values - - -typedef struct myMsg -{ +typedef struct myMsg { char header[CmiMsgHeaderSizeBytes]; int payload[1]; } *message; @@ -44,9 +44,9 @@ double round_to(double val, double precision) { double get_average(double arr[]) { double tot = 0; - for (int i = 0; i < nTRIALS_PER_SIZE; ++i) tot += arr[i]; + for (int i = 0; i < nTRIALS_PER_SIZE; ++i) + tot += arr[i]; return (round_to(tot, CALCULATION_PRECISION) / nTRIALS_PER_SIZE); - } double get_stdev(double arr[]) { @@ -61,29 +61,36 @@ double get_stdev(double arr[]) { double get_max(double 
arr[]) { double max = arr[0]; for (int i = 1; i < nTRIALS_PER_SIZE; ++i) - if (arr[i] > arr[0]) max = arr[i]; - return max; + if (arr[i] > arr[0]) + max = arr[i]; + return max; } - void print_results() { if (!CpvAccess(warmup_flag)) { CmiPrintf("msg_size=%d\n", CpvAccess(msg_size)); - //for (int i = 0; i < nTRIALS_PER_SIZE; ++i) { - //DEBUG: print without trial number: - //CmiPrintf("Send time: %f, process time: %f, total time: %f\n", send_time[i], process_time[i], total_time[i]); - - //DEBUG: print with trial number: - //CmiPrintf("%d %f\n %f\n %f\n", i, send_time[i], process_time[i], total_time[i]); + // for (int i = 0; i < nTRIALS_PER_SIZE; ++i) { + // DEBUG: print without trial number: + // CmiPrintf("Send time: %f, process time: %f, total time: %f\n", + // send_time[i], process_time[i], total_time[i]); + + // DEBUG: print with trial number: + // CmiPrintf("%d %f\n %f\n %f\n", i, send_time[i], process_time[i], + // total_time[i]); //} // print data: - CmiPrintf("Format: {#PEs},{msg_size},{average send/process/total time (us)},{stdevs*3},{maxs*3}\n"); - CmiPrintf("DATA,%d,%d,%f,%f,%f,%f,%f,%f,%f,%f,%f\n", CmiNumPes(), CpvAccess(msg_size), get_average(send_time), get_average(process_time), get_average(total_time), - get_stdev(send_time), get_stdev(process_time), get_stdev(total_time), get_max(send_time), get_max(process_time), get_max(total_time)); - - + CmiPrintf("Format: {#PEs},{msg_size},{average send/process/total time " + "(us)},{stdevs*3},{maxs*3}\n"); + CmiPrintf("DATA,%d,%d,%f,%f,%f,%f,%f,%f,%f,%f,%f\n", CmiNumPes(), + CpvAccess(msg_size), get_average(send_time), + get_average(process_time), get_average(total_time), + get_stdev(send_time), get_stdev(process_time), + get_stdev(total_time), get_max(send_time), get_max(process_time), + get_max(total_time)); + } else { - if (CpvAccess(round) == nMSG_SIZE - 1) // if this is the end of the warmup round + if (CpvAccess(round) == + nMSG_SIZE - 1) // if this is the end of the warmup round CmiPrintf("Warm up Done!\n"); // DEBUG: Print what msg_size the warmup round is on @@ -95,25 +102,29 @@ void print_results() { void send_msg() { double start_time, crt_time; struct myMsg *msg; - // CmiPrintf("\nSending msg fron pe%d to pe%d\n",CmiMyPe(), CmiNumPes()/2+CmiMyPe()); + // CmiPrintf("\nSending msg fron pe%d to pe%d\n",CmiMyPe(), + // CmiNumPes()/2+CmiMyPe()); CpvAccess(process_time) = 0.0; CpvAccess(send_time) = 0.0; CpvAccess(total_time) = CmiWallTimer(); - for(int k = 0; k < msg_count; k++) { + for (int k = 0; k < msg_count; k++) { crt_time = CmiWallTimer(); msg = (message)CmiAlloc(CpvAccess(msg_size)); // Fills payload with ints - for (int i = 0; i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); ++i) msg->payload[i] = i; - + for (int i = 0; + i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); ++i) + msg->payload[i] = i; + // DEBUG: Print ints stored in payload - // for (int i = 0; i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); ++i) CmiPrintf("%d ", msg->payload[i]); - // CmiPrintf("\n"); + // for (int i = 0; i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / + // sizeof(int); ++i) CmiPrintf("%d ", msg->payload[i]); CmiPrintf("\n"); CmiSetHandler(msg, CpvAccess(bigmsg_index)); - CpvAccess(process_time) = CmiWallTimer() - crt_time + CpvAccess(process_time); + CpvAccess(process_time) = + CmiWallTimer() - crt_time + CpvAccess(process_time); start_time = CmiWallTimer(); - //Send from my pe-i on node-0 to q+i on node-1 + // Send from my pe-i on node-0 to q+i on node-1 CmiSyncSendAndFree(CmiNumPes() 
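One note beyond formatting: get_max above compares arr[i] against arr[0] rather than against the running maximum, so it only replaces max when an element exceeds the first entry. If a true maximum is intended, a follow-up outside the scope of this clang-format patch could look like the sketch below.

// Possible follow-up fix (not part of this patch): track the running maximum.
double get_max_fixed(double arr[]) {
  double max = arr[0];
  for (int i = 1; i < nTRIALS_PER_SIZE; ++i)
    if (arr[i] > max)
      max = arr[i];
  return max;
}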
/ 2 + CmiMyPe(), CpvAccess(msg_size), msg); CpvAccess(send_time) = CmiWallTimer() - start_time + CpvAccess(send_time); } @@ -122,18 +133,20 @@ void send_msg() { void shortmsg_handler(void *vmsg) { message smsg = (message)vmsg; CmiFree(smsg); - if (!CpvAccess(warmup_flag)) { // normal round handling - if (CpvAccess(trial) == nTRIALS_PER_SIZE) { // if we have run the current msg size for nTRIALS + if (!CpvAccess(warmup_flag)) { // normal round handling + if (CpvAccess(trial) == + nTRIALS_PER_SIZE) { // if we have run the current msg size for nTRIALS CpvAccess(round) = CpvAccess(round) + 1; CpvAccess(trial) = 0; CpvAccess(msg_size) = msg_sizes[CpvAccess(round)]; - } - } else { // warmup round handling - if (CpvAccess(round) == nMSG_SIZE - 1) { // if this is the end of the warmup round + } + } else { // warmup round handling + if (CpvAccess(round) == + nMSG_SIZE - 1) { // if this is the end of the warmup round CpvAccess(round) = 0; CpvAccess(msg_size) = msg_sizes[0]; CpvAccess(warmup_flag) = 0; - } else { // otherwise warm up the next msg size + } else { // otherwise warm up the next msg size CpvAccess(round) = CpvAccess(round) + 1; CpvAccess(msg_size) = msg_sizes[CpvAccess(round)]; } @@ -143,16 +156,14 @@ void shortmsg_handler(void *vmsg) { } void do_work(long start, long end, void *result) { - long tmp=0; - for (long i=start; i<=end; i++) { - tmp+=(long)(sqrt(1+cos(i*1.57))); + long tmp = 0; + for (long i = start; i <= end; i++) { + tmp += (long)(sqrt(1 + cos(i * 1.57))); } *(long *)result = tmp + *(long *)result; } - -void bigmsg_handler(void *vmsg) -{ +void bigmsg_handler(void *vmsg) { int i, next; message msg = (message)vmsg; // if this is a receiving PE @@ -160,25 +171,28 @@ void bigmsg_handler(void *vmsg) CpvAccess(recv_count) = 1 + CpvAccess(recv_count); long sum = 0; long result = 0; - double num_ints = (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); + double num_ints = + (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); double exp_avg = (num_ints - 1) / 2; for (i = 0; i < num_ints; ++i) { sum += msg->payload[i]; - do_work(i,sum,&result); + do_work(i, sum, &result); } - if(result < 0) { + if (result < 0) { CmiPrintf("Error! 
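The self-check in bigmsg_handler here relies on send_msg() filling the payload with 0, 1, ..., num_ints-1, so the expected mean is (num_ints - 1) / 2; for example, with a 4096-byte message, a hypothetical 16-byte header, and 4-byte ints, num_ints is 1020 and the expected average is 509.5. A compact integer-only restatement of the same check, with the header size passed in because the real value is CmiMsgHeaderSizeBytes:

#include <cassert>

// Sketch: the sum of 0..n-1 is n*(n-1)/2, so checking the mean against
// (n-1)/2 is the same as checking 2*sum == n*(n-1).
void check_payload_mean(const int *payload, int msg_size, int header_bytes) {
  int n = (msg_size - header_bytes) / (int)sizeof(int);
  long sum = 0;
  for (int i = 0; i < n; ++i)
    sum += payload[i];
  assert(sum * 2 == (long)n * (n - 1));
}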
in computation"); } double calced_avg = sum / num_ints; if (calced_avg != exp_avg) { - CmiPrintf("Calculated average of %f does not match expected value of %f, exiting\n", calced_avg, exp_avg); + CmiPrintf("Calculated average of %f does not match expected value of %f, " + "exiting\n", + calced_avg, exp_avg); CmiExit(1); - } + } // else // CmiPrintf("Calculation OK\n"); // DEBUG: Computation Check - if(CpvAccess(recv_count) == msg_count) { + if (CpvAccess(recv_count) == msg_count) { CpvAccess(recv_count) = 0; - + CmiFree(msg); msg = (message)CmiAlloc(CpvAccess(msg_size)); CmiSetHandler(msg, CpvAccess(ackmsg_index)); @@ -189,44 +203,50 @@ void bigmsg_handler(void *vmsg) CmiPrintf("\nError: Only node-1 can be receiving node!!!!\n"); } -void pe0_ack_handler(void *vmsg) -{ +void pe0_ack_handler(void *vmsg) { int pe; message msg = (message)vmsg; - //Pe-0 receives all acks + // Pe-0 receives all acks CpvAccess(ack_count) = 1 + CpvAccess(ack_count); // DEBUG: Computation Print Check - // CmiPrintf("All %d messages of size %d on trial %d OK\n", MSG_COUNT, CpvAccess(msg_size), CpvAccess(trial)); - + // CmiPrintf("All %d messages of size %d on trial %d OK\n", MSG_COUNT, + // CpvAccess(msg_size), CpvAccess(trial)); - if(CpvAccess(ack_count) == CmiNumPes()/2) { + if (CpvAccess(ack_count) == CmiNumPes() / 2) { CpvAccess(ack_count) = 0; CpvAccess(total_time) = CmiWallTimer() - CpvAccess(total_time); // DEBUG: Original Print Statement - //CmiPrintf("Received [Trial=%d, msg size=%d] ack on PE-#%d send time=%lf, process time=%lf, total time=%lf\n", - // CpvAccess(trial), CpvAccess(msg_size), CmiMyPe(), CpvAccess(send_time), CpvAccess(process_time), CpvAccess(total_time)); + // CmiPrintf("Received [Trial=%d, msg size=%d] ack on PE-#%d send time=%lf, + // process time=%lf, total time=%lf\n", + // CpvAccess(trial), CpvAccess(msg_size), CmiMyPe(), + // CpvAccess(send_time), CpvAccess(process_time), + // CpvAccess(total_time)); CmiFree(msg); // store times in arrays - send_time[CpvAccess(trial)] = CpvAccess(send_time) * 1000000.0; // convert to microsecs. + send_time[CpvAccess(trial)] = + CpvAccess(send_time) * 1000000.0; // convert to microsecs. process_time[CpvAccess(trial)] = CpvAccess(process_time) * 1000000.0; total_time[CpvAccess(trial)] = CpvAccess(total_time) * 1000000.0; CpvAccess(trial) = CpvAccess(trial) + 1; // print results - if (CpvAccess(warmup_flag) || CpvAccess(trial) == nTRIALS_PER_SIZE) print_results(); + if (CpvAccess(warmup_flag) || CpvAccess(trial) == nTRIALS_PER_SIZE) + print_results(); - // if this is not the warmup round, and we have finished the final trial, and we are on the final msg size, exit - if(!CpvAccess(warmup_flag) && CpvAccess(trial) == nTRIALS_PER_SIZE && CpvAccess(round) == nMSG_SIZE - 1) + // if this is not the warmup round, and we have finished the final trial, + // and we are on the final msg size, exit + if (!CpvAccess(warmup_flag) && CpvAccess(trial) == nTRIALS_PER_SIZE && + CpvAccess(round) == nMSG_SIZE - 1) CmiExit(0); else { // CmiPrintf("\nSending short msgs from PE-%d", CmiMyPe()); - for(pe = 0 ; pe
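For completeness, the Converse messaging idiom this test exercises end to end, reduced to its smallest form; the handler name, index variable, and helper below are illustrative only, while the calls themselves (CmiAlloc, CmiSetHandler, CmiSyncSendAndFree, CmiFree, CmiRegisterHandler) are the ones used throughout this file.

#include "converse.h"

// Handler index, assumed to be filled in during startup, e.g. via
// CmiAssignOnce(&echo_handler_idx, CmiRegisterHandler((CmiHandler)echo_handler));
static int echo_handler_idx;

static void echo_handler(void *vmsg) {
  CmiFree(vmsg); // the receiving handler owns and releases the buffer
}

static void send_one(int dest_pe, int bytes) {
  char *msg = (char *)CmiAlloc(bytes);     // buffer includes the Converse header
  CmiSetHandler(msg, echo_handler_idx);    // route to echo_handler on arrival
  CmiSyncSendAndFree(dest_pe, bytes, msg); // ownership passes to the runtime
}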