From cf49fd902596310d4aad8b74fe5f25501172cbb9 Mon Sep 17 00:00:00 2001 From: Jiakun Yan Date: Mon, 29 Sep 2025 11:24:36 -0500 Subject: [PATCH] fix the clang-format ci and clang-format all files --- .clang-format | 39 +- .github/workflows/clang-format.yaml | 19 +- include/converse.h | 71 +- src/barrier.h | 24 +- src/cldb.none.cpp | 8 +- src/cldb.rand.cpp | 5 +- src/cmirdmautils.cpp | 96 +- src/collectives.cpp | 366 +- src/comm_backend/comm_backend.h | 6 +- src/comm_backend/comm_backend_internal.cpp | 13 +- src/comm_backend/comm_backend_internal.h | 5 +- src/comm_backend/lci2/comm_backend_lci2.cpp | 5 +- src/comm_backend/lci2/comm_backend_lci2.h | 11 +- src/concurrentqueue.h | 7400 ++++++++++--------- src/conv-conds.cpp | 23 +- src/conv-rdma.cpp | 97 +- src/conv-topology.cpp | 433 +- src/conv-topology.h | 5 +- src/convcore.cpp | 21 +- src/converse_internal.h | 10 +- src/cpuaffinity.cpp | 387 +- src/msgmgr.cpp | 123 +- src/queue.h | 163 +- src/scheduler.cpp | 21 +- src/threads.cpp | 21 +- tests/ping_ack/ping.cpp | 184 +- tests/rdma_pingpong/pingpong.cpp | 31 +- tests/self_send/self_send.cpp | 12 +- 28 files changed, 5024 insertions(+), 4575 deletions(-) diff --git a/.clang-format b/.clang-format index 1934577..b68024f 100644 --- a/.clang-format +++ b/.clang-format @@ -1,6 +1,6 @@ --- Language: Cpp -BasedOnStyle: LLVM +# BasedOnStyle: LLVM AccessModifierOffset: -2 AlignAfterOpenBracket: Align AlignArrayOfStructures: None @@ -36,29 +36,7 @@ AlignConsecutiveShortCaseStatements: Enabled: false AcrossEmptyLines: false AcrossComments: false - AlignCaseArrows: false AlignCaseColons: false -AlignConsecutiveTableGenBreakingDAGArgColons: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - AlignFunctionPointers: false - PadOperators: false -AlignConsecutiveTableGenCondOperatorColons: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - AlignFunctionPointers: false - PadOperators: false -AlignConsecutiveTableGenDefinitionColons: - Enabled: false - AcrossEmptyLines: false - AcrossComments: false - AlignCompound: false - AlignFunctionPointers: false - PadOperators: false AlignEscapedNewlines: Right AlignOperands: Align AlignTrailingComments: @@ -68,7 +46,6 @@ AllowAllArgumentsOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowBreakBeforeNoexceptSpecifier: Never AllowShortBlocksOnASingleLine: Never -AllowShortCaseExpressionOnASingleLine: true AllowShortCaseLabelsOnASingleLine: false AllowShortCompoundRequirementOnASingleLine: true AllowShortEnumsOnASingleLine: true @@ -77,7 +54,9 @@ AllowShortIfStatementsOnASingleLine: Never AllowShortLambdasOnASingleLine: All AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine AttributeMacros: - __capability BinPackArguments: true @@ -105,7 +84,6 @@ BraceWrapping: BreakAdjacentStringLiterals: true BreakAfterAttributes: Leave BreakAfterJavaFieldAnnotations: false -BreakAfterReturnType: None BreakArrays: true BreakBeforeBinaryOperators: None BreakBeforeConceptDeclarations: Always @@ -113,10 +91,8 @@ BreakBeforeBraces: Attach BreakBeforeInlineASMColon: OnlyMultiline BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeColon -BreakFunctionDefinitionParameters: false BreakInheritanceList: BeforeColon BreakStringLiterals: true -BreakTemplateDeclarations: MultiLine ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' 
CompactNamespaces: false @@ -172,15 +148,12 @@ IntegerLiteralSeparator: HexMinDigits: 0 JavaScriptQuotes: Leave JavaScriptWrapImports: true -KeepEmptyLines: - AtEndOfFile: false - AtStartOfBlock: true - AtStartOfFile: true +KeepEmptyLinesAtTheStartOfBlocks: true +KeepEmptyLinesAtEOF: false LambdaBodyIndentation: Signature LineEnding: DeriveLF MacroBlockBegin: '' MacroBlockEnd: '' -MainIncludeChar: Quote MaxEmptyLinesToKeep: 1 NamespaceIndentation: None ObjCBinPackProtocolList: Auto @@ -249,7 +222,6 @@ SpacesInLineCommentPrefix: Maximum: -1 SpacesInParens: Never SpacesInParensOptions: - ExceptDoubleParentheses: false InCStyleCasts: false InConditionalStatements: false InEmptyParentheses: false @@ -261,7 +233,6 @@ StatementAttributeLikeMacros: StatementMacros: - Q_UNUSED - QT_REQUIRE_VERSION -TableGenBreakInsideDAGArg: DontBreak TabWidth: 8 UseTab: Never VerilogBreakBetweenInstancePorts: true diff --git a/.github/workflows/clang-format.yaml b/.github/workflows/clang-format.yaml index 710accd..a53e3f9 100644 --- a/.github/workflows/clang-format.yaml +++ b/.github/workflows/clang-format.yaml @@ -21,10 +21,19 @@ jobs: sudo apt-get update sudo apt-get install clang-format - - name: Run clang-format + - name: Verify clang-format installation run: | - git diff --exit-code --ignore-submodules -- '*.cpp' '*.h' '*.c' '*.hpp' - if [ $? -ne 0 ]; then - echo "Clang-format failed. Please format your code." + clang-format --version + + - name: Run clang-format on all files + run: | + # From anywhere inside the repo: + repo_root=$(git rev-parse --show-toplevel) && cd "$repo_root" + git ls-files -z -- '*.c' '*.cc' '*.cpp' '*.cxx' '*.h' '*.hh' '*.hpp' '*.hxx' \ + | xargs -0 -r clang-format -i + + # Fail if formatting made changes + git diff --exit-code || { + echo "clang-format changed files above. Please commit the formatting." exit 1 - fi \ No newline at end of file + } \ No newline at end of file diff --git a/include/converse.h b/include/converse.h index 85bb459..bf74caa 100644 --- a/include/converse.h +++ b/include/converse.h @@ -302,7 +302,7 @@ int CmiMyRank(); int CmiNumPes(); int CmiNumNodes(); // FIXME -//#define CmiPhysicalNodeID(node) (node) +// #define CmiPhysicalNodeID(node) (node) extern int CmiPhysicalNodeID(int pe); int CmiNodeOf(int pe); int CmiRankOf(int pe); @@ -836,12 +836,12 @@ enum ncpyFreeNcpyOpInfoMode { #define CMK_SPANTREE_MAXSPAN 4 #define CST_W (CMK_SPANTREE_MAXSPAN) #define CST_NN (CmiNumNodes()) -#define CmiNodeSpanTreeParent(n) ((n) ? (((n)-1) / CST_W) : (-1)) +#define CmiNodeSpanTreeParent(n) ((n) ? (((n) - 1) / CST_W) : (-1)) #define CmiNodeSpanTreeChildren(n, c) \ do { \ int _i; \ for (_i = 0; _i < CST_W; _i++) { \ - int _x = (n)*CST_W + _i + 1; \ + int _x = (n) * CST_W + _i + 1; \ if (_x < CST_NN) \ (c)[_i] = _x; \ } \ @@ -849,7 +849,7 @@ enum ncpyFreeNcpyOpInfoMode { #define CmiNumNodeSpanTreeChildren(n) \ ((((n) + 1) * CST_W < CST_NN) \ ? CST_W \ - : ((((n)*CST_W + 1) >= CST_NN) ? 0 : ((CST_NN - 1) - (n)*CST_W))) + : ((((n) * CST_W + 1) >= CST_NN) ? 
0 : ((CST_NN - 1) - (n) * CST_W))) #define CST_R(p) (CmiRankOf(p)) #define CST_NF(n) (CmiNodeFirst(n)) #define CST_SP(n) (CmiNodeSpanTreeParent(n)) @@ -920,32 +920,42 @@ void registerTraceInit(void (*fn)(char **argv)); int CmiDeliverMsgs(int maxmsgs); -#define CmiMemoryReadFence() std::atomic_thread_fence(std::memory_order_seq_cst) -#define CmiMemoryWriteFence() std::atomic_thread_fence(std::memory_order_seq_cst) +#define CmiMemoryReadFence() std::atomic_thread_fence(std::memory_order_seq_cst) +#define CmiMemoryWriteFence() \ + std::atomic_thread_fence(std::memory_order_seq_cst) extern CmiNodeLock CmiMemLock_lock; -#define CmiMemLock() do{if (CmiMemLock_lock) CmiLock(CmiMemLock_lock);} while (0) +#define CmiMemLock() \ + do { \ + if (CmiMemLock_lock) \ + CmiLock(CmiMemLock_lock); \ + } while (0) -#define CmiMemUnlock() do{if (CmiMemLock_lock) CmiUnlock(CmiMemLock_lock);} while (0) +#define CmiMemUnlock() \ + do { \ + if (CmiMemLock_lock) \ + CmiUnlock(CmiMemLock_lock); \ + } while (0) template struct CmiIsAtomic : std::false_type {}; template struct CmiIsAtomic> : std::true_type {}; template typename std::enable_if::value, typename T::value_type>::type -CmiAtomicFetchAndIncImpl(T& input) { - return std::atomic_fetch_add(&input, typename T::value_type(1)); +CmiAtomicFetchAndIncImpl(T &input) { + return std::atomic_fetch_add(&input, typename T::value_type(1)); } template typename std::enable_if::value, T>::type -CmiAtomicFetchAndIncImpl(T& input) { - T old = input; - ++input; - return old; +CmiAtomicFetchAndIncImpl(T &input) { + T old = input; + ++input; + return old; } -#define CmiMemoryAtomicFetchAndInc(input, output) ((output) = CmiAtomicFetchAndIncImpl(input)) +#define CmiMemoryAtomicFetchAndInc(input, output) \ + ((output) = CmiAtomicFetchAndIncImpl(input)) #define CmiEnableUrgentSend(yn) /* intentionally left empty */ @@ -953,30 +963,29 @@ typedef struct CmmTableStruct *CmmTable; #define CmmWildCard (-1) -//typedef void (*CmmPupMessageFn)(pup_er p,void **msg); -//CmmTable CmmPup(pup_er p, CmmTable t, CmmPupMessageFn msgpup); - -CmmTable CmmNew(void); -void CmmFree(CmmTable t); -void CmmFreeAll(CmmTable t); -void CmmPut(CmmTable t, int ntags, int *tags, void *msg); -void *CmmFind(CmmTable t, int ntags, int *tags, int *returntags, int del); -int CmmEntries(CmmTable t); -int CmmGetLastTag(CmmTable t, int ntags, int *tags); -#define CmmGet(t,nt,tg,rt) (CmmFind((t),(nt),(tg),(rt),1)) -#define CmmProbe(t,nt,tg,rt) (CmmFind((t),(nt),(tg),(rt),0)) +// typedef void (*CmmPupMessageFn)(pup_er p,void **msg); +// CmmTable CmmPup(pup_er p, CmmTable t, CmmPupMessageFn msgpup); +CmmTable CmmNew(void); +void CmmFree(CmmTable t); +void CmmFreeAll(CmmTable t); +void CmmPut(CmmTable t, int ntags, int *tags, void *msg); +void *CmmFind(CmmTable t, int ntags, int *tags, int *returntags, int del); +int CmmEntries(CmmTable t); +int CmmGetLastTag(CmmTable t, int ntags, int *tags); +#define CmmGet(t, nt, tg, rt) (CmmFind((t), (nt), (tg), (rt), 1)) +#define CmmProbe(t, nt, tg, rt) (CmmFind((t), (nt), (tg), (rt), 0)) #ifndef CMI_CACHE_LINE_SIZE #ifdef __cpp_lib_hardware_interference_size -# define CMI_CACHE_LINE_SIZE std::hardware_destructive_interference_size +#define CMI_CACHE_LINE_SIZE std::hardware_destructive_interference_size #elif CMK_PPC64 || (defined __APPLE__ && defined __arm64__) -# define CMI_CACHE_LINE_SIZE 128 +#define CMI_CACHE_LINE_SIZE 128 #else -# define CMI_CACHE_LINE_SIZE 64 +#define CMI_CACHE_LINE_SIZE 64 #endif #endif -//partitions +// partitions typedef enum Partition_Type { 
PARTITION_SINGLETON, diff --git a/src/barrier.h b/src/barrier.h index a77a804..51f6f9f 100644 --- a/src/barrier.h +++ b/src/barrier.h @@ -1,22 +1,21 @@ -// Acknowledgement: adopted from https://github.com/uiuc-hpc/lci/blob/master/lct/tbarrier/tbarrier.cpp +// Acknowledgement: adopted from +// https://github.com/uiuc-hpc/lci/blob/master/lct/tbarrier/tbarrier.cpp -#include #include +#include #include class Barrier { - private: +private: alignas(64) std::atomic waiting; alignas(64) std::atomic step; alignas(64) int thread_num_; - public: +public: explicit Barrier(int thread_num) - : waiting(0), step(0), thread_num_(thread_num) - {} + : waiting(0), step(0), thread_num_(thread_num) {} - int64_t arrive() - { + int64_t arrive() { int64_t mstep = step.load(); if (++waiting == thread_num_) { waiting = 0; @@ -27,13 +26,12 @@ class Barrier { bool test_ticket(int64_t ticket) { return ticket != step; } - void wait_ticket(int64_t ticket) - { - while (!test_ticket(ticket)) continue; + void wait_ticket(int64_t ticket) { + while (!test_ticket(ticket)) + continue; } - void wait() - { + void wait() { int64_t ticket = arrive(); wait_ticket(ticket); } diff --git a/src/cldb.none.cpp b/src/cldb.none.cpp index 39d4aaa..3e625e8 100644 --- a/src/cldb.none.cpp +++ b/src/cldb.none.cpp @@ -25,9 +25,9 @@ void CldHandler(char *msg) { msg); // use priority queue when we add priority queue } -void CldEnqueueGroup(CmiGroup grp, void *msg, int infofn) -{ - int len, queueing, priobits; unsigned int *prioptr; +void CldEnqueueGroup(CmiGroup grp, void *msg, int infofn) { + int len, queueing, priobits; + unsigned int *prioptr; CldInfoFn ifn = (CldInfoFn)CmiHandlerToFunction(infofn); CldPackFn pfn; ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); @@ -36,7 +36,7 @@ void CldEnqueueGroup(CmiGroup grp, void *msg, int infofn) ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); } CldSwitchHandler((char *)msg, CldHandlerIndex); - CmiSetInfo(msg,infofn); + CmiSetInfo(msg, infofn); CmiSyncMulticastAndFree(grp, len, msg); } diff --git a/src/cldb.rand.cpp b/src/cldb.rand.cpp index ccda7d4..e936339 100644 --- a/src/cldb.rand.cpp +++ b/src/cldb.rand.cpp @@ -103,9 +103,8 @@ void CldEnqueue(int pe, void *msg, int infofn) { /* CsdEnqueueGeneral is not thread or SIGIO safe */ // CmiPrintf(" myself processor %d ==> %d, length=%d Timer:%f , priori=%d // \n", CmiMyPe(), pe, len, CmiWallTimer(), *prioptr); - //CsdEnqueueGeneral(msg, queueing, priobits, prioptr); - CmiPushPE(CmiMyPe(), len, - msg); + // CsdEnqueueGeneral(msg, queueing, priobits, prioptr); + CmiPushPE(CmiMyPe(), len, msg); } else { ifn(msg, &pfn, &len, &queueing, &priobits, &prioptr); if (pfn && CmiNodeOf(pe) != CmiMyNode()) { diff --git a/src/cmirdmautils.cpp b/src/cmirdmautils.cpp index 990c4b5..ea77245 100644 --- a/src/cmirdmautils.cpp +++ b/src/cmirdmautils.cpp @@ -1,43 +1,25 @@ #include "cmirdmautils.h" #include "converse.h" // for CmiAbort usage to avoid undeclared warning +#include #include #include -#include -int getNcpyOpInfoTotalSize( - int srcLayerSize, - int srcAckSize, - int destLayerSize, - int destAckSize) { - return sizeof(NcpyOperationInfo) + srcLayerSize + destLayerSize + srcAckSize + destAckSize; +int getNcpyOpInfoTotalSize(int srcLayerSize, int srcAckSize, int destLayerSize, + int destAckSize) { + return sizeof(NcpyOperationInfo) + srcLayerSize + destLayerSize + srcAckSize + + destAckSize; } void setNcpyOpInfo( - const void *srcPtr, - char *srcLayerInfo, - int srcLayerSize, - char *srcAck, - int srcAckSize, - size_t srcSize, - unsigned short int srcRegMode, - 
unsigned short int srcDeregMode, - unsigned short int isSrcRegistered, - int srcPe, - const void *srcRef, - const void *destPtr, - char *destLayerInfo, - int destLayerSize, - char *destAck, - int destAckSize, - size_t destSize, - unsigned short int destRegMode, - unsigned short int destDeregMode, - unsigned short int isDestRegistered, - int destPe, - const void *destRef, - int rootNode, - NcpyOperationInfo *ncpyOpInfo) { + const void *srcPtr, char *srcLayerInfo, int srcLayerSize, char *srcAck, + int srcAckSize, size_t srcSize, unsigned short int srcRegMode, + unsigned short int srcDeregMode, unsigned short int isSrcRegistered, + int srcPe, const void *srcRef, const void *destPtr, char *destLayerInfo, + int destLayerSize, char *destAck, int destAckSize, size_t destSize, + unsigned short int destRegMode, unsigned short int destDeregMode, + unsigned short int isDestRegistered, int destPe, const void *destRef, + int rootNode, NcpyOperationInfo *ncpyOpInfo) { char *base = (char *)ncpyOpInfo + sizeof(NcpyOperationInfo); @@ -47,28 +29,28 @@ void setNcpyOpInfo( ncpyOpInfo->destAck = NULL; // memcpy srcLayerInfo - if(srcLayerInfo != NULL && srcLayerSize != 0) { + if (srcLayerInfo != NULL && srcLayerSize != 0) { memcpy(base, srcLayerInfo, srcLayerSize); ncpyOpInfo->srcLayerInfo = base; base = base + srcLayerSize; } // memcpy srcAckInfo - if(srcAck != NULL && srcAckSize != 0) { + if (srcAck != NULL && srcAckSize != 0) { memcpy(base, srcAck, srcAckSize); ncpyOpInfo->srcAck = base; base = base + srcAckSize; } // memcpy destLayerInfo - if(srcLayerInfo != NULL && destLayerSize != 0) { + if (srcLayerInfo != NULL && destLayerSize != 0) { memcpy(base, destLayerInfo, destLayerSize); ncpyOpInfo->destLayerInfo = base; base = base + destLayerSize; } // memcpy destAck Info - if(destAck != NULL && destAckSize != 0) { + if (destAck != NULL && destAckSize != 0) { memcpy(base, destAck, destAckSize); ncpyOpInfo->destAck = base; } @@ -103,50 +85,56 @@ void setNcpyOpInfo( CmiAssert(isDestRegistered <= std::numeric_limits::max()); ncpyOpInfo->isDestRegistered = (unsigned char)isDestRegistered; - ncpyOpInfo->opMode = CMK_DIRECT_API; // default operation mode is CMK_DIRECT_API - ncpyOpInfo->ackMode = CMK_SRC_DEST_ACK; // default ack mode is CMK_SRC_DEST_ACK - ncpyOpInfo->freeMe = CMK_FREE_NCPYOPINFO; // default freeMe mode is CMK_FREE_NCPYOPINFO + ncpyOpInfo->opMode = + CMK_DIRECT_API; // default operation mode is CMK_DIRECT_API + ncpyOpInfo->ackMode = + CMK_SRC_DEST_ACK; // default ack mode is CMK_SRC_DEST_ACK + ncpyOpInfo->freeMe = + CMK_FREE_NCPYOPINFO; // default freeMe mode is CMK_FREE_NCPYOPINFO ncpyOpInfo->rootNode = rootNode; - ncpyOpInfo->ncpyOpInfoSize = (unsigned short int)(sizeof(NcpyOperationInfo) + srcLayerSize + destLayerSize + srcAckSize + destAckSize); + ncpyOpInfo->ncpyOpInfoSize = + (unsigned short int)(sizeof(NcpyOperationInfo) + srcLayerSize + + destLayerSize + srcAckSize + destAckSize); } - void resetNcpyOpInfoPointers(NcpyOperationInfo *ncpyOpInfo) { char *base = (char *)ncpyOpInfo + sizeof(NcpyOperationInfo); - if(ncpyOpInfo->srcLayerInfo) { + if (ncpyOpInfo->srcLayerInfo) { ncpyOpInfo->srcLayerInfo = base; base = base + ncpyOpInfo->srcLayerSize; } - if(ncpyOpInfo->srcAck) { + if (ncpyOpInfo->srcAck) { ncpyOpInfo->srcAck = base; base = base + ncpyOpInfo->srcAckSize; } - if(ncpyOpInfo->destLayerInfo) { + if (ncpyOpInfo->destLayerInfo) { ncpyOpInfo->destLayerInfo = base; base = base + ncpyOpInfo->destLayerSize; } - if(ncpyOpInfo->destAck) { + if (ncpyOpInfo->destAck) { ncpyOpInfo->destAck = base; } - 
} void setReverseModeForNcpyOpInfo(NcpyOperationInfo *ncpyOpInfo) { - switch(ncpyOpInfo->opMode) { - case CMK_EM_API : ncpyOpInfo->opMode = CMK_EM_API_REVERSE; - break; - case CMK_DIRECT_API : // Do nothing - break; - case CMK_BCAST_EM_API : ncpyOpInfo->opMode = CMK_BCAST_EM_API_REVERSE; - break; - default : CmiAbort("Unknown opcode"); - break; + switch (ncpyOpInfo->opMode) { + case CMK_EM_API: + ncpyOpInfo->opMode = CMK_EM_API_REVERSE; + break; + case CMK_DIRECT_API: // Do nothing + break; + case CMK_BCAST_EM_API: + ncpyOpInfo->opMode = CMK_BCAST_EM_API_REVERSE; + break; + default: + CmiAbort("Unknown opcode"); + break; } } diff --git a/src/collectives.cpp b/src/collectives.cpp index 94f4b85..e0037b5 100644 --- a/src/collectives.cpp +++ b/src/collectives.cpp @@ -27,187 +27,187 @@ void collectiveInit(void) { /* Broadcast to everyone but the source pe. Source does not free. */ void CmiSyncBroadcast(int size, void *msg) { - DEBUGF("[%d] CmiSyncBroadcast\n", CmiMyPe()); - int pe = CmiMyPe(); - - CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - DEBUGF("[%d] Spanning tree option\n", CmiMyPe()); - CmiSetBcastSource(msg, pe); // used to skip the source - header->swapHandlerId = header->handlerId; - header->handlerId = Cmi_bcastHandler; - CmiSyncSend(0, size, msg); - #else - for (int i = pe + 1; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - - for (int i = 0; i < pe; i++) - CmiSyncSend(i, size, msg); - #endif - #else - - for (int i = pe + 1; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - - for (int i = 0; i < pe; i++) - CmiSyncSend(i, size, msg); - #endif - } - - void CmiSyncBroadcastAndFree(int size, void *msg) { - CmiSyncBroadcast(size, msg); - CmiFree(msg); - } - - void CmiSyncBroadcastAll(int size, void *msg) { - DEBUGF("[%d] CmiSyncBroadcastAll\n", CmiMyPe()); - CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - CmiSetBcastSource(msg, -1); // don't skip the source - header->swapHandlerId = header->handlerId; - - header->handlerId = Cmi_bcastHandler; - CmiSyncSend(0, size, msg); - #else - for (int i = 0; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - #endif - #else - for (int i = 0; i < CmiNumPes(); i++) - CmiSyncSend(i, size, msg); - #endif - } - - void CmiSyncBroadcastAllAndFree(int size, void *msg) { - CmiSyncBroadcastAll(size, msg); - CmiFree(msg); - } - - void CmiWithinNodeBroadcast(int size, void *msg) { - for (int i = 0; i < CmiMyNodeSize(); i++) { - int destPe = CmiMyNode() * CmiMyNodeSize() + i; - CmiSyncSend(destPe, size, msg); - } - } - - void CmiSyncNodeBroadcast(unsigned int size, void *msg) { - - int node = CmiMyNode(); - - CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - CmiSetBcastSource(msg, node); // used to skip the source - header->swapHandlerId = header->handlerId; - header->handlerId = Cmi_nodeBcastHandler; - CmiSyncNodeSend(0, size, msg); - #else - - for (int i = node + 1; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - - for (int i = 0; i < node; i++) - CmiSyncNodeSend(i, size, msg); - #endif - #else - - for (int i = node + 1; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - - for (int i = 0; i < node; i++) - CmiSyncNodeSend(i, size, msg); - #endif - } - - void CmiSyncNodeBroadcastAndFree(unsigned int size, void *msg) { - CmiSyncNodeBroadcast(size, msg); - CmiFree(msg); - } - - void CmiSyncNodeBroadcastAll(unsigned int size, void *msg) { - 
CmiMessageHeader *header = static_cast(msg); - header->messageSize = size; - - #ifdef SPANTREE - #if SPANTREE ON - CmiSetBcastSource(msg, -1); // don't skip the source - header->swapHandlerId = header->handlerId; - header->handlerId = Cmi_nodeBcastHandler; - CmiSyncNodeSend(0, size, msg); - #else - - for (int i = 0; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - #endif - #else - - for (int i = 0; i < CmiNumNodes(); i++) - CmiSyncNodeSend(i, size, msg); - #endif - } - - void CmiSyncNodeBroadcastAllAndFree(unsigned int size, void *msg) { - CmiSyncNodeBroadcastAll(size, msg); - CmiFree(msg); + DEBUGF("[%d] CmiSyncBroadcast\n", CmiMyPe()); + int pe = CmiMyPe(); + + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + DEBUGF("[%d] Spanning tree option\n", CmiMyPe()); + CmiSetBcastSource(msg, pe); // used to skip the source + header->swapHandlerId = header->handlerId; + header->handlerId = Cmi_bcastHandler; + CmiSyncSend(0, size, msg); +#else + for (int i = pe + 1; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); + + for (int i = 0; i < pe; i++) + CmiSyncSend(i, size, msg); +#endif +#else + + for (int i = pe + 1; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); + + for (int i = 0; i < pe; i++) + CmiSyncSend(i, size, msg); +#endif +} + +void CmiSyncBroadcastAndFree(int size, void *msg) { + CmiSyncBroadcast(size, msg); + CmiFree(msg); +} + +void CmiSyncBroadcastAll(int size, void *msg) { + DEBUGF("[%d] CmiSyncBroadcastAll\n", CmiMyPe()); + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + CmiSetBcastSource(msg, -1); // don't skip the source + header->swapHandlerId = header->handlerId; + + header->handlerId = Cmi_bcastHandler; + CmiSyncSend(0, size, msg); +#else + for (int i = 0; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); +#endif +#else + for (int i = 0; i < CmiNumPes(); i++) + CmiSyncSend(i, size, msg); +#endif +} + +void CmiSyncBroadcastAllAndFree(int size, void *msg) { + CmiSyncBroadcastAll(size, msg); + CmiFree(msg); +} + +void CmiWithinNodeBroadcast(int size, void *msg) { + for (int i = 0; i < CmiMyNodeSize(); i++) { + int destPe = CmiMyNode() * CmiMyNodeSize() + i; + CmiSyncSend(destPe, size, msg); } - - /* Handler for broadcast via the spanning tree. 
*/ - void CmiBcastHandler(void *msg) { - int mype = CmiMyPe(); - int numChildren = CmiNumSpanTreeChildren(mype); - int children[numChildren]; - CmiSpanTreeChildren(mype, children); - - CmiMessageHeader *header = static_cast(msg); - - // send broadcast to all children - for (int i = 0; i < numChildren; i++) { - CmiSyncSend(children[i], header->messageSize, msg); - } - - // call handler locally (unless I am source of broadcast, and bcast is - // exclusive) - if (CmiGetBcastSource(msg) != mype) { - CmiCallHandler(header->swapHandlerId, msg); - } +} + +void CmiSyncNodeBroadcast(unsigned int size, void *msg) { + + int node = CmiMyNode(); + + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + CmiSetBcastSource(msg, node); // used to skip the source + header->swapHandlerId = header->handlerId; + header->handlerId = Cmi_nodeBcastHandler; + CmiSyncNodeSend(0, size, msg); +#else + + for (int i = node + 1; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); + + for (int i = 0; i < node; i++) + CmiSyncNodeSend(i, size, msg); +#endif +#else + + for (int i = node + 1; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); + + for (int i = 0; i < node; i++) + CmiSyncNodeSend(i, size, msg); +#endif +} + +void CmiSyncNodeBroadcastAndFree(unsigned int size, void *msg) { + CmiSyncNodeBroadcast(size, msg); + CmiFree(msg); +} + +void CmiSyncNodeBroadcastAll(unsigned int size, void *msg) { + CmiMessageHeader *header = static_cast(msg); + header->messageSize = size; + +#ifdef SPANTREE +#if SPANTREE ON + CmiSetBcastSource(msg, -1); // don't skip the source + header->swapHandlerId = header->handlerId; + header->handlerId = Cmi_nodeBcastHandler; + CmiSyncNodeSend(0, size, msg); +#else + + for (int i = 0; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); +#endif +#else + + for (int i = 0; i < CmiNumNodes(); i++) + CmiSyncNodeSend(i, size, msg); +#endif +} + +void CmiSyncNodeBroadcastAllAndFree(unsigned int size, void *msg) { + CmiSyncNodeBroadcastAll(size, msg); + CmiFree(msg); +} + +/* Handler for broadcast via the spanning tree. */ +void CmiBcastHandler(void *msg) { + int mype = CmiMyPe(); + int numChildren = CmiNumSpanTreeChildren(mype); + int children[numChildren]; + CmiSpanTreeChildren(mype, children); + + CmiMessageHeader *header = static_cast(msg); + + // send broadcast to all children + for (int i = 0; i < numChildren; i++) { + CmiSyncSend(children[i], header->messageSize, msg); } - - /* Handler for node broadcast via the spanning tree. */ - void CmiNodeBcastHandler(void *msg) { - int mynode = CmiMyNode(); - int numChildren = CmiNumNodeSpanTreeChildren(mynode); - int children[numChildren]; - CmiNodeSpanTreeChildren(mynode, children); - - CmiMessageHeader *header = static_cast(msg); - - // send broadcast to node children - for (int i = 0; i < numChildren; i++) { - CmiSyncNodeSend(children[i], header->messageSize, msg); - } - - if (CmiGetBcastSource(msg) != mynode) { - CmiCallHandler(header->swapHandlerId, msg); - } + + // call handler locally (unless I am source of broadcast, and bcast is + // exclusive) + if (CmiGetBcastSource(msg) != mype) { + CmiCallHandler(header->swapHandlerId, msg); } - - void CmiSetBcastSource(void *msg, CmiBroadcastSource source) { - CmiMessageHeader *header = static_cast(msg); - header->collectiveMetaInfo = source; +} + +/* Handler for node broadcast via the spanning tree. 
*/ +void CmiNodeBcastHandler(void *msg) { + int mynode = CmiMyNode(); + int numChildren = CmiNumNodeSpanTreeChildren(mynode); + int children[numChildren]; + CmiNodeSpanTreeChildren(mynode, children); + + CmiMessageHeader *header = static_cast(msg); + + // send broadcast to node children + for (int i = 0; i < numChildren; i++) { + CmiSyncNodeSend(children[i], header->messageSize, msg); } - - CmiBroadcastSource CmiGetBcastSource(void *msg) { - CmiMessageHeader *header = static_cast(msg); - return header->collectiveMetaInfo; + + if (CmiGetBcastSource(msg) != mynode) { + CmiCallHandler(header->swapHandlerId, msg); } +} + +void CmiSetBcastSource(void *msg, CmiBroadcastSource source) { + CmiMessageHeader *header = static_cast(msg); + header->collectiveMetaInfo = source; +} + +CmiBroadcastSource CmiGetBcastSource(void *msg) { + CmiMessageHeader *header = static_cast(msg); + return header->collectiveMetaInfo; +} /************* Reductions ***************/ @@ -239,7 +239,6 @@ void CmiReductionsInit(void) { // node reduction must be initialized with a valid lock nodered.lock = CmiCreateLock(); // in non-smp this would just be a nullptr - } CsvAccess(_node_reduction_info) = noderedinfo; CsvAccess(_node_reduction_counter) = 0; @@ -281,7 +280,6 @@ static inline CmiReductionID getNextID(std::atomic &ctr) { return old; } - unsigned CmiGetReductionIndex(CmiReductionID id) { // treating the id as the index into the reduction table // utilized in getCreateReduction and clearReduction to find the reduction @@ -292,7 +290,6 @@ unsigned CmiGetReductionIndex(CmiReductionID id) { return id; } - // PROCESS REDUCTIONS static void CmiClearReduction(CmiReductionID id) { auto &reduction_ref = CpvAccess(_reduction_info)[CmiGetReductionIndex(id)]; @@ -310,9 +307,7 @@ CmiReductionID CmiGetNextReductionID() { return getNextID(CpvAccess(_reduction_counter)); } -void CmiResetGlobalReduceSeqID(void) { - CpvAccess(_reduction_counter) = 0; -} +void CmiResetGlobalReduceSeqID(void) { CpvAccess(_reduction_counter) = 0; } static CmiReduction *CmiGetCreateReduction(CmiReductionID id) { // should handle the 2 cases: @@ -437,7 +432,6 @@ void CmiNodeReduce(void *msg, int size, CmiReduceMergeFn mergeFn) { CmiReduction *red = CmiGetCreateNodeReduction(id); CmiInternalNodeReduce(msg, size, mergeFn, red); - CmiUnlock(nodeRed.lock); } @@ -445,9 +439,7 @@ CmiReductionID CmiGetNextNodeReductionID() { return getNextID(CsvAccess(_node_reduction_counter)); } -void CmiResetGlobalNodeReduceSeqID(){ - CsvAccess(_node_reduction_counter) = 0; -} +void CmiResetGlobalNodeReduceSeqID() { CsvAccess(_node_reduction_counter) = 0; } static CmiReduction *CmiGetCreateNodeReduction(CmiReductionID id) { // should handle the 2 cases: @@ -544,9 +536,7 @@ void CmiNodeReduceHandler(void *msg) { reduction->messagesReceived++; CmiSendNodeReduce(reduction); - CmiUnlock(nodeRed.lock); - } /************* Groups ***************/ diff --git a/src/comm_backend/comm_backend.h b/src/comm_backend/comm_backend.h index d9939be..70562ff 100644 --- a/src/comm_backend/comm_backend.h +++ b/src/comm_backend/comm_backend.h @@ -57,12 +57,14 @@ void issueAm(int rank, const void *local_buf, size_t size, mr_t mr, * @brief Issue a remote get operation. Thread-safe. */ void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context); + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context); /** * @brief Issue a remote put operation. Thread-safe. 
*/ void issueRput(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context); + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context); /** * @brief Make progress on the communication backend. Thread-safe. */ diff --git a/src/comm_backend/comm_backend_internal.cpp b/src/comm_backend/comm_backend_internal.cpp index 0cd14a6..43cc460 100644 --- a/src/comm_backend/comm_backend_internal.cpp +++ b/src/comm_backend/comm_backend_internal.cpp @@ -89,16 +89,18 @@ AmHandler registerAmHandler(CompHandler handler) { return gCommBackend->registerAmHandler(handler); } -void issueAm(int rank, const void *msg, size_t size, mr_t mr, CompHandler localComp, - AmHandler remoteComp, void *user_context) { +void issueAm(int rank, const void *msg, size_t size, mr_t mr, + CompHandler localComp, AmHandler remoteComp, void *user_context) { if (gCommBackend == nullptr) { return; } - gCommBackend->issueAm(rank, msg, size, mr, localComp, remoteComp, user_context); + gCommBackend->issueAm(rank, msg, size, mr, localComp, remoteComp, + user_context); } void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context) { + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) { if (gCommBackend == nullptr) { return; } @@ -107,7 +109,8 @@ void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, } void issueRput(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context) { + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) { if (gCommBackend == nullptr) { return; } diff --git a/src/comm_backend/comm_backend_internal.h b/src/comm_backend/comm_backend_internal.h index 8df6cbf..0cfde8d 100644 --- a/src/comm_backend/comm_backend_internal.h +++ b/src/comm_backend/comm_backend_internal.h @@ -20,7 +20,8 @@ class CommBackendBase { virtual bool isRMACapable() { return false; } virtual AmHandler registerAmHandler(CompHandler handler) = 0; virtual void issueAm(int rank, const void *local_buf, size_t size, mr_t mr, - CompHandler localComp, AmHandler remoteComp, void *user_context) = 0; + CompHandler localComp, AmHandler remoteComp, + void *user_context) = 0; virtual void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, uintptr_t remote_disp, void *rmr, CompHandler localComp, void *user_context) { @@ -39,7 +40,7 @@ class CommBackendBase { virtual mr_t registerMemory(void *addr, size_t size) { return MR_NULL; } virtual size_t getRMR(mr_t mr, void *addr, size_t size) { return 0; } virtual void deregisterMemory(mr_t mr) {} - virtual ~CommBackendBase() {}; + virtual ~CommBackendBase() = default; }; } // namespace comm_backend diff --git a/src/comm_backend/lci2/comm_backend_lci2.cpp b/src/comm_backend/lci2/comm_backend_lci2.cpp index 0349d1c..5de5be0 100644 --- a/src/comm_backend/lci2/comm_backend_lci2.cpp +++ b/src/comm_backend/lci2/comm_backend_lci2.cpp @@ -87,8 +87,9 @@ AmHandler CommBackendLCI2::registerAmHandler(CompHandler handler) { return g_handlers.size() - 1; } -void CommBackendLCI2::issueAm(int rank, const void *local_buf, size_t size, mr_t mr, - CompHandler localComp, AmHandler remoteComp, void *user_context) { +void CommBackendLCI2::issueAm(int rank, const void *local_buf, size_t size, + mr_t mr, CompHandler localComp, + AmHandler remoteComp, void *user_context) { auto 
args = new localCallbackArgs{localComp, user_context}; lci::status_t status; do { diff --git a/src/comm_backend/lci2/comm_backend_lci2.h b/src/comm_backend/lci2/comm_backend_lci2.h index e1149ca..051afdf 100644 --- a/src/comm_backend/lci2/comm_backend_lci2.h +++ b/src/comm_backend/lci2/comm_backend_lci2.h @@ -24,13 +24,14 @@ class CommBackendLCI2 : public CommBackendBase { bool isRMACapable() override { return true; } AmHandler registerAmHandler(CompHandler handler) override; void issueAm(int rank, const void *local_buf, size_t size, mr_t mr, - CompHandler localComp, AmHandler remoteComp, void *user_context) override; + CompHandler localComp, AmHandler remoteComp, + void *user_context) override; void issueRget(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, - CompHandler localComp, void *user_context) override; + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) override; void issueRput(int rank, const void *local_buf, size_t size, mr_t local_mr, - uintptr_t remote_disp, void *rmr, - CompHandler localComp, void *user_context) override; + uintptr_t remote_disp, void *rmr, CompHandler localComp, + void *user_context) override; bool progress(void) override; void barrier(void) override; mr_t registerMemory(void *addr, size_t size) override; diff --git a/src/concurrentqueue.h b/src/concurrentqueue.h index db4835b..593680b 100644 --- a/src/concurrentqueue.h +++ b/src/concurrentqueue.h @@ -1,5 +1,5 @@ -// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free queue. -// An overview, including benchmark results, is provided here: +// Provides a C++11 implementation of a multi-producer, multi-consumer lock-free +// queue. An overview, including benchmark results, is provided here: // http://moodycamel.com/blog/2014/a-fast-general-purpose-lock-free-queue-for-c++ // The full design is also described in excruciating detail at: // http://moodycamel.com/blog/2014/detailed-design-of-a-lock-free-queue @@ -8,24 +8,26 @@ // Copyright (c) 2013-2020, Cameron Desrochers. // All rights reserved. // -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: // -// - Redistributions of source code must retain the above copyright notice, this list of -// conditions and the following disclaimer. -// - Redistributions in binary form must reproduce the above copyright notice, this list of -// conditions and the following disclaimer in the documentation and/or other materials -// provided with the distribution. +// - Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// - Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. // -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL -// THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT -// OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR -// TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, -// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. // Also dual-licensed under the Boost Software License (see LICENSE.md) @@ -33,8 +35,8 @@ #if defined(__GNUC__) && !defined(__INTEL_COMPILER) // Disable -Wconversion warnings (spuriously triggered when Traits::size_t and -// Traits::index_t are set to < 32 bits, causing integer promotion, causing warnings -// upon assigning any computed values) +// Traits::index_t are set to < 32 bits, causing integer promotion, causing +// warnings upon assigning any computed values) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" @@ -44,10 +46,11 @@ #endif #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) -// VS2019 with /W4 warns about constant conditional expressions but unless /std=c++17 or higher -// does not support `if constexpr`, so we have no choice but to simply disable the warning +// VS2019 with /W4 warns about constant conditional expressions but unless +// /std=c++17 or higher does not support `if constexpr`, so we have no choice +// but to simply disable the warning #pragma warning(push) -#pragma warning(disable: 4127) // conditional expression is constant +#pragma warning(disable : 4127) // conditional expression is constant #endif #if defined(__APPLE__) @@ -57,92 +60,117 @@ #ifdef MCDBGQ_USE_RELACY #include "relacy/relacy_std.hpp" #include "relacy_shims.h" -// We only use malloc/free anyway, and the delete macro messes up `= delete` method declarations. -// We'll override the default trait malloc ourselves without a macro. +// We only use malloc/free anyway, and the delete macro messes up `= delete` +// method declarations. We'll override the default trait malloc ourselves +// without a macro. #undef new #undef delete #undef malloc #undef free #else -#include // Requires C++11. Sorry VS2010. +#include // Requires C++11. Sorry VS2010. 
#include #endif -#include // for max_align_t +#include +#include +#include // for CHAR_BIT +#include // for max_align_t #include #include +#include +#include // used for thread exit synchronization +#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading #include -#include #include -#include -#include // for CHAR_BIT -#include -#include // partly for __WINPTHREADS_VERSION if on MinGW-w64 w/ POSIX threading -#include // used for thread exit synchronization - -// Platform-specific definitions of a numeric thread ID type and an invalid value -namespace moodycamel { namespace details { - template struct thread_id_converter { - typedef thread_id_t thread_id_numeric_size_t; - typedef thread_id_t thread_id_hash_t; - static thread_id_hash_t prehash(thread_id_t const& x) { return x; } - }; -} } + +// Platform-specific definitions of a numeric thread ID type and an invalid +// value +namespace moodycamel { +namespace details { +template struct thread_id_converter { + typedef thread_id_t thread_id_numeric_size_t; + typedef thread_id_t thread_id_hash_t; + static thread_id_hash_t prehash(thread_id_t const &x) { return x; } +}; +} // namespace details +} // namespace moodycamel #if defined(MCDBGQ_USE_RELACY) -namespace moodycamel { namespace details { - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; - static inline thread_id_t thread_id() { return rl::thread_index(); } -} } +namespace moodycamel { +namespace details { +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = 0xFFFFFFFFU; +static const thread_id_t invalid_thread_id2 = 0xFFFFFFFEU; +static inline thread_id_t thread_id() { return rl::thread_index(); } +} // namespace details +} // namespace moodycamel #elif defined(_WIN32) || defined(__WINDOWS__) || defined(__WIN32__) -// No sense pulling in windows.h in a header, we'll manually declare the function -// we use and rely on backwards-compatibility for this not to break -extern "C" __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); -namespace moodycamel { namespace details { - static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), "Expected size of unsigned long to be 32 bits on Windows"); - typedef std::uint32_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx - static const thread_id_t invalid_thread_id2 = 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used in practice. Note that all Win32 thread IDs are presently multiples of 4. - static inline thread_id_t thread_id() { return static_cast(::GetCurrentThreadId()); } -} } -#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || defined(MOODYCAMEL_NO_THREAD_LOCAL) -namespace moodycamel { namespace details { - static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, "std::thread::id is expected to be either 4 or 8 bytes"); - - typedef std::thread::id thread_id_t; - static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID - - // Note we don't define a invalid_thread_id2 since std::thread::id doesn't have one; it's - // only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined anyway, which it won't - // be. 
- static inline thread_id_t thread_id() { return std::this_thread::get_id(); } - - template struct thread_id_size { }; - template<> struct thread_id_size<4> { typedef std::uint32_t numeric_t; }; - template<> struct thread_id_size<8> { typedef std::uint64_t numeric_t; }; - - template<> struct thread_id_converter { - typedef thread_id_size::numeric_t thread_id_numeric_size_t; +// No sense pulling in windows.h in a header, we'll manually declare the +// function we use and rely on backwards-compatibility for this not to break +extern "C" + __declspec(dllimport) unsigned long __stdcall GetCurrentThreadId(void); +namespace moodycamel { +namespace details { +static_assert(sizeof(unsigned long) == sizeof(std::uint32_t), + "Expected size of unsigned long to be 32 bits on Windows"); +typedef std::uint32_t thread_id_t; +static const thread_id_t invalid_thread_id = + 0; // See http://blogs.msdn.com/b/oldnewthing/archive/2004/02/23/78395.aspx +static const thread_id_t invalid_thread_id2 = + 0xFFFFFFFFU; // Not technically guaranteed to be invalid, but is never used + // in practice. Note that all Win32 thread IDs are presently + // multiples of 4. +static inline thread_id_t thread_id() { + return static_cast(::GetCurrentThreadId()); +} +} // namespace details +} // namespace moodycamel +#elif defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \ + (defined(__APPLE__) && TARGET_OS_IPHONE) || defined(__MVS__) || \ + defined(MOODYCAMEL_NO_THREAD_LOCAL) +namespace moodycamel { +namespace details { +static_assert(sizeof(std::thread::id) == 4 || sizeof(std::thread::id) == 8, + "std::thread::id is expected to be either 4 or 8 bytes"); + +typedef std::thread::id thread_id_t; +static const thread_id_t invalid_thread_id; // Default ctor creates invalid ID + +// Note we don't define a invalid_thread_id2 since std::thread::id doesn't have +// one; it's only used if MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is defined +// anyway, which it won't be. 
+static inline thread_id_t thread_id() { return std::this_thread::get_id(); } + +template struct thread_id_size {}; +template <> struct thread_id_size<4> { + typedef std::uint32_t numeric_t; +}; +template <> struct thread_id_size<8> { + typedef std::uint64_t numeric_t; +}; + +template <> struct thread_id_converter { + typedef thread_id_size::numeric_t + thread_id_numeric_size_t; #ifndef __APPLE__ - typedef std::size_t thread_id_hash_t; + typedef std::size_t thread_id_hash_t; #else - typedef thread_id_numeric_size_t thread_id_hash_t; + typedef thread_id_numeric_size_t thread_id_hash_t; #endif - static thread_id_hash_t prehash(thread_id_t const& x) - { + static thread_id_hash_t prehash(thread_id_t const &x) { #ifndef __APPLE__ - return std::hash()(x); + return std::hash()(x); #else - return *reinterpret_cast(&x); + return *reinterpret_cast(&x); #endif - } - }; -} } + } +}; +} +} #else // Use a nice trick from this answer: http://stackoverflow.com/a/8438730/21475 -// In order to get a numeric thread ID in a platform-independent way, we use a thread-local -// static variable's address as a thread identifier :-) +// In order to get a numeric thread ID in a platform-independent way, we use a +// thread-local static variable's address as a thread identifier :-) #if defined(__GNUC__) || defined(__INTEL_COMPILER) #define MOODYCAMEL_THREADLOCAL __thread #elif defined(_MSC_VER) @@ -151,17 +179,25 @@ namespace moodycamel { namespace details { // Assume C++11 compliant compiler #define MOODYCAMEL_THREADLOCAL thread_local #endif -namespace moodycamel { namespace details { - typedef std::uintptr_t thread_id_t; - static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr - static const thread_id_t invalid_thread_id2 = 1; // Member accesses off a null pointer are also generally invalid. Plus it's not aligned. - inline thread_id_t thread_id() { static MOODYCAMEL_THREADLOCAL int x; return reinterpret_cast(&x); } -} } +namespace moodycamel { +namespace details { +typedef std::uintptr_t thread_id_t; +static const thread_id_t invalid_thread_id = 0; // Address can't be nullptr +static const thread_id_t invalid_thread_id2 = + 1; // Member accesses off a null pointer are also generally invalid. Plus + // it's not aligned. +inline thread_id_t thread_id() { + static MOODYCAMEL_THREADLOCAL int x; + return reinterpret_cast(&x); +} +} +} #endif // Constexpr if #ifndef MOODYCAMEL_CONSTEXPR_IF -#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || __cplusplus > 201402L +#if (defined(_MSC_VER) && defined(_HAS_CXX17) && _HAS_CXX17) || \ + __cplusplus > 201402L #define MOODYCAMEL_CONSTEXPR_IF if constexpr #define MOODYCAMEL_MAYBE_UNUSED [[maybe_unused]] #else @@ -172,18 +208,20 @@ namespace moodycamel { namespace details { // Exceptions #ifndef MOODYCAMEL_EXCEPTIONS_ENABLED -#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || (defined(__GNUC__) && defined(__EXCEPTIONS)) || (!defined(_MSC_VER) && !defined(__GNUC__)) +#if (defined(_MSC_VER) && defined(_CPPUNWIND)) || \ + (defined(__GNUC__) && defined(__EXCEPTIONS)) || \ + (!defined(_MSC_VER) && !defined(__GNUC__)) #define MOODYCAMEL_EXCEPTIONS_ENABLED #endif #endif #ifdef MOODYCAMEL_EXCEPTIONS_ENABLED #define MOODYCAMEL_TRY try -#define MOODYCAMEL_CATCH(...) catch(__VA_ARGS__) +#define MOODYCAMEL_CATCH(...) catch (__VA_ARGS__) #define MOODYCAMEL_RETHROW throw -#define MOODYCAMEL_THROW(expr) throw (expr) +#define MOODYCAMEL_THROW(expr) throw(expr) #else -#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF (true) -#define MOODYCAMEL_CATCH(...) 
else MOODYCAMEL_CONSTEXPR_IF (false) +#define MOODYCAMEL_TRY MOODYCAMEL_CONSTEXPR_IF(true) +#define MOODYCAMEL_CATCH(...) else MOODYCAMEL_CONSTEXPR_IF(false) #define MOODYCAMEL_RETHROW #define MOODYCAMEL_THROW(expr) #endif @@ -194,15 +232,40 @@ namespace moodycamel { namespace details { #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) true #define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) true #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1800 -// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when it shouldn't :-( -// We have to assume *all* non-trivial constructors may throw on VS2012! +// VS2012's std::is_nothrow_[move_]constructible is broken and returns true when +// it shouldn't :-( We have to assume *all* non-trivial constructors may throw +// on VS2012! #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value : std::is_trivially_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value \ + : std::is_trivially_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #elif defined(_MSC_VER) && defined(_NOEXCEPT) && _MSC_VER < 1900 #define MOODYCAMEL_NOEXCEPT _NOEXCEPT -#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) (std::is_rvalue_reference::value && std::is_move_constructible::value ? std::is_trivially_move_constructible::value || std::is_nothrow_move_constructible::value : std::is_trivially_copy_constructible::value || std::is_nothrow_copy_constructible::value) -#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) ((std::is_rvalue_reference::value && std::is_move_assignable::value ? std::is_trivially_move_assignable::value || std::is_nothrow_move_assignable::value : std::is_trivially_copy_assignable::value || std::is_nothrow_copy_assignable::value) && MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) +#define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) \ + (std::is_rvalue_reference::value && \ + std::is_move_constructible::value \ + ? std::is_trivially_move_constructible::value || \ + std::is_nothrow_move_constructible::value \ + : std::is_trivially_copy_constructible::value || \ + std::is_nothrow_copy_constructible::value) +#define MOODYCAMEL_NOEXCEPT_ASSIGN(type, valueType, expr) \ + ((std::is_rvalue_reference::value && \ + std::is_move_assignable::value \ + ? 
std::is_trivially_move_assignable::value || \ + std::is_nothrow_move_assignable::value \ + : std::is_trivially_copy_assignable::value || \ + std::is_nothrow_copy_assignable::value) && \ + MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr)) #else #define MOODYCAMEL_NOEXCEPT noexcept #define MOODYCAMEL_NOEXCEPT_CTOR(type, valueType, expr) noexcept(expr) @@ -214,18 +277,31 @@ namespace moodycamel { namespace details { #ifdef MCDBGQ_USE_RELACY #define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #else -// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 -// g++ <=4.7 doesn't support thread_local either. -// Finally, iOS/ARM doesn't have support for it either, and g++/ARM allows it to compile but it's unconfirmed to actually work -#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && (!defined(__MINGW32__) && !defined(__MINGW64__) || !defined(__WINPTHREADS_VERSION)) && (!defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) -// Assume `thread_local` is fully supported in all other C++11 compilers/platforms -#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; years ago several users report having problems with it on +// VS2013 doesn't support `thread_local`, and MinGW-w64 w/ POSIX threading has a +// crippling bug: http://sourceforge.net/p/mingw-w64/bugs/445 g++ <=4.7 doesn't +// support thread_local either. Finally, iOS/ARM doesn't have support for it +// either, and g++/ARM allows it to compile but it's unconfirmed to actually +// work +#if (!defined(_MSC_VER) || _MSC_VER >= 1900) && \ + (!defined(__MINGW32__) && !defined(__MINGW64__) || \ + !defined(__WINPTHREADS_VERSION)) && \ + (!defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) && \ + (!defined(__APPLE__) || !TARGET_OS_IPHONE) && !defined(__arm__) && \ + !defined(_M_ARM) && !defined(__aarch64__) && !defined(__MVS__) +// Assume `thread_local` is fully supported in all other C++11 +// compilers/platforms +#define MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED // tentatively enabled for now; + // years ago several users + // report having problems with + // it on #endif #endif #endif -// VS2012 doesn't support deleted functions. -// In this case, we declare the function normally but don't define it. A link error will be generated if the function is called. +// VS2012 doesn't support deleted functions. +// In this case, we declare the function normally but don't define it. A link +// error will be generated if the function is called. 
#ifndef MOODYCAMEL_DELETE_FUNCTION #if defined(_MSC_VER) && _MSC_VER < 1800 #define MOODYCAMEL_DELETE_FUNCTION @@ -234,54 +310,82 @@ namespace moodycamel { namespace details { #endif #endif -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #ifndef MOODYCAMEL_ALIGNAS -// VS2013 doesn't support alignas or alignof, and align() requires a constant literal +// VS2013 doesn't support alignas or alignof, and align() requires a constant +// literal #if defined(_MSC_VER) && _MSC_VER <= 1800 #define MOODYCAMEL_ALIGNAS(alignment) __declspec(align(alignment)) #define MOODYCAMEL_ALIGNOF(obj) __alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) typename details::Vs2013Aligned::value, T>::type - template struct Vs2013Aligned { }; // default, unsupported alignment - template struct Vs2013Aligned<1, T> { typedef __declspec(align(1)) T type; }; - template struct Vs2013Aligned<2, T> { typedef __declspec(align(2)) T type; }; - template struct Vs2013Aligned<4, T> { typedef __declspec(align(4)) T type; }; - template struct Vs2013Aligned<8, T> { typedef __declspec(align(8)) T type; }; - template struct Vs2013Aligned<16, T> { typedef __declspec(align(16)) T type; }; - template struct Vs2013Aligned<32, T> { typedef __declspec(align(32)) T type; }; - template struct Vs2013Aligned<64, T> { typedef __declspec(align(64)) T type; }; - template struct Vs2013Aligned<128, T> { typedef __declspec(align(128)) T type; }; - template struct Vs2013Aligned<256, T> { typedef __declspec(align(256)) T type; }; +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + typename details::Vs2013Aligned::value, T>::type +template +struct Vs2013Aligned {}; // default, unsupported alignment +template struct Vs2013Aligned<1, T> { + typedef __declspec(align(1)) T type; +}; +template struct Vs2013Aligned<2, T> { + typedef __declspec(align(2)) T type; +}; +template struct Vs2013Aligned<4, T> { + typedef __declspec(align(4)) T type; +}; +template struct Vs2013Aligned<8, T> { + typedef __declspec(align(8)) T type; +}; +template struct Vs2013Aligned<16, T> { + typedef __declspec(align(16)) T type; +}; +template struct Vs2013Aligned<32, T> { + typedef __declspec(align(32)) T type; +}; +template struct Vs2013Aligned<64, T> { + typedef __declspec(align(64)) T type; +}; +template struct Vs2013Aligned<128, T> { + typedef __declspec(align(128)) T type; +}; +template struct Vs2013Aligned<256, T> { + typedef __declspec(align(256)) T type; +}; #else - template struct identity { typedef T type; }; +template struct identity { + typedef T type; +}; #define MOODYCAMEL_ALIGNAS(alignment) alignas(alignment) #define MOODYCAMEL_ALIGNOF(obj) alignof(obj) -#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) alignas(alignof(obj)) typename details::identity::type +#define MOODYCAMEL_ALIGNED_TYPE_LIKE(T, obj) \ + alignas(alignof(obj)) typename details::identity::type #endif #endif -} } +} // namespace details +} // namespace moodycamel - -// TSAN can false report races in lock-free code. To enable TSAN to be used from projects that use this one, -// we can apply per-function compile-time suppression. -// See https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer +// TSAN can false report races in lock-free code. To enable TSAN to be used +// from projects that use this one, we can apply per-function compile-time +// suppression. 
See +// https://clang.llvm.org/docs/ThreadSanitizer.html#has-feature-thread-sanitizer #define MOODYCAMEL_NO_TSAN #if defined(__has_feature) - #if __has_feature(thread_sanitizer) - #undef MOODYCAMEL_NO_TSAN - #define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) - #endif // TSAN +#if __has_feature(thread_sanitizer) +#undef MOODYCAMEL_NO_TSAN +#define MOODYCAMEL_NO_TSAN __attribute__((no_sanitize("thread"))) +#endif // TSAN #endif // TSAN // Compiler-specific likely/unlikely hints -namespace moodycamel { namespace details { +namespace moodycamel { +namespace details { #if defined(__GNUC__) - static inline bool (likely)(bool x) { return __builtin_expect((x), true); } - static inline bool (unlikely)(bool x) { return __builtin_expect((x), false); } +static inline bool(likely)(bool x) { return __builtin_expect((x), true); } +static inline bool(unlikely)(bool x) { return __builtin_expect((x), false); } #else - static inline bool (likely)(bool x) { return x; } - static inline bool (unlikely)(bool x) { return x; } +static inline bool(likely)(bool x) { return x; } +static inline bool(unlikely)(bool x) { return x; } #endif -} } +} // namespace details +} // namespace moodycamel #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG #include "internal/concurrentqueue_internal_debug.h" @@ -289,28 +393,33 @@ namespace moodycamel { namespace details { namespace moodycamel { namespace details { - template - struct const_numeric_max { - static_assert(std::is_integral::value, "const_numeric_max can only be used with integers"); - static const T value = std::numeric_limits::is_signed - ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - static_cast(1) - : static_cast(-1); - }; +template struct const_numeric_max { + static_assert(std::is_integral::value, + "const_numeric_max can only be used with integers"); + static const T value = + std::numeric_limits::is_signed + ? (static_cast(1) << (sizeof(T) * CHAR_BIT - 1)) - + static_cast(1) + : static_cast(-1); +}; #if defined(__GLIBCXX__) - typedef ::max_align_t std_max_align_t; // libstdc++ forgot to add it to std:: for a while +typedef ::max_align_t + std_max_align_t; // libstdc++ forgot to add it to std:: for a while #else - typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can *only* be accessed via std:: +typedef std::max_align_t std_max_align_t; // Others (e.g. MSVC) insist it can + // *only* be accessed via std:: #endif - // Some platforms have incorrectly set max_align_t to a type with <8 bytes alignment even while supporting - // 8-byte aligned scalar values (*cough* 32-bit iOS). Work around this with our own union. See issue #64. - typedef union { - std_max_align_t x; - long long y; - void* z; - } max_align_t; -} +// Some platforms have incorrectly set max_align_t to a type with <8 bytes +// alignment even while supporting 8-byte aligned scalar values (*cough* 32-bit +// iOS). Work around this with our own union. See issue #64. +typedef union { + std_max_align_t x; + long long y; + void *z; +} max_align_t; +} // namespace details // Default traits for the ConcurrentQueue. To change some of the // traits without re-implementing all of them, inherit from this @@ -318,99 +427,103 @@ namespace details { // since the traits are used as a template type parameter, the // shadowed declarations will be used where defined, and the defaults // otherwise. -struct ConcurrentQueueDefaultTraits -{ - // General-purpose size type. std::size_t is strongly recommended. - typedef std::size_t size_t; - - // The type used for the enqueue and dequeue indices. 
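// Editor's illustration (sketch only, not part of the patched file): what
// details::const_numeric_max evaluates to for two concrete types. Assumes the
// header is reachable as "concurrentqueue.h".
#include "concurrentqueue.h"
#include <cstdint>

static_assert(moodycamel::details::const_numeric_max<std::uint16_t>::value ==
                  0xFFFFu,
              "unsigned: all bits set, i.e. static_cast<T>(-1)");
static_assert(moodycamel::details::const_numeric_max<std::int16_t>::value ==
                  32767,
              "signed: (1 << (N - 1)) - 1");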
Must be at least as - // large as size_t. Should be significantly larger than the number of elements - // you expect to hold at once, especially if you have a high turnover rate; - // for example, on 32-bit x86, if you expect to have over a hundred million - // elements or pump several million elements through your queue in a very - // short space of time, using a 32-bit type *may* trigger a race condition. - // A 64-bit int type is recommended in that case, and in practice will - // prevent a race condition no matter the usage of the queue. Note that - // whether the queue is lock-free with a 64-int type depends on the whether - // std::atomic is lock-free, which is platform-specific. - typedef std::size_t index_t; - - // Internally, all elements are enqueued and dequeued from multi-element - // blocks; this is the smallest controllable unit. If you expect few elements - // but many producers, a smaller block size should be favoured. For few producers - // and/or many elements, a larger block size is preferred. A sane default - // is provided. Must be a power of 2. - static const size_t BLOCK_SIZE = 32; - - // For explicit producers (i.e. when using a producer token), the block is - // checked for being empty by iterating through a list of flags, one per element. - // For large block sizes, this is too inefficient, and switching to an atomic - // counter-based approach is faster. The switch is made for block sizes strictly - // larger than this threshold. - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; - - // How many full blocks can be expected for a single explicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; - - // How many full blocks can be expected for a single implicit producer? This should - // reflect that number's maximum for optimal performance. Must be a power of 2. - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; - - // The initial size of the hash table mapping thread IDs to implicit producers. - // Note that the hash is resized every time it becomes half full. - // Must be a power of two, and either 0 or at least 1. If 0, implicit production - // (using the enqueue methods without an explicit producer token) is disabled. - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; - - // Controls the number of items that an explicit consumer (i.e. one with a token) - // must consume before it causes all consumers to rotate and move on to the next - // internal queue. - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = 256; - - // The maximum number of elements (inclusive) that can be enqueued to a sub-queue. - // Enqueue operations that would cause this limit to be surpassed will fail. Note - // that this limit is enforced at the block level (for performance reasons), i.e. - // it's rounded up to the nearest block size. - static const size_t MAX_SUBQUEUE_SIZE = details::const_numeric_max::value; - - // The number of times to spin before sleeping when waiting on a semaphore. - // Recommended values are on the order of 1000-10000 unless the number of - // consumer threads exceeds the number of idle cores (in which case try 0-100). - // Only affects instances of the BlockingConcurrentQueue. - static const int MAX_SEMA_SPINS = 10000; - - // Whether to recycle dynamically-allocated blocks into an internal free list or - // not. 
If false, only pre-allocated blocks (controlled by the constructor - // arguments) will be recycled, and all others will be `free`d back to the heap. - // Note that blocks consumed by explicit producers are only freed on destruction - // of the queue (not following destruction of the token) regardless of this trait. - static const bool RECYCLE_ALLOCATED_BLOCKS = false; - - +struct ConcurrentQueueDefaultTraits { + // General-purpose size type. std::size_t is strongly recommended. + typedef std::size_t size_t; + + // The type used for the enqueue and dequeue indices. Must be at least as + // large as size_t. Should be significantly larger than the number of elements + // you expect to hold at once, especially if you have a high turnover rate; + // for example, on 32-bit x86, if you expect to have over a hundred million + // elements or pump several million elements through your queue in a very + // short space of time, using a 32-bit type *may* trigger a race condition. + // A 64-bit int type is recommended in that case, and in practice will + // prevent a race condition no matter the usage of the queue. Note that + // whether the queue is lock-free with a 64-int type depends on the whether + // std::atomic is lock-free, which is platform-specific. + typedef std::size_t index_t; + + // Internally, all elements are enqueued and dequeued from multi-element + // blocks; this is the smallest controllable unit. If you expect few elements + // but many producers, a smaller block size should be favoured. For few + // producers and/or many elements, a larger block size is preferred. A sane + // default is provided. Must be a power of 2. + static const size_t BLOCK_SIZE = 32; + + // For explicit producers (i.e. when using a producer token), the block is + // checked for being empty by iterating through a list of flags, one per + // element. For large block sizes, this is too inefficient, and switching to + // an atomic counter-based approach is faster. The switch is made for block + // sizes strictly larger than this threshold. + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = 32; + + // How many full blocks can be expected for a single explicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = 32; + + // How many full blocks can be expected for a single implicit producer? This + // should reflect that number's maximum for optimal performance. Must be a + // power of 2. + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = 32; + + // The initial size of the hash table mapping thread IDs to implicit + // producers. Note that the hash is resized every time it becomes half full. + // Must be a power of two, and either 0 or at least 1. If 0, implicit + // production (using the enqueue methods without an explicit producer token) + // is disabled. + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = 32; + + // Controls the number of items that an explicit consumer (i.e. one with a + // token) must consume before it causes all consumers to rotate and move on to + // the next internal queue. + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + 256; + + // The maximum number of elements (inclusive) that can be enqueued to a + // sub-queue. Enqueue operations that would cause this limit to be surpassed + // will fail. Note that this limit is enforced at the block level (for + // performance reasons), i.e. it's rounded up to the nearest block size. 
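// Editor's illustration (sketch only, not part of the patched file): the
// intended way to change any of the knobs documented above is to inherit from
// ConcurrentQueueDefaultTraits, override only what you need, and pass the
// result as the queue's second template parameter. The values chosen here are
// arbitrary examples; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"
#include <cstdint>

struct BigBlockTraits : public moodycamel::ConcurrentQueueDefaultTraits {
  static const size_t BLOCK_SIZE = 256; // must remain a power of 2
  typedef std::uint64_t index_t;        // wide indices avoid wrap-around races
};

moodycamel::ConcurrentQueue<int, BigBlockTraits> tuned_queue;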
+ static const size_t MAX_SUBQUEUE_SIZE = + details::const_numeric_max::value; + + // The number of times to spin before sleeping when waiting on a semaphore. + // Recommended values are on the order of 1000-10000 unless the number of + // consumer threads exceeds the number of idle cores (in which case try + // 0-100). Only affects instances of the BlockingConcurrentQueue. + static const int MAX_SEMA_SPINS = 10000; + + // Whether to recycle dynamically-allocated blocks into an internal free list + // or not. If false, only pre-allocated blocks (controlled by the constructor + // arguments) will be recycled, and all others will be `free`d back to the + // heap. Note that blocks consumed by explicit producers are only freed on + // destruction of the queue (not following destruction of the token) + // regardless of this trait. + static const bool RECYCLE_ALLOCATED_BLOCKS = false; + #ifndef MCDBGQ_USE_RELACY - // Memory allocation can be customized if needed. - // malloc should return nullptr on failure, and handle alignment like std::malloc. + // Memory allocation can be customized if needed. + // malloc should return nullptr on failure, and handle alignment like + // std::malloc. #if defined(malloc) || defined(free) - // Gah, this is 2015, stop defining macros that break standard code already! - // Work around malloc/free being special macros: - static inline void* WORKAROUND_malloc(size_t size) { return malloc(size); } - static inline void WORKAROUND_free(void* ptr) { return free(ptr); } - static inline void* (malloc)(size_t size) { return WORKAROUND_malloc(size); } - static inline void (free)(void* ptr) { return WORKAROUND_free(ptr); } + // Gah, this is 2015, stop defining macros that break standard code already! + // Work around malloc/free being special macros: + static inline void *WORKAROUND_malloc(size_t size) { return malloc(size); } + static inline void WORKAROUND_free(void *ptr) { return free(ptr); } + static inline void *(malloc)(size_t size) { return WORKAROUND_malloc(size); } + static inline void(free)(void *ptr) { return WORKAROUND_free(ptr); } #else - static inline void* malloc(size_t size) { return std::malloc(size); } - static inline void free(void* ptr) { return std::free(ptr); } + static inline void *malloc(size_t size) { return std::malloc(size); } + static inline void free(void *ptr) { return std::free(ptr); } #endif #else - // Debug versions when running under the Relacy race detector (ignore - // these in user code) - static inline void* malloc(size_t size) { return rl::rl_malloc(size, $); } - static inline void free(void* ptr) { return rl::rl_free(ptr, $); } + // Debug versions when running under the Relacy race detector (ignore + // these in user code) + static inline void *malloc(size_t size) { return rl::rl_malloc(size, $); } + static inline void free(void *ptr) { return rl::rl_free(ptr, $); } #endif }; - // When producing or consuming many elements, the most efficient way is to: // 1) Use one of the bulk-operation methods of the queue with a token // 2) Failing that, use the bulk-operation methods without a token @@ -421,3322 +534,3729 @@ struct ConcurrentQueueDefaultTraits struct ProducerToken; struct ConsumerToken; -template class ConcurrentQueue; -template class BlockingConcurrentQueue; +template class ConcurrentQueue; +template class BlockingConcurrentQueue; class ConcurrentQueueTests; +namespace details { +struct ConcurrentQueueProducerTypelessBase { + ConcurrentQueueProducerTypelessBase *next; + std::atomic inactive; + ProducerToken *token; -namespace 
details -{ - struct ConcurrentQueueProducerTypelessBase - { - ConcurrentQueueProducerTypelessBase* next; - std::atomic inactive; - ProducerToken* token; - - ConcurrentQueueProducerTypelessBase() - : next(nullptr), inactive(false), token(nullptr) - { - } - }; - - template struct _hash_32_or_64 { - static inline std::uint32_t hash(std::uint32_t h) - { - // MurmurHash3 finalizer -- see https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp - // Since the thread ID is already unique, all we really want to do is propagate that - // uniqueness evenly across all the bits, so that we can use a subset of the bits while - // reducing collisions significantly - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - return h ^ (h >> 16); - } - }; - template<> struct _hash_32_or_64<1> { - static inline std::uint64_t hash(std::uint64_t h) - { - h ^= h >> 33; - h *= 0xff51afd7ed558ccd; - h ^= h >> 33; - h *= 0xc4ceb9fe1a85ec53; - return h ^ (h >> 33); - } - }; - template struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> { }; - - static inline size_t hash_thread_id(thread_id_t id) - { - static_assert(sizeof(thread_id_t) <= 8, "Expected a platform where thread IDs are at most 64-bit values"); - return static_cast(hash_32_or_64::thread_id_hash_t)>::hash( - thread_id_converter::prehash(id))); - } - - template - static inline bool circular_less_than(T a, T b) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "circular_less_than is intended to be used only with unsigned integer types"); - return static_cast(a - b) > static_cast(static_cast(1) << (static_cast(sizeof(T) * CHAR_BIT - 1))); - // Note: extra parens around rhs of operator<< is MSVC bug: https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 - // silencing the bug requires #pragma warning(disable: 4554) around the calling code and has no effect when done here. 
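// Editor's illustration (sketch only, not part of the patched file): what the
// circular comparison used for queue indices means in practice. The helper
// below fixes the index type to 8 bits purely so the numbers are easy to
// follow; it applies the same formula as details::circular_less_than.
#include <cstdint>

static inline bool circ_lt_u8(std::uint8_t a, std::uint8_t b) {
  return static_cast<std::uint8_t>(a - b) > static_cast<std::uint8_t>(1u << 7);
}
// circ_lt_u8(250, 2) == true: index 2 has wrapped around and is only 8 steps
// ahead of 250 on the ring, so 250 counts as "older" (less).
// circ_lt_u8(2, 250) == false for the same reason.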
- } - - template - static inline char* align_for(char* ptr) - { - const std::size_t alignment = std::alignment_of::value; - return ptr + (alignment - (reinterpret_cast(ptr) % alignment)) % alignment; - } - - template - static inline T ceil_to_pow_2(T x) - { - static_assert(std::is_integral::value && !std::numeric_limits::is_signed, "ceil_to_pow_2 is intended to be used only with unsigned integer types"); - - // Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 - --x; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - for (std::size_t i = 1; i < sizeof(T); i <<= 1) { - x |= x >> (i << 3); - } - ++x; - return x; - } - - template - static inline void swap_relaxed(std::atomic& left, std::atomic& right) - { - T temp = left.load(std::memory_order_relaxed); - left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); - right.store(temp, std::memory_order_relaxed); - } - - template - static inline T const& nomove(T const& x) - { - return x; - } - - template - struct nomove_if - { - template - static inline T const& eval(T const& x) - { - return x; - } - }; - - template<> - struct nomove_if - { - template - static inline auto eval(U&& x) - -> decltype(std::forward(x)) - { - return std::forward(x); - } - }; - - template - static inline auto deref_noexcept(It& it) MOODYCAMEL_NOEXCEPT -> decltype(*it) - { - return *it; - } - -#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) - template struct is_trivially_destructible : std::is_trivially_destructible { }; + ConcurrentQueueProducerTypelessBase() + : next(nullptr), inactive(false), token(nullptr) {} +}; + +template struct _hash_32_or_64 { + static inline std::uint32_t hash(std::uint32_t h) { + // MurmurHash3 finalizer -- see + // https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp + // Since the thread ID is already unique, all we really want to do is + // propagate that uniqueness evenly across all the bits, so that we can use + // a subset of the bits while reducing collisions significantly + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + return h ^ (h >> 16); + } +}; +template <> struct _hash_32_or_64<1> { + static inline std::uint64_t hash(std::uint64_t h) { + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + return h ^ (h >> 33); + } +}; +template +struct hash_32_or_64 : public _hash_32_or_64<(size > 4)> {}; + +static inline size_t hash_thread_id(thread_id_t id) { + static_assert( + sizeof(thread_id_t) <= 8, + "Expected a platform where thread IDs are at most 64-bit values"); + return static_cast( + hash_32_or_64::thread_id_hash_t)>:: + hash(thread_id_converter::prehash(id))); +} + +template static inline bool circular_less_than(T a, T b) { + static_assert(std::is_integral::value && + !std::numeric_limits::is_signed, + "circular_less_than is intended to be used only with unsigned " + "integer types"); + return static_cast(a - b) > + static_cast(static_cast(1) + << (static_cast(sizeof(T) * CHAR_BIT - 1))); + // Note: extra parens around rhs of operator<< is MSVC bug: + // https://developercommunity2.visualstudio.com/t/C4554-triggers-when-both-lhs-and-rhs-is/10034931 + // silencing the bug requires #pragma warning(disable: 4554) around the + // calling code and has no effect when done here. 
+} + +template static inline char *align_for(char *ptr) { + const std::size_t alignment = std::alignment_of::value; + return ptr + + (alignment - (reinterpret_cast(ptr) % alignment)) % + alignment; +} + +template static inline T ceil_to_pow_2(T x) { + static_assert( + std::is_integral::value && !std::numeric_limits::is_signed, + "ceil_to_pow_2 is intended to be used only with unsigned integer types"); + + // Adapted from + // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + --x; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + for (std::size_t i = 1; i < sizeof(T); i <<= 1) { + x |= x >> (i << 3); + } + ++x; + return x; +} + +template +static inline void swap_relaxed(std::atomic &left, std::atomic &right) { + T temp = left.load(std::memory_order_relaxed); + left.store(right.load(std::memory_order_relaxed), std::memory_order_relaxed); + right.store(temp, std::memory_order_relaxed); +} + +template static inline T const &nomove(T const &x) { return x; } + +template struct nomove_if { + template static inline T const &eval(T const &x) { return x; } +}; + +template <> struct nomove_if { + template + static inline auto eval(U &&x) -> decltype(std::forward(x)) { + return std::forward(x); + } +}; + +template +static inline auto deref_noexcept(It &it) MOODYCAMEL_NOEXCEPT -> decltype(*it) { + return *it; +} + +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ > 4 || \ + (__GNUC__ == 4 && __GNUC_MINOR__ >= 8) +template +struct is_trivially_destructible : std::is_trivially_destructible {}; #else - template struct is_trivially_destructible : std::has_trivial_destructor { }; +template +struct is_trivially_destructible : std::has_trivial_destructor {}; #endif - + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED #ifdef MCDBGQ_USE_RELACY - typedef RelacyThreadExitListener ThreadExitListener; - typedef RelacyThreadExitNotifier ThreadExitNotifier; +typedef RelacyThreadExitListener ThreadExitListener; +typedef RelacyThreadExitNotifier ThreadExitNotifier; #else - class ThreadExitNotifier; - - struct ThreadExitListener - { - typedef void (*callback_t)(void*); - callback_t callback; - void* userData; - - ThreadExitListener* next; // reserved for use by the ThreadExitNotifier - ThreadExitNotifier* chain; // reserved for use by the ThreadExitNotifier - }; - - class ThreadExitNotifier - { - public: - static void subscribe(ThreadExitListener* listener) - { - auto& tlsInst = instance(); - std::lock_guard guard(mutex()); - listener->next = tlsInst.tail; - listener->chain = &tlsInst; - tlsInst.tail = listener; - } - - static void unsubscribe(ThreadExitListener* listener) - { - std::lock_guard guard(mutex()); - if (!listener->chain) { - return; // race with ~ThreadExitNotifier - } - auto& tlsInst = *listener->chain; - listener->chain = nullptr; - ThreadExitListener** prev = &tlsInst.tail; - for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { - if (ptr == listener) { - *prev = ptr->next; - break; - } - prev = &ptr->next; - } - } - - private: - ThreadExitNotifier() : tail(nullptr) { } - ThreadExitNotifier(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - ThreadExitNotifier& operator=(ThreadExitNotifier const&) MOODYCAMEL_DELETE_FUNCTION; - - ~ThreadExitNotifier() - { - // This thread is about to exit, let everyone know! - assert(this == &instance() && "If this assert fails, you likely have a buggy compiler! 
Change the preprocessor conditions such that MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); - std::lock_guard guard(mutex()); - for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { - ptr->chain = nullptr; - ptr->callback(ptr->userData); - } - } - - // Thread-local - static inline ThreadExitNotifier& instance() - { - static thread_local ThreadExitNotifier notifier; - return notifier; - } - - static inline std::mutex& mutex() - { - // Must be static because the ThreadExitNotifier could be destroyed while unsubscribe is called - static std::mutex mutex; - return mutex; - } - - private: - ThreadExitListener* tail; - }; -#endif -#endif - - template struct static_is_lock_free_num { enum { value = 0 }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_CHAR_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_SHORT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_INT_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LONG_LOCK_FREE }; }; - template<> struct static_is_lock_free_num { enum { value = ATOMIC_LLONG_LOCK_FREE }; }; - template struct static_is_lock_free : static_is_lock_free_num::type> { }; - template<> struct static_is_lock_free { enum { value = ATOMIC_BOOL_LOCK_FREE }; }; - template struct static_is_lock_free { enum { value = ATOMIC_POINTER_LOCK_FREE }; }; -} +class ThreadExitNotifier; + +struct ThreadExitListener { + typedef void (*callback_t)(void *); + callback_t callback; + void *userData; + ThreadExitListener *next; // reserved for use by the ThreadExitNotifier + ThreadExitNotifier *chain; // reserved for use by the ThreadExitNotifier +}; + +class ThreadExitNotifier { +public: + static void subscribe(ThreadExitListener *listener) { + auto &tlsInst = instance(); + std::lock_guard guard(mutex()); + listener->next = tlsInst.tail; + listener->chain = &tlsInst; + tlsInst.tail = listener; + } + + static void unsubscribe(ThreadExitListener *listener) { + std::lock_guard guard(mutex()); + if (!listener->chain) { + return; // race with ~ThreadExitNotifier + } + auto &tlsInst = *listener->chain; + listener->chain = nullptr; + ThreadExitListener **prev = &tlsInst.tail; + for (auto ptr = tlsInst.tail; ptr != nullptr; ptr = ptr->next) { + if (ptr == listener) { + *prev = ptr->next; + break; + } + prev = &ptr->next; + } + } -struct ProducerToken -{ - template - explicit ProducerToken(ConcurrentQueue& queue); - - template - explicit ProducerToken(BlockingConcurrentQueue& queue); - - ProducerToken(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - : producer(other.producer) - { - other.producer = nullptr; - if (producer != nullptr) { - producer->token = this; - } - } - - inline ProducerToken& operator=(ProducerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ProducerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(producer, other.producer); - if (producer != nullptr) { - producer->token = this; - } - if (other.producer != nullptr) { - other.producer->token = &other; - } - } - - // A token is always valid unless: - // 1) Memory allocation failed during construction - // 2) It was moved via the move constructor - // (Note: assignment does a swap, leaving both potentially valid) - // 3) The associated queue was destroyed - // Note that if valid() returns true, that only indicates - // that the token is valid for use with a specific queue, - // but not which one; that's up to the user to track. 
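// Editor's illustration (sketch only, not part of the patched file): typical
// token usage. A token is created once per thread against a specific queue and
// then passed to the enqueue/dequeue calls. Function and variable names are
// illustrative; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"

inline void token_usage_sketch() {
  moodycamel::ConcurrentQueue<int> q;
  moodycamel::ProducerToken ptok(q); // fast path for this producer thread
  moodycamel::ConsumerToken ctok(q); // fast path for this consumer thread
  q.enqueue(ptok, 42);
  int item;
  if (q.try_dequeue(ctok, item)) {
    // item == 42 here
  }
}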
- inline bool valid() const { return producer != nullptr; } - - ~ProducerToken() - { - if (producer != nullptr) { - producer->token = nullptr; - producer->inactive.store(true, std::memory_order_release); - } - } - - // Disable copying and assignment - ProducerToken(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ProducerToken& operator=(ProducerToken const&) MOODYCAMEL_DELETE_FUNCTION; - private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - + ThreadExitNotifier() : tail(nullptr) {} + ThreadExitNotifier(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + ThreadExitNotifier & + operator=(ThreadExitNotifier const &) MOODYCAMEL_DELETE_FUNCTION; + + ~ThreadExitNotifier() { + // This thread is about to exit, let everyone know! + assert(this == &instance() && + "If this assert fails, you likely have a buggy compiler! Change the " + "preprocessor conditions such that " + "MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED is no longer defined."); + std::lock_guard guard(mutex()); + for (auto ptr = tail; ptr != nullptr; ptr = ptr->next) { + ptr->chain = nullptr; + ptr->callback(ptr->userData); + } + } + + // Thread-local + static inline ThreadExitNotifier &instance() { + static thread_local ThreadExitNotifier notifier; + return notifier; + } + + static inline std::mutex &mutex() { + // Must be static because the ThreadExitNotifier could be destroyed while + // unsubscribe is called + static std::mutex mutex; + return mutex; + } + +private: + ThreadExitListener *tail; +}; +#endif +#endif + +template struct static_is_lock_free_num { + enum { value = 0 }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_CHAR_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_SHORT_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_INT_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_LONG_LOCK_FREE }; +}; +template <> struct static_is_lock_free_num { + enum { value = ATOMIC_LLONG_LOCK_FREE }; +}; +template +struct static_is_lock_free + : static_is_lock_free_num::type> {}; +template <> struct static_is_lock_free { + enum { value = ATOMIC_BOOL_LOCK_FREE }; +}; +template struct static_is_lock_free { + enum { value = ATOMIC_POINTER_LOCK_FREE }; +}; +} // namespace details + +struct ProducerToken { + template + explicit ProducerToken(ConcurrentQueue &queue); + + template + explicit ProducerToken(BlockingConcurrentQueue &queue); + + ProducerToken(ProducerToken &&other) MOODYCAMEL_NOEXCEPT + : producer(other.producer) { + other.producer = nullptr; + if (producer != nullptr) { + producer->token = this; + } + } + + inline ProducerToken &operator=(ProducerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + void swap(ProducerToken &other) MOODYCAMEL_NOEXCEPT { + std::swap(producer, other.producer); + if (producer != nullptr) { + producer->token = this; + } + if (other.producer != nullptr) { + other.producer->token = &other; + } + } + + // A token is always valid unless: + // 1) Memory allocation failed during construction + // 2) It was moved via the move constructor + // (Note: assignment does a swap, leaving both potentially valid) + // 3) The associated queue was destroyed + // Note that if valid() returns true, that only indicates + // that the token is valid for use with a specific queue, + // but not which one; that's up to the user to track. 
+ inline bool valid() const { return producer != nullptr; } + + ~ProducerToken() { + if (producer != nullptr) { + producer->token = nullptr; + producer->inactive.store(true, std::memory_order_release); + } + } + + // Disable copying and assignment + ProducerToken(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ProducerToken &operator=(ProducerToken const &) MOODYCAMEL_DELETE_FUNCTION; + +private: + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + protected: - details::ConcurrentQueueProducerTypelessBase* producer; + details::ConcurrentQueueProducerTypelessBase *producer; }; +struct ConsumerToken { + template + explicit ConsumerToken(ConcurrentQueue &q); + + template + explicit ConsumerToken(BlockingConcurrentQueue &q); + + ConsumerToken(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT + : initialOffset(other.initialOffset), + lastKnownGlobalOffset(other.lastKnownGlobalOffset), + itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), + currentProducer(other.currentProducer), + desiredProducer(other.desiredProducer) {} + + inline ConsumerToken &operator=(ConsumerToken &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } -struct ConsumerToken -{ - template - explicit ConsumerToken(ConcurrentQueue& q); - - template - explicit ConsumerToken(BlockingConcurrentQueue& q); - - ConsumerToken(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - : initialOffset(other.initialOffset), lastKnownGlobalOffset(other.lastKnownGlobalOffset), itemsConsumedFromCurrent(other.itemsConsumedFromCurrent), currentProducer(other.currentProducer), desiredProducer(other.desiredProducer) - { - } - - inline ConsumerToken& operator=(ConsumerToken&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - void swap(ConsumerToken& other) MOODYCAMEL_NOEXCEPT - { - std::swap(initialOffset, other.initialOffset); - std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); - std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); - std::swap(currentProducer, other.currentProducer); - std::swap(desiredProducer, other.desiredProducer); - } - - // Disable copying and assignment - ConsumerToken(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; - ConsumerToken& operator=(ConsumerToken const&) MOODYCAMEL_DELETE_FUNCTION; + void swap(ConsumerToken &other) MOODYCAMEL_NOEXCEPT { + std::swap(initialOffset, other.initialOffset); + std::swap(lastKnownGlobalOffset, other.lastKnownGlobalOffset); + std::swap(itemsConsumedFromCurrent, other.itemsConsumedFromCurrent); + std::swap(currentProducer, other.currentProducer); + std::swap(desiredProducer, other.desiredProducer); + } + + // Disable copying and assignment + ConsumerToken(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; + ConsumerToken &operator=(ConsumerToken const &) MOODYCAMEL_DELETE_FUNCTION; private: - template friend class ConcurrentQueue; - friend class ConcurrentQueueTests; - + template friend class ConcurrentQueue; + friend class ConcurrentQueueTests; + private: // but shared with ConcurrentQueue - std::uint32_t initialOffset; - std::uint32_t lastKnownGlobalOffset; - std::uint32_t itemsConsumedFromCurrent; - details::ConcurrentQueueProducerTypelessBase* currentProducer; - details::ConcurrentQueueProducerTypelessBase* desiredProducer; + std::uint32_t initialOffset; + std::uint32_t lastKnownGlobalOffset; + std::uint32_t itemsConsumedFromCurrent; + details::ConcurrentQueueProducerTypelessBase *currentProducer; + details::ConcurrentQueueProducerTypelessBase *desiredProducer; }; // Need to forward-declare this swap 
because it's in a namespace. -// See http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT; +// See +// http://stackoverflow.com/questions/4492062/why-does-a-c-friend-class-need-a-forward-declaration-only-in-other-namespaces +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT; - -template -class ConcurrentQueue -{ +template +class ConcurrentQueue { public: - typedef ::moodycamel::ProducerToken producer_token_t; - typedef ::moodycamel::ConsumerToken consumer_token_t; - - typedef typename Traits::index_t index_t; - typedef typename Traits::size_t size_t; - - static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); - static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); - static const size_t EXPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); - static const size_t IMPLICIT_INITIAL_INDEX_SIZE = static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); - static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); - static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = static_cast(Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); + typedef ::moodycamel::ProducerToken producer_token_t; + typedef ::moodycamel::ConsumerToken consumer_token_t; + + typedef typename Traits::index_t index_t; + typedef typename Traits::size_t size_t; + + static const size_t BLOCK_SIZE = static_cast(Traits::BLOCK_SIZE); + static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = + static_cast(Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD); + static const size_t EXPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::EXPLICIT_INITIAL_INDEX_SIZE); + static const size_t IMPLICIT_INITIAL_INDEX_SIZE = + static_cast(Traits::IMPLICIT_INITIAL_INDEX_SIZE); + static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = + static_cast(Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE); + static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = + static_cast( + Traits::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE); #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4307) // + integral constant overflow (that's what the ternary expression is for!) -#pragma warning(disable: 4309) // static_cast: Truncation of constant value +#pragma warning(disable : 4307) // + integral constant overflow (that's what the + // ternary expression is for!) +#pragma warning(disable : 4309) // static_cast: Truncation of constant value #endif - static const size_t MAX_SUBQUEUE_SIZE = (details::const_numeric_max::value - static_cast(Traits::MAX_SUBQUEUE_SIZE) < BLOCK_SIZE) ? details::const_numeric_max::value : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + (BLOCK_SIZE - 1)) / BLOCK_SIZE * BLOCK_SIZE); + static const size_t MAX_SUBQUEUE_SIZE = + (details::const_numeric_max::value - + static_cast(Traits::MAX_SUBQUEUE_SIZE) < + BLOCK_SIZE) + ? 
details::const_numeric_max::value + : ((static_cast(Traits::MAX_SUBQUEUE_SIZE) + + (BLOCK_SIZE - 1)) / + BLOCK_SIZE * BLOCK_SIZE); #ifdef _MSC_VER #pragma warning(pop) #endif - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::size_t must be an unsigned integral type"); - static_assert(!std::numeric_limits::is_signed && std::is_integral::value, "Traits::index_t must be an unsigned integral type"); - static_assert(sizeof(index_t) >= sizeof(size_t), "Traits::index_t must be at least as wide as Traits::size_t"); - static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); - static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a power of 2 (and greater than 1)"); - static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && !(EXPLICIT_INITIAL_INDEX_SIZE & (EXPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && !(IMPLICIT_INITIAL_INDEX_SIZE & (IMPLICIT_INITIAL_INDEX_SIZE - 1)), "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and greater than 1)"); - static_assert((INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); - static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least 1 (or 0 to disable implicit enqueueing)"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::size_t must be an unsigned integral type"); + static_assert(!std::numeric_limits::is_signed && + std::is_integral::value, + "Traits::index_t must be an unsigned integral type"); + static_assert(sizeof(index_t) >= sizeof(size_t), + "Traits::index_t must be at least as wide as Traits::size_t"); + static_assert((BLOCK_SIZE > 1) && !(BLOCK_SIZE & (BLOCK_SIZE - 1)), + "Traits::BLOCK_SIZE must be a power of 2 (and at least 2)"); + static_assert((EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD > 1) && + !(EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD & + (EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD - 1)), + "Traits::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD must be a " + "power of 2 (and greater than 1)"); + static_assert((EXPLICIT_INITIAL_INDEX_SIZE > 1) && + !(EXPLICIT_INITIAL_INDEX_SIZE & + (EXPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::EXPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert((IMPLICIT_INITIAL_INDEX_SIZE > 1) && + !(IMPLICIT_INITIAL_INDEX_SIZE & + (IMPLICIT_INITIAL_INDEX_SIZE - 1)), + "Traits::IMPLICIT_INITIAL_INDEX_SIZE must be a power of 2 (and " + "greater than 1)"); + static_assert( + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) || + !(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE & + (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - 1)), + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be a power of 2"); + static_assert(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0 || + INITIAL_IMPLICIT_PRODUCER_HASH_SIZE >= 1, + "Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE must be at least " + "1 (or 0 to disable implicit enqueueing)"); public: - // Creates a queue with at least `capacity` element slots; note that the - // actual number of elements that can be inserted without additional 
memory - // allocation depends on the number of producers and the block size (e.g. if - // the block size is equal to `capacity`, only a single block will be allocated - // up-front, which means only a single producer will be able to enqueue elements - // without an extra allocation -- blocks aren't shared between producers). - // This method is not thread safe -- it is up to the user to ensure that the - // queue is fully constructed before it starts being used by other threads (this - // includes making the memory effects of construction visible, possibly with a - // memory barrier). - explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - populate_initial_block_list(capacity / BLOCK_SIZE + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); - + // Creates a queue with at least `capacity` element slots; note that the + // actual number of elements that can be inserted without additional memory + // allocation depends on the number of producers and the block size (e.g. if + // the block size is equal to `capacity`, only a single block will be + // allocated up-front, which means only a single producer will be able to + // enqueue elements without an extra allocation -- blocks aren't shared + // between producers). This method is not thread safe -- it is up to the user + // to ensure that the queue is fully constructed before it starts being used + // by other threads (this includes making the memory effects of construction + // visible, possibly with a memory barrier). + explicit ConcurrentQueue(size_t capacity = 32 * BLOCK_SIZE) + : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), + nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + populate_initial_block_list(capacity / BLOCK_SIZE + + ((capacity & (BLOCK_SIZE - 1)) == 0 ? 0 : 1)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - // Track all the producers using a fully-resolved typed list for - // each kind; this makes it possible to debug them starting from - // the root queue object (otherwise wacky casts are needed that - // don't compile in the debugger's expression evaluator). - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Computes the correct amount of pre-allocated blocks for you based - // on the minimum number of elements you want available at any given - // time, and the maximum concurrent number of each type of producer. 
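// Editor's illustration (sketch only, not part of the patched file): the two
// ways of sizing a queue up front that the comments above describe. Numbers
// are arbitrary; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"

// At least 1024 element slots, block pool derived from the default BLOCK_SIZE:
moodycamel::ConcurrentQueue<int> capacity_queue(1024);

// Enough pre-allocated blocks for 10000 elements shared by up to 4 explicit
// (token-using) producers and 8 implicit producers:
moodycamel::ConcurrentQueue<int> producer_sized_queue(10000, 4, 8);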
- ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers) - : producerListTail(nullptr), - producerCount(0), - initialBlockPoolIndex(0), - nextExplicitConsumerId(0), - globalExplicitConsumerOffset(0) - { - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * (maxExplicitProducers + 1) + 2 * (maxExplicitProducers + maxImplicitProducers); - populate_initial_block_list(blocks); - + // Track all the producers using a fully-resolved typed list for + // each kind; this makes it possible to debug them starting from + // the root queue object (otherwise wacky casts are needed that + // don't compile in the debugger's expression evaluator). + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Computes the correct amount of pre-allocated blocks for you based + // on the minimum number of elements you want available at any given + // time, and the maximum concurrent number of each type of producer. + ConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, + size_t maxImplicitProducers) + : producerListTail(nullptr), producerCount(0), initialBlockPoolIndex(0), + nextExplicitConsumerId(0), globalExplicitConsumerOffset(0) { + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + size_t blocks = (((minCapacity + BLOCK_SIZE - 1) / BLOCK_SIZE) - 1) * + (maxExplicitProducers + 1) + + 2 * (maxExplicitProducers + maxImplicitProducers); + populate_initial_block_list(blocks); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - } - - // Note: The queue should not be accessed concurrently while it's - // being deleted. It's up to the user to synchronize this. - // This method is not thread safe. - ~ConcurrentQueue() - { - // Destroy producers - auto ptr = producerListTail.load(std::memory_order_relaxed); - while (ptr != nullptr) { - auto next = ptr->next_prod(); - if (ptr->token != nullptr) { - ptr->token->producer = nullptr; - } - destroy(ptr); - ptr = next; - } - - // Destroy implicit producer hash tables - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { - auto hash = implicitProducerHash.load(std::memory_order_relaxed); - while (hash != nullptr) { - auto prev = hash->prev; - if (prev != nullptr) { // The last hash is part of this object and was not allocated dynamically - for (size_t i = 0; i != hash->capacity; ++i) { - hash->entries[i].~ImplicitProducerKVP(); - } - hash->~ImplicitProducerHash(); - (Traits::free)(hash); - } - hash = prev; - } - } - - // Destroy global free list - auto block = freeList.head_unsafe(); - while (block != nullptr) { - auto next = block->freeListNext.load(std::memory_order_relaxed); - if (block->dynamicallyAllocated) { - destroy(block); - } - block = next; - } - - // Destroy initial free list - destroy_array(initialBlockPool, initialBlockPoolSize); - } - - // Disable copying and copy assignment - ConcurrentQueue(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - ConcurrentQueue& operator=(ConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION; - - // Moving is supported, but note that it is *not* a thread-safe operation. 
- // Nobody can use the queue while it's being moved, and the memory effects - // of that move must be propagated to other threads before they can use it. - // Note: When a queue is moved, its tokens are still valid but can only be - // used with the destination queue (i.e. semantically they are moved along - // with the queue itself). - ConcurrentQueue(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - : producerListTail(other.producerListTail.load(std::memory_order_relaxed)), - producerCount(other.producerCount.load(std::memory_order_relaxed)), - initialBlockPoolIndex(other.initialBlockPoolIndex.load(std::memory_order_relaxed)), - initialBlockPool(other.initialBlockPool), - initialBlockPoolSize(other.initialBlockPoolSize), - freeList(std::move(other.freeList)), - nextExplicitConsumerId(other.nextExplicitConsumerId.load(std::memory_order_relaxed)), - globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load(std::memory_order_relaxed)) - { - // Move the other one into this, and leave the other one as an empty queue - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - populate_initial_implicit_producer_hash(); - swap_implicit_producer_hashes(other); - - other.producerListTail.store(nullptr, std::memory_order_relaxed); - other.producerCount.store(0, std::memory_order_relaxed); - other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); - other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); - + explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + } + + // Note: The queue should not be accessed concurrently while it's + // being deleted. It's up to the user to synchronize this. + // This method is not thread safe. + ~ConcurrentQueue() { + // Destroy producers + auto ptr = producerListTail.load(std::memory_order_relaxed); + while (ptr != nullptr) { + auto next = ptr->next_prod(); + if (ptr->token != nullptr) { + ptr->token->producer = nullptr; + } + destroy(ptr); + ptr = next; + } + + // Destroy implicit producer hash tables + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE != 0) { + auto hash = implicitProducerHash.load(std::memory_order_relaxed); + while (hash != nullptr) { + auto prev = hash->prev; + if (prev != nullptr) { // The last hash is part of this object and was + // not allocated dynamically + for (size_t i = 0; i != hash->capacity; ++i) { + hash->entries[i].~ImplicitProducerKVP(); + } + hash->~ImplicitProducerHash(); + (Traits::free)(hash); + } + hash = prev; + } + } + + // Destroy global free list + auto block = freeList.head_unsafe(); + while (block != nullptr) { + auto next = block->freeListNext.load(std::memory_order_relaxed); + if (block->dynamicallyAllocated) { + destroy(block); + } + block = next; + } + + // Destroy initial free list + destroy_array(initialBlockPool, initialBlockPoolSize); + } + + // Disable copying and copy assignment + ConcurrentQueue(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + ConcurrentQueue & + operator=(ConcurrentQueue const &) MOODYCAMEL_DELETE_FUNCTION; + + // Moving is supported, but note that it is *not* a thread-safe operation. + // Nobody can use the queue while it's being moved, and the memory effects + // of that move must be propagated to other threads before they can use it. + // Note: When a queue is moved, its tokens are still valid but can only be + // used with the destination queue (i.e. semantically they are moved along + // with the queue itself). 
+ ConcurrentQueue(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT + : producerListTail( + other.producerListTail.load(std::memory_order_relaxed)), + producerCount(other.producerCount.load(std::memory_order_relaxed)), + initialBlockPoolIndex( + other.initialBlockPoolIndex.load(std::memory_order_relaxed)), + initialBlockPool(other.initialBlockPool), + initialBlockPoolSize(other.initialBlockPoolSize), + freeList(std::move(other.freeList)), + nextExplicitConsumerId( + other.nextExplicitConsumerId.load(std::memory_order_relaxed)), + globalExplicitConsumerOffset(other.globalExplicitConsumerOffset.load( + std::memory_order_relaxed)) { + // Move the other one into this, and leave the other one as an empty queue + implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); + populate_initial_implicit_producer_hash(); + swap_implicit_producer_hashes(other); + + other.producerListTail.store(nullptr, std::memory_order_relaxed); + other.producerCount.store(0, std::memory_order_relaxed); + other.nextExplicitConsumerId.store(0, std::memory_order_relaxed); + other.globalExplicitConsumerOffset.store(0, std::memory_order_relaxed); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - explicitProducers.store(other.explicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.explicitProducers.store(nullptr, std::memory_order_relaxed); - implicitProducers.store(other.implicitProducers.load(std::memory_order_relaxed), std::memory_order_relaxed); - other.implicitProducers.store(nullptr, std::memory_order_relaxed); -#endif - - other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); - other.initialBlockPoolSize = 0; - other.initialBlockPool = nullptr; - - reown_producers(); - } - - inline ConcurrentQueue& operator=(ConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT - { - return swap_internal(other); - } - - // Swaps this queue's state with the other's. Not thread-safe. - // Swapping two queues does not invalidate their tokens, however - // the tokens that were created for one queue must be used with - // only the swapped queue (i.e. the tokens are tied to the - // queue's movable state, not the object itself). - inline void swap(ConcurrentQueue& other) MOODYCAMEL_NOEXCEPT - { - swap_internal(other); - } - + explicitProducers.store( + other.explicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.explicitProducers.store(nullptr, std::memory_order_relaxed); + implicitProducers.store( + other.implicitProducers.load(std::memory_order_relaxed), + std::memory_order_relaxed); + other.implicitProducers.store(nullptr, std::memory_order_relaxed); +#endif + + other.initialBlockPoolIndex.store(0, std::memory_order_relaxed); + other.initialBlockPoolSize = 0; + other.initialBlockPool = nullptr; + + reown_producers(); + } + + inline ConcurrentQueue & + operator=(ConcurrentQueue &&other) MOODYCAMEL_NOEXCEPT { + return swap_internal(other); + } + + // Swaps this queue's state with the other's. Not thread-safe. + // Swapping two queues does not invalidate their tokens, however + // the tokens that were created for one queue must be used with + // only the swapped queue (i.e. the tokens are tied to the + // queue's movable state, not the object itself). 
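// Editor's illustration (sketch only, not part of the patched file): what
// "tokens move with the queue" means for the move constructor described above.
// Names are illustrative; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"
#include <utility>

inline void move_semantics_sketch() {
  moodycamel::ConcurrentQueue<int> a;
  moodycamel::ProducerToken tok(a);
  a.enqueue(tok, 1);

  // Not thread-safe: no other thread may touch `a` while it is being moved.
  moodycamel::ConcurrentQueue<int> b(std::move(a));
  b.enqueue(tok, 2); // tok is now tied to `b`, not to the moved-from `a`

  int x;
  while (b.try_dequeue(x)) {
    // drains 1 and 2
  }
}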
+ inline void swap(ConcurrentQueue &other) MOODYCAMEL_NOEXCEPT { + swap_internal(other); + } + private: - ConcurrentQueue& swap_internal(ConcurrentQueue& other) - { - if (this == &other) { - return *this; - } - - details::swap_relaxed(producerListTail, other.producerListTail); - details::swap_relaxed(producerCount, other.producerCount); - details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); - std::swap(initialBlockPool, other.initialBlockPool); - std::swap(initialBlockPoolSize, other.initialBlockPoolSize); - freeList.swap(other.freeList); - details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); - details::swap_relaxed(globalExplicitConsumerOffset, other.globalExplicitConsumerOffset); - - swap_implicit_producer_hashes(other); - - reown_producers(); - other.reown_producers(); - + ConcurrentQueue &swap_internal(ConcurrentQueue &other) { + if (this == &other) { + return *this; + } + + details::swap_relaxed(producerListTail, other.producerListTail); + details::swap_relaxed(producerCount, other.producerCount); + details::swap_relaxed(initialBlockPoolIndex, other.initialBlockPoolIndex); + std::swap(initialBlockPool, other.initialBlockPool); + std::swap(initialBlockPoolSize, other.initialBlockPoolSize); + freeList.swap(other.freeList); + details::swap_relaxed(nextExplicitConsumerId, other.nextExplicitConsumerId); + details::swap_relaxed(globalExplicitConsumerOffset, + other.globalExplicitConsumerOffset); + + swap_implicit_producer_hashes(other); + + reown_producers(); + other.reown_producers(); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - details::swap_relaxed(explicitProducers, other.explicitProducers); - details::swap_relaxed(implicitProducers, other.implicitProducers); + details::swap_relaxed(explicitProducers, other.explicitProducers); + details::swap_relaxed(implicitProducers, other.implicitProducers); #endif - - return *this; - } - + + return *this; + } + public: - // Enqueues a single item (by copying it). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Allocates memory if required. Only fails if memory allocation fails (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, - // or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. - inline bool enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails (or - // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Thread-safe. 
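// Editor's illustration (sketch only, not part of the patched file): the
// allocation contract spelled out in the comments above. Names and numbers are
// illustrative; "concurrentqueue.h" is an assumed include path.
#include "concurrentqueue.h"

inline void enqueue_contract_sketch() {
  moodycamel::ConcurrentQueue<int> q(64); // at least 64 slots pre-allocated
  bool ok = q.enqueue(7);      // may allocate a new block; fails only on OOM
  bool fit = q.try_enqueue(8); // does not allocate new blocks; fails when full
  (void)ok;
  (void)fit;
}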
- inline bool enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Allocates memory if required. Only fails if memory allocation fails (or - // implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved instead of copied. - // Thread-safe. - template - bool enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Allocates memory if required. Only fails if memory allocation fails - // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - // Enqueues a single item (by copying it). - // Does not allocate memory. Fails if not enough room to enqueue (or implicit - // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE - // is 0). - // Thread-safe. - inline bool try_enqueue(T const& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(item); - } - - // Enqueues a single item (by moving it, if possible). - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Thread-safe. - inline bool try_enqueue(T&& item) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue(std::move(item)); - } - - // Enqueues a single item (by copying it) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T const& item) - { - return inner_enqueue(token, item); - } - - // Enqueues a single item (by moving it, if possible) using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Thread-safe. - inline bool try_enqueue(producer_token_t const& token, T&& item) - { - return inner_enqueue(token, std::move(item)); - } - - // Enqueues several items. - // Does not allocate memory (except for one-time implicit producer). - // Fails if not enough room to enqueue (or implicit production is - // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. - template - bool try_enqueue_bulk(It itemFirst, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) return false; - else return inner_enqueue_bulk(itemFirst, count); - } - - // Enqueues several items using an explicit producer token. - // Does not allocate memory. Fails if not enough room to enqueue. - // Note: Use std::make_move_iterator if the elements should be moved - // instead of copied. - // Thread-safe. 
- template - bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return inner_enqueue_bulk(token, itemFirst, count); - } - - - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - bool try_dequeue(U& item) - { - // Instead of simply trying each producer in turn (which could cause needless contention on the first - // producer), we score them heuristically. - size_t nonEmptyCount = 0; - ProducerBase* best = nullptr; - size_t bestSize = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { - auto size = ptr->size_approx(); - if (size > 0) { - if (size > bestSize) { - bestSize = size; - best = ptr; - } - ++nonEmptyCount; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (nonEmptyCount > 0) { - if ((details::likely)(best->dequeue(item))) { - return true; - } - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr != best && ptr->dequeue(item)) { - return true; - } - } - } - return false; - } - - // Attempts to dequeue from the queue. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // This differs from the try_dequeue(item) method in that this one does - // not attempt to reduce contention by interleaving the order that producer - // streams are dequeued from. So, using this method can reduce overall throughput - // under contention, but will give more predictable results in single-threaded - // consumer scenarios. This is mostly only useful for internal unit tests. - // Never allocates. Thread-safe. - template - bool try_dequeue_non_interleaved(U& item) - { - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->dequeue(item)) { - return true; - } - } - return false; - } - - // Attempts to dequeue from the queue using an explicit consumer token. - // Returns false if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - bool try_dequeue(consumer_token_t& token, U& item) - { - // The idea is roughly as follows: - // Every 256 items from one producer, make everyone rotate (increase the global offset) -> this means the highest efficiency consumer dictates the rotation speed of everyone else, more or less - // If you see that the global offset has changed, you must reset your consumption counter and move to your designated place - // If there's no items where you're supposed to be, keep moving until you find a producer with some items - // If the global offset has not changed but you've run out of items to consume, move over from your current position until you find an producer with something in it - - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return false; - } - } - - // If there was at least one non-empty queue but it appears empty at the time - // we try to dequeue from it, we need to make sure every queue's been tried - if (static_cast(token.currentProducer)->dequeue(item)) { - if (++token.itemsConsumedFromCurrent == EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return true; - } - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - if (ptr->dequeue(item)) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = 1; - return true; - } - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return false; - } - - // Attempts to dequeue several elements from the queue. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - size_t try_dequeue_bulk(It itemFirst, size_t max) - { - size_t count = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - count += ptr->dequeue_bulk(itemFirst, max - count); - if (count == max) { - break; - } - } - return count; - } - - // Attempts to dequeue several elements from the queue using an explicit consumer token. - // Returns the number of items actually dequeued. - // Returns 0 if all producer streams appeared empty at the time they - // were checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. 
- template - size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max) - { - if (token.desiredProducer == nullptr || token.lastKnownGlobalOffset != globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { - if (!update_current_producer_after_rotation(token)) { - return 0; - } - } - - size_t count = static_cast(token.currentProducer)->dequeue_bulk(itemFirst, max); - if (count == max) { - if ((token.itemsConsumedFromCurrent += static_cast(max)) >= EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { - globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); - } - return max; - } - token.itemsConsumedFromCurrent += static_cast(count); - max -= count; - - auto tail = producerListTail.load(std::memory_order_acquire); - auto ptr = static_cast(token.currentProducer)->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - while (ptr != static_cast(token.currentProducer)) { - auto dequeued = ptr->dequeue_bulk(itemFirst, max); - count += dequeued; - if (dequeued != 0) { - token.currentProducer = ptr; - token.itemsConsumedFromCurrent = static_cast(dequeued); - } - if (dequeued == max) { - break; - } - max -= dequeued; - ptr = ptr->next_prod(); - if (ptr == nullptr) { - ptr = tail; - } - } - return count; - } - - - - // Attempts to dequeue from a specific producer's inner queue. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns false if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline bool try_dequeue_from_producer(producer_token_t const& producer, U& item) - { - return static_cast(producer.producer)->dequeue(item); - } - - // Attempts to dequeue several elements from a specific producer's inner queue. - // Returns the number of items actually dequeued. - // If you happen to know which producer you want to dequeue from, this - // is significantly faster than using the general-case try_dequeue methods. - // Returns 0 if the producer's queue appeared empty at the time it - // was checked (so, the queue is likely but not guaranteed to be empty). - // Never allocates. Thread-safe. - template - inline size_t try_dequeue_bulk_from_producer(producer_token_t const& producer, It itemFirst, size_t max) - { - return static_cast(producer.producer)->dequeue_bulk(itemFirst, max); - } - - - // Returns an estimate of the total number of elements currently in the queue. This - // estimate is only accurate if the queue has completely stabilized before it is called - // (i.e. all enqueue and dequeue operations have completed and their memory effects are - // visible on the calling thread, and no further operations start while this method is - // being called). - // Thread-safe. - size_t size_approx() const - { - size_t size = 0; - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - size += ptr->size_approx(); - } - return size; - } - - - // Returns true if the underlying atomic variables used by - // the queue are lock-free (they should be on most platforms). - // Thread-safe. 
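The enqueue/try_enqueue overloads and the try_dequeue family in this hunk are only being reformatted; their semantics are unchanged. A minimal sketch of the common calls, with the moodycamel namespace and ProducerToken construction assumed from the upstream API:

void basic_usage() {
  moodycamel::ConcurrentQueue<int> q;

  q.enqueue(1);                  // may allocate; fails only if allocation fails
  q.try_enqueue(2);              // never allocates (beyond the one-time implicit producer)

  moodycamel::ProducerToken ptok(q);
  q.enqueue(ptok, 3);            // explicit-producer path, no implicit-producer lookup

  int v;
  while (q.try_dequeue(v)) {
    // consume v; try_dequeue returns false once every producer stream looked empty
  }
}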
- static constexpr bool is_lock_free() - { - return - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::value == 2 && - details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; - } + // Enqueues a single item (by copying it). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + // Enqueues a single item (by copying it) using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails (or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Thread-safe. + inline bool enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Allocates memory if required. Only fails if memory + // allocation fails (or Traits::MAX_SUBQUEUE_SIZE has been defined and would + // be surpassed). Thread-safe. + inline bool enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Allocates memory if required. Only fails if memory allocation fails (or + // implicit production is disabled because + // Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0, or + // Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). Note: + // Use std::make_move_iterator if the elements should be moved instead of + // copied. Thread-safe. + template bool enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Allocates memory if required. Only fails if memory allocation fails + // (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool enqueue_bulk(producer_token_t const &token, It itemFirst, size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Enqueues a single item (by copying it). + // Does not allocate memory. Fails if not enough room to enqueue (or implicit + // production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE + // is 0). + // Thread-safe. 
+ inline bool try_enqueue(T const &item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(item); + } + + // Enqueues a single item (by moving it, if possible). + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Thread-safe. + inline bool try_enqueue(T &&item) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue(std::move(item)); + } + + // Enqueues a single item (by copying it) using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T const &item) { + return inner_enqueue(token, item); + } + + // Enqueues a single item (by moving it, if possible) using an explicit + // producer token. Does not allocate memory. Fails if not enough room to + // enqueue. Thread-safe. + inline bool try_enqueue(producer_token_t const &token, T &&item) { + return inner_enqueue(token, std::move(item)); + } + + // Enqueues several items. + // Does not allocate memory (except for one-time implicit producer). + // Fails if not enough room to enqueue (or implicit production is + // disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0). + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template bool try_enqueue_bulk(It itemFirst, size_t count) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) + return false; + else return inner_enqueue_bulk(itemFirst, count); + } + + // Enqueues several items using an explicit producer token. + // Does not allocate memory. Fails if not enough room to enqueue. + // Note: Use std::make_move_iterator if the elements should be moved + // instead of copied. + // Thread-safe. + template + bool try_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return inner_enqueue_bulk(token, itemFirst, count); + } + + // Attempts to dequeue from the queue. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template bool try_dequeue(U &item) { + // Instead of simply trying each producer in turn (which could cause + // needless contention on the first producer), we score them heuristically. + size_t nonEmptyCount = 0; + ProducerBase *best = nullptr; + size_t bestSize = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + nonEmptyCount < 3 && ptr != nullptr; ptr = ptr->next_prod()) { + auto size = ptr->size_approx(); + if (size > 0) { + if (size > bestSize) { + bestSize = size; + best = ptr; + } + ++nonEmptyCount; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (nonEmptyCount > 0) { + if ((details::likely)(best->dequeue(item))) { + return true; + } + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr != best && ptr->dequeue(item)) { + return true; + } + } + } + return false; + } + + // Attempts to dequeue from the queue. 
+ // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // This differs from the try_dequeue(item) method in that this one does + // not attempt to reduce contention by interleaving the order that producer + // streams are dequeued from. So, using this method can reduce overall + // throughput under contention, but will give more predictable results in + // single-threaded consumer scenarios. This is mostly only useful for internal + // unit tests. Never allocates. Thread-safe. + template bool try_dequeue_non_interleaved(U &item) { + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->dequeue(item)) { + return true; + } + } + return false; + } + + // Attempts to dequeue from the queue using an explicit consumer token. + // Returns false if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template bool try_dequeue(consumer_token_t &token, U &item) { + // The idea is roughly as follows: + // Every 256 items from one producer, make everyone rotate (increase the + // global offset) -> this means the highest efficiency consumer dictates the + // rotation speed of everyone else, more or less If you see that the global + // offset has changed, you must reset your consumption counter and move to + // your designated place If there's no items where you're supposed to be, + // keep moving until you find a producer with some items If the global + // offset has not changed but you've run out of items to consume, move over + // from your current position until you find an producer with something in + // it + + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return false; + } + } + + // If there was at least one non-empty queue but it appears empty at the + // time we try to dequeue from it, we need to make sure every queue's been + // tried + if (static_cast(token.currentProducer)->dequeue(item)) { + if (++token.itemsConsumedFromCurrent == + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return true; + } + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + if (ptr->dequeue(item)) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = 1; + return true; + } + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return false; + } + + // Attempts to dequeue several elements from the queue. + // Returns the number of items actually dequeued. + // Returns 0 if all producer streams appeared empty at the time they + // were checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. 
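The token-based try_dequeue above pins a consumer to one producer and rotates it after EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE items, as its comment describes. A dedicated-consumer loop would look roughly like this (sketch; ConsumerToken construction is assumed from the upstream API):

void drain(moodycamel::ConcurrentQueue<int> &q) {
  moodycamel::ConsumerToken ctok(q);  // tracks desiredProducer and lastKnownGlobalOffset
  int v;
  while (q.try_dequeue(ctok, v)) {
    // consume v; the token stays on one producer until the rotation quota is hit
  }
}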
+ template size_t try_dequeue_bulk(It itemFirst, size_t max) { + size_t count = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + count += ptr->dequeue_bulk(itemFirst, max - count); + if (count == max) { + break; + } + } + return count; + } + + // Attempts to dequeue several elements from the queue using an explicit + // consumer token. Returns the number of items actually dequeued. Returns 0 if + // all producer streams appeared empty at the time they were checked (so, the + // queue is likely but not guaranteed to be empty). Never allocates. + // Thread-safe. + template + size_t try_dequeue_bulk(consumer_token_t &token, It itemFirst, size_t max) { + if (token.desiredProducer == nullptr || + token.lastKnownGlobalOffset != + globalExplicitConsumerOffset.load(std::memory_order_relaxed)) { + if (!update_current_producer_after_rotation(token)) { + return 0; + } + } + + size_t count = static_cast(token.currentProducer) + ->dequeue_bulk(itemFirst, max); + if (count == max) { + if ((token.itemsConsumedFromCurrent += static_cast(max)) >= + EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE) { + globalExplicitConsumerOffset.fetch_add(1, std::memory_order_relaxed); + } + return max; + } + token.itemsConsumedFromCurrent += static_cast(count); + max -= count; + + auto tail = producerListTail.load(std::memory_order_acquire); + auto ptr = static_cast(token.currentProducer)->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + while (ptr != static_cast(token.currentProducer)) { + auto dequeued = ptr->dequeue_bulk(itemFirst, max); + count += dequeued; + if (dequeued != 0) { + token.currentProducer = ptr; + token.itemsConsumedFromCurrent = static_cast(dequeued); + } + if (dequeued == max) { + break; + } + max -= dequeued; + ptr = ptr->next_prod(); + if (ptr == nullptr) { + ptr = tail; + } + } + return count; + } + + // Attempts to dequeue from a specific producer's inner queue. + // If you happen to know which producer you want to dequeue from, this + // is significantly faster than using the general-case try_dequeue methods. + // Returns false if the producer's queue appeared empty at the time it + // was checked (so, the queue is likely but not guaranteed to be empty). + // Never allocates. Thread-safe. + template + inline bool try_dequeue_from_producer(producer_token_t const &producer, + U &item) { + return static_cast(producer.producer)->dequeue(item); + } + + // Attempts to dequeue several elements from a specific producer's inner + // queue. Returns the number of items actually dequeued. If you happen to know + // which producer you want to dequeue from, this is significantly faster than + // using the general-case try_dequeue methods. Returns 0 if the producer's + // queue appeared empty at the time it was checked (so, the queue is likely + // but not guaranteed to be empty). Never allocates. Thread-safe. + template + inline size_t try_dequeue_bulk_from_producer(producer_token_t const &producer, + It itemFirst, size_t max) { + return static_cast(producer.producer) + ->dequeue_bulk(itemFirst, max); + } + + // Returns an estimate of the total number of elements currently in the queue. + // This estimate is only accurate if the queue has completely stabilized + // before it is called (i.e. all enqueue and dequeue operations have completed + // and their memory effects are visible on the calling thread, and no further + // operations start while this method is being called). Thread-safe. 
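The bulk variants above take an iterator plus a count, and the *_from_producer variants take that producer's own token. A short sketch, under the same assumptions as the earlier examples:

void bulk_usage(moodycamel::ConcurrentQueue<int> &q,
                moodycamel::ProducerToken &ptok) {
  int in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  q.enqueue_bulk(in, 8);                  // copies; wrap with std::make_move_iterator to move

  int out[8];
  size_t n = q.try_dequeue_bulk(out, 8);  // number actually dequeued, possibly fewer than 8
  size_t m = q.try_dequeue_bulk_from_producer(ptok, out, 8);  // single-producer fast path

  // size_approx() is exact only once all operations have quiesced
  size_t approx = q.size_approx();
  (void)n; (void)m; (void)approx;
}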
+ size_t size_approx() const { + size_t size = 0; + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + size += ptr->size_approx(); + } + return size; + } + + // Returns true if the underlying atomic variables used by + // the queue are lock-free (they should be on most platforms). + // Thread-safe. + static constexpr bool is_lock_free() { + return details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::value == 2 && + details::static_is_lock_free::thread_id_numeric_size_t>::value == 2; + } private: - friend struct ProducerToken; - friend struct ConsumerToken; - struct ExplicitProducer; - friend struct ExplicitProducer; - struct ImplicitProducer; - friend struct ImplicitProducer; - friend class ConcurrentQueueTests; - - enum AllocationMode { CanAlloc, CannotAlloc }; - - - /////////////////////////////// - // Queue methods - /////////////////////////////// - - template - inline bool inner_enqueue(producer_token_t const& token, U&& element) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue(U&& element) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue(std::forward(element)); - } - - template - inline bool inner_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count) - { - return static_cast(token.producer)->ConcurrentQueue::ExplicitProducer::template enqueue_bulk(itemFirst, count); - } - - template - inline bool inner_enqueue_bulk(It itemFirst, size_t count) - { - auto producer = get_or_add_implicit_producer(); - return producer == nullptr ? false : producer->ConcurrentQueue::ImplicitProducer::template enqueue_bulk(itemFirst, count); - } - - inline bool update_current_producer_after_rotation(consumer_token_t& token) - { - // Ah, there's been a rotation, figure out where we should be! - auto tail = producerListTail.load(std::memory_order_acquire); - if (token.desiredProducer == nullptr && tail == nullptr) { - return false; - } - auto prodCount = producerCount.load(std::memory_order_relaxed); - auto globalOffset = globalExplicitConsumerOffset.load(std::memory_order_relaxed); - if ((details::unlikely)(token.desiredProducer == nullptr)) { - // Aha, first time we're dequeueing anything. 
- // Figure out our local position - // Note: offset is from start, not end, but we're traversing from end -- subtract from count first - std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); - token.desiredProducer = tail; - for (std::uint32_t i = 0; i != offset; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - } - - std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; - if (delta >= prodCount) { - delta = delta % prodCount; - } - for (std::uint32_t i = 0; i != delta; ++i) { - token.desiredProducer = static_cast(token.desiredProducer)->next_prod(); - if (token.desiredProducer == nullptr) { - token.desiredProducer = tail; - } - } - - token.lastKnownGlobalOffset = globalOffset; - token.currentProducer = token.desiredProducer; - token.itemsConsumedFromCurrent = 0; - return true; - } - - - /////////////////////////// - // Free list - /////////////////////////// - - template - struct FreeListNode - { - FreeListNode() : freeListRefs(0), freeListNext(nullptr) { } - - std::atomic freeListRefs; - std::atomic freeListNext; - }; - - // A simple CAS-based lock-free free list. Not the fastest thing in the world under heavy contention, but - // simple and correct (assuming nodes are never freed until after the free list is destroyed), and fairly - // speedy under low contention. - template // N must inherit FreeListNode or have the same fields (and initialization of them) - struct FreeList - { - FreeList() : freeListHead(nullptr) { } - FreeList(FreeList&& other) : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { other.freeListHead.store(nullptr, std::memory_order_relaxed); } - void swap(FreeList& other) { details::swap_relaxed(freeListHead, other.freeListHead); } - - FreeList(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - FreeList& operator=(FreeList const&) MOODYCAMEL_DELETE_FUNCTION; - - inline void add(N* node) - { + friend struct ProducerToken; + friend struct ConsumerToken; + struct ExplicitProducer; + friend struct ExplicitProducer; + struct ImplicitProducer; + friend struct ImplicitProducer; + friend class ConcurrentQueueTests; + + enum AllocationMode { CanAlloc, CannotAlloc }; + + /////////////////////////////// + // Queue methods + /////////////////////////////// + + template + inline bool inner_enqueue(producer_token_t const &token, U &&element) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue( + std::forward(element)); + } + + template + inline bool inner_enqueue(U &&element) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer::template enqueue< + canAlloc>(std::forward(element)); + } + + template + inline bool inner_enqueue_bulk(producer_token_t const &token, It itemFirst, + size_t count) { + return static_cast(token.producer) + ->ConcurrentQueue::ExplicitProducer::template enqueue_bulk( + itemFirst, count); + } + + template + inline bool inner_enqueue_bulk(It itemFirst, size_t count) { + auto producer = get_or_add_implicit_producer(); + return producer == nullptr + ? false + : producer->ConcurrentQueue::ImplicitProducer:: + template enqueue_bulk(itemFirst, count); + } + + inline bool update_current_producer_after_rotation(consumer_token_t &token) { + // Ah, there's been a rotation, figure out where we should be! 
+ auto tail = producerListTail.load(std::memory_order_acquire); + if (token.desiredProducer == nullptr && tail == nullptr) { + return false; + } + auto prodCount = producerCount.load(std::memory_order_relaxed); + auto globalOffset = + globalExplicitConsumerOffset.load(std::memory_order_relaxed); + if ((details::unlikely)(token.desiredProducer == nullptr)) { + // Aha, first time we're dequeueing anything. + // Figure out our local position + // Note: offset is from start, not end, but we're traversing from end -- + // subtract from count first + std::uint32_t offset = prodCount - 1 - (token.initialOffset % prodCount); + token.desiredProducer = tail; + for (std::uint32_t i = 0; i != offset; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + } + + std::uint32_t delta = globalOffset - token.lastKnownGlobalOffset; + if (delta >= prodCount) { + delta = delta % prodCount; + } + for (std::uint32_t i = 0; i != delta; ++i) { + token.desiredProducer = + static_cast(token.desiredProducer)->next_prod(); + if (token.desiredProducer == nullptr) { + token.desiredProducer = tail; + } + } + + token.lastKnownGlobalOffset = globalOffset; + token.currentProducer = token.desiredProducer; + token.itemsConsumedFromCurrent = 0; + return true; + } + + /////////////////////////// + // Free list + /////////////////////////// + + template struct FreeListNode { + FreeListNode() : freeListRefs(0), freeListNext(nullptr) {} + + std::atomic freeListRefs; + std::atomic freeListNext; + }; + + // A simple CAS-based lock-free free list. Not the fastest thing in the world + // under heavy contention, but simple and correct (assuming nodes are never + // freed until after the free list is destroyed), and fairly speedy under low + // contention. + template // N must inherit FreeListNode or have the same fields + // (and initialization of them) + struct FreeList { + FreeList() : freeListHead(nullptr) {} + FreeList(FreeList &&other) + : freeListHead(other.freeListHead.load(std::memory_order_relaxed)) { + other.freeListHead.store(nullptr, std::memory_order_relaxed); + } + void swap(FreeList &other) { + details::swap_relaxed(freeListHead, other.freeListHead); + } + + FreeList(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + FreeList &operator=(FreeList const &) MOODYCAMEL_DELETE_FUNCTION; + + inline void add(N *node) { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - // We know that the should-be-on-freelist bit is 0 at this point, so it's safe to - // set it using a fetch_add - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, std::memory_order_acq_rel) == 0) { - // Oh look! We were the last ones referencing this node, and we know - // we want to add it to the free list, so let's do it! - add_knowing_refcount_is_zero(node); - } - } - - inline N* try_get() - { + debug::DebugLock lock(mutex); +#endif + // We know that the should-be-on-freelist bit is 0 at this point, so it's + // safe to set it using a fetch_add + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST, + std::memory_order_acq_rel) == 0) { + // Oh look! We were the last ones referencing this node, and we know + // we want to add it to the free list, so let's do it! 
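// Illustration only, not part of the header: freeListRefs packs a 31-bit
// reference count (REFS_MASK = 0x7FFFFFFF) together with the high
// SHOULD_BE_ON_FREELIST bit (0x80000000), so the fetch_add above returning 0
// means no outstanding references remained and the node can be linked in.
static_assert((0x80000000u & 0x7FFFFFFFu) == 0,
              "flag bit lies outside the refcount mask");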
+ add_knowing_refcount_is_zero(node); + } + } + + inline N *try_get() { #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugLock lock(mutex); -#endif - auto head = freeListHead.load(std::memory_order_acquire); - while (head != nullptr) { - auto prevHead = head; - auto refs = head->freeListRefs.load(std::memory_order_relaxed); - if ((refs & REFS_MASK) == 0 || !head->freeListRefs.compare_exchange_strong(refs, refs + 1, std::memory_order_acquire)) { - head = freeListHead.load(std::memory_order_acquire); - continue; - } - - // Good, reference count has been incremented (it wasn't at zero), which means we can read the - // next and not worry about it changing between now and the time we do the CAS - auto next = head->freeListNext.load(std::memory_order_relaxed); - if (freeListHead.compare_exchange_strong(head, next, std::memory_order_acquire, std::memory_order_relaxed)) { - // Yay, got the node. This means it was on the list, which means shouldBeOnFreeList must be false no - // matter the refcount (because nobody else knows it's been taken off yet, it can't have been put back on). - assert((head->freeListRefs.load(std::memory_order_relaxed) & SHOULD_BE_ON_FREELIST) == 0); - - // Decrease refcount twice, once for our ref, and once for the list's ref - head->freeListRefs.fetch_sub(2, std::memory_order_release); - return head; - } - - // OK, the head must have changed on us, but we still need to decrease the refcount we increased. - // Note that we don't need to release any memory effects, but we do need to ensure that the reference - // count decrement happens-after the CAS on the head. - refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); - if (refs == SHOULD_BE_ON_FREELIST + 1) { - add_knowing_refcount_is_zero(prevHead); - } - } - - return nullptr; - } - - // Useful for traversing the list when there's no contention (e.g. to destroy remaining nodes) - N* head_unsafe() const { return freeListHead.load(std::memory_order_relaxed); } - - private: - inline void add_knowing_refcount_is_zero(N* node) - { - // Since the refcount is zero, and nobody can increase it once it's zero (except us, and we run - // only one copy of this method per node at a time, i.e. the single thread case), then we know - // we can safely change the next pointer of the node; however, once the refcount is back above - // zero, then other threads could increase it (happens under heavy contention, when the refcount - // goes to zero in between a load and a refcount increment of a node in try_get, then back up to - // something non-zero, then the refcount increment is done by the other thread) -- so, if the CAS - // to add the node to the actual list fails, decrease the refcount and leave the add operation to - // the next thread who puts the refcount back at zero (which could be us, hence the loop). 
- auto head = freeListHead.load(std::memory_order_relaxed); - while (true) { - node->freeListNext.store(head, std::memory_order_relaxed); - node->freeListRefs.store(1, std::memory_order_release); - if (!freeListHead.compare_exchange_strong(head, node, std::memory_order_release, std::memory_order_relaxed)) { - // Hmm, the add failed, but we can only try again when the refcount goes back to zero - if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, std::memory_order_acq_rel) == 1) { - continue; - } - } - return; - } - } - - private: - // Implemented like a stack, but where node order doesn't matter (nodes are inserted out of order under contention) - std::atomic freeListHead; - - static const std::uint32_t REFS_MASK = 0x7FFFFFFF; - static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; - + debug::DebugLock lock(mutex); +#endif + auto head = freeListHead.load(std::memory_order_acquire); + while (head != nullptr) { + auto prevHead = head; + auto refs = head->freeListRefs.load(std::memory_order_relaxed); + if ((refs & REFS_MASK) == 0 || + !head->freeListRefs.compare_exchange_strong( + refs, refs + 1, std::memory_order_acquire)) { + head = freeListHead.load(std::memory_order_acquire); + continue; + } + + // Good, reference count has been incremented (it wasn't at zero), which + // means we can read the next and not worry about it changing between + // now and the time we do the CAS + auto next = head->freeListNext.load(std::memory_order_relaxed); + if (freeListHead.compare_exchange_strong(head, next, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // Yay, got the node. This means it was on the list, which means + // shouldBeOnFreeList must be false no matter the refcount (because + // nobody else knows it's been taken off yet, it can't have been put + // back on). + assert((head->freeListRefs.load(std::memory_order_relaxed) & + SHOULD_BE_ON_FREELIST) == 0); + + // Decrease refcount twice, once for our ref, and once for the list's + // ref + head->freeListRefs.fetch_sub(2, std::memory_order_release); + return head; + } + + // OK, the head must have changed on us, but we still need to decrease + // the refcount we increased. Note that we don't need to release any + // memory effects, but we do need to ensure that the reference count + // decrement happens-after the CAS on the head. + refs = prevHead->freeListRefs.fetch_sub(1, std::memory_order_acq_rel); + if (refs == SHOULD_BE_ON_FREELIST + 1) { + add_knowing_refcount_is_zero(prevHead); + } + } + + return nullptr; + } + + // Useful for traversing the list when there's no contention (e.g. to + // destroy remaining nodes) + N *head_unsafe() const { + return freeListHead.load(std::memory_order_relaxed); + } + + private: + inline void add_knowing_refcount_is_zero(N *node) { + // Since the refcount is zero, and nobody can increase it once it's zero + // (except us, and we run only one copy of this method per node at a time, + // i.e. 
the single thread case), then we know we can safely change the + // next pointer of the node; however, once the refcount is back above + // zero, then other threads could increase it (happens under heavy + // contention, when the refcount goes to zero in between a load and a + // refcount increment of a node in try_get, then back up to something + // non-zero, then the refcount increment is done by the other thread) -- + // so, if the CAS to add the node to the actual list fails, decrease the + // refcount and leave the add operation to the next thread who puts the + // refcount back at zero (which could be us, hence the loop). + auto head = freeListHead.load(std::memory_order_relaxed); + while (true) { + node->freeListNext.store(head, std::memory_order_relaxed); + node->freeListRefs.store(1, std::memory_order_release); + if (!freeListHead.compare_exchange_strong(head, node, + std::memory_order_release, + std::memory_order_relaxed)) { + // Hmm, the add failed, but we can only try again when the refcount + // goes back to zero + if (node->freeListRefs.fetch_add(SHOULD_BE_ON_FREELIST - 1, + std::memory_order_acq_rel) == 1) { + continue; + } + } + return; + } + } + + private: + // Implemented like a stack, but where node order doesn't matter (nodes are + // inserted out of order under contention) + std::atomic freeListHead; + + static const std::uint32_t REFS_MASK = 0x7FFFFFFF; + static const std::uint32_t SHOULD_BE_ON_FREELIST = 0x80000000; + #ifdef MCDBGQ_NOLOCKFREE_FREELIST - debug::DebugMutex mutex; -#endif - }; - - - /////////////////////////// - // Block - /////////////////////////// - - enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; - - struct Block - { - Block() - : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), freeListNext(nullptr), dynamicallyAllocated(true) - { -#ifdef MCDBGQ_TRACKMEM - owner = nullptr; -#endif - } - - template - inline bool is_empty() const - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Check flags - for (size_t i = 0; i < BLOCK_SIZE; ++i) { - if (!emptyFlags[i].load(std::memory_order_relaxed)) { - return false; - } - } - - // Aha, empty; make sure we have all other memory effects that happened before the empty flags were set - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - else { - // Check counter - if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == BLOCK_SIZE) { - std::atomic_thread_fence(std::memory_order_acquire); - return true; - } - assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= BLOCK_SIZE); - return false; - } - } - - // Returns true if the block is now empty (does not apply in explicit context) - template - inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flag - assert(!emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].load(std::memory_order_relaxed)); - emptyFlags[BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1))].store(true, std::memory_order_release); - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); - assert(prevVal < BLOCK_SIZE); - return prevVal == BLOCK_SIZE - 1; - } - } - - // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping and count > 0). 
- // Returns true if the block is now empty (does not apply in explicit context). - template - inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, size_t count) - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set flags - std::atomic_thread_fence(std::memory_order_release); - i = BLOCK_SIZE - 1 - static_cast(i & static_cast(BLOCK_SIZE - 1)) - count + 1; - for (size_t j = 0; j != count; ++j) { - assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); - emptyFlags[i + j].store(true, std::memory_order_relaxed); - } - return false; - } - else { - // Increment counter - auto prevVal = elementsCompletelyDequeued.fetch_add(count, std::memory_order_acq_rel); - assert(prevVal + count <= BLOCK_SIZE); - return prevVal + count == BLOCK_SIZE; - } - } - - template - inline void set_all_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Set all flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(true, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); - } - } - - template - inline void reset_empty() - { - MOODYCAMEL_CONSTEXPR_IF (context == explicit_context && BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { - // Reset flags - for (size_t i = 0; i != BLOCK_SIZE; ++i) { - emptyFlags[i].store(false, std::memory_order_relaxed); - } - } - else { - // Reset counter - elementsCompletelyDequeued.store(0, std::memory_order_relaxed); - } - } - - inline T* operator[](index_t idx) MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - inline T const* operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { return static_cast(static_cast(elements)) + static_cast(idx & static_cast(BLOCK_SIZE - 1)); } - - private: - static_assert(std::alignment_of::value <= sizeof(T), "The queue does not support types with an alignment greater than their size at this time"); - MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; - public: - Block* next; - std::atomic elementsCompletelyDequeued; - std::atomic emptyFlags[BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; - public: - std::atomic freeListRefs; - std::atomic freeListNext; - bool dynamicallyAllocated; // Perhaps a better name for this would be 'isNotPartOfInitialBlockPool' - + debug::DebugMutex mutex; +#endif + }; + + /////////////////////////// + // Block + /////////////////////////// + + enum InnerQueueContext { implicit_context = 0, explicit_context = 1 }; + + struct Block { + Block() + : next(nullptr), elementsCompletelyDequeued(0), freeListRefs(0), + freeListNext(nullptr), dynamicallyAllocated(true) { #ifdef MCDBGQ_TRACKMEM - void* owner; + owner = nullptr; #endif - }; - static_assert(std::alignment_of::value >= std::alignment_of::value, "Internal error: Blocks must be at least as aligned as the type they are wrapping"); + } + + template inline bool is_empty() const { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Check flags + for (size_t i = 0; i < BLOCK_SIZE; ++i) { + if (!emptyFlags[i].load(std::memory_order_relaxed)) { + return false; + } + } + + // Aha, empty; make sure we have all other memory effects that happened + // before the empty flags were set + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + else { + // Check counter + if (elementsCompletelyDequeued.load(std::memory_order_relaxed) == + BLOCK_SIZE) { + std::atomic_thread_fence(std::memory_order_acquire); + return true; + } + assert(elementsCompletelyDequeued.load(std::memory_order_relaxed) <= + BLOCK_SIZE); + return false; + } + } + + // Returns true if the block is now empty (does not apply in explicit + // context) + template + inline bool set_empty(MOODYCAMEL_MAYBE_UNUSED index_t i) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flag + assert(!emptyFlags[BLOCK_SIZE - 1 - + static_cast( + i & static_cast(BLOCK_SIZE - 1))] + .load(std::memory_order_relaxed)); + emptyFlags[BLOCK_SIZE - 1 - + static_cast(i & + static_cast(BLOCK_SIZE - 1))] + .store(true, std::memory_order_release); + return false; + } + else { + // Increment counter + auto prevVal = + elementsCompletelyDequeued.fetch_add(1, std::memory_order_acq_rel); + assert(prevVal < BLOCK_SIZE); + return prevVal == BLOCK_SIZE - 1; + } + } + + // Sets multiple contiguous item statuses to 'empty' (assumes no wrapping + // and count > 0). Returns true if the block is now empty (does not apply in + // explicit context). 
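The flag-based branch above (and set_many_empty just below) indexes elements with idx & (BLOCK_SIZE - 1) and stores the matching empty flags in reverse order. A tiny worked example, assuming the usual default of BLOCK_SIZE = 32 (an assumption; the traits are defined outside this hunk):

static_assert((37 & (32 - 1)) == 5,
              "index 37 lands in element slot 5 of a 32-entry block");
static_assert((32 - 1 - (37 & (32 - 1))) == 26,
              "and its empty flag lives in slot 26 (reversed order)");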
+ template + inline bool set_many_empty(MOODYCAMEL_MAYBE_UNUSED index_t i, + size_t count) { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set flags + std::atomic_thread_fence(std::memory_order_release); + i = BLOCK_SIZE - 1 - + static_cast(i & static_cast(BLOCK_SIZE - 1)) - + count + 1; + for (size_t j = 0; j != count; ++j) { + assert(!emptyFlags[i + j].load(std::memory_order_relaxed)); + emptyFlags[i + j].store(true, std::memory_order_relaxed); + } + return false; + } + else { + // Increment counter + auto prevVal = elementsCompletelyDequeued.fetch_add( + count, std::memory_order_acq_rel); + assert(prevVal + count <= BLOCK_SIZE); + return prevVal + count == BLOCK_SIZE; + } + } + + template inline void set_all_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Set all flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(true, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(BLOCK_SIZE, std::memory_order_relaxed); + } + } + + template inline void reset_empty() { + MOODYCAMEL_CONSTEXPR_IF(context == explicit_context && + BLOCK_SIZE <= + EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD) { + // Reset flags + for (size_t i = 0; i != BLOCK_SIZE; ++i) { + emptyFlags[i].store(false, std::memory_order_relaxed); + } + } + else { + // Reset counter + elementsCompletelyDequeued.store(0, std::memory_order_relaxed); + } + } + inline T *operator[](index_t idx) MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + inline T const *operator[](index_t idx) const MOODYCAMEL_NOEXCEPT { + return static_cast(static_cast(elements)) + + static_cast(idx & static_cast(BLOCK_SIZE - 1)); + } + + private: + static_assert(std::alignment_of::value <= sizeof(T), + "The queue does not support types with an alignment greater " + "than their size at this time"); + MOODYCAMEL_ALIGNED_TYPE_LIKE(char[sizeof(T) * BLOCK_SIZE], T) elements; + + public: + Block *next; + std::atomic elementsCompletelyDequeued; + std::atomic emptyFlags + [BLOCK_SIZE <= EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD ? 
BLOCK_SIZE : 1]; + + public: + std::atomic freeListRefs; + std::atomic freeListNext; + bool dynamicallyAllocated; // Perhaps a better name for this would be + // 'isNotPartOfInitialBlockPool' + +#ifdef MCDBGQ_TRACKMEM + void *owner; +#endif + }; + static_assert(std::alignment_of::value >= std::alignment_of::value, + "Internal error: Blocks must be at least as aligned as the " + "type they are wrapping"); #ifdef MCDBGQ_TRACKMEM public: - struct MemStats; + struct MemStats; + private: #endif - - /////////////////////////// - // Producer base - /////////////////////////// - - struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase - { - ProducerBase(ConcurrentQueue* parent_, bool isExplicit_) : - tailIndex(0), - headIndex(0), - dequeueOptimisticCount(0), - dequeueOvercommit(0), - tailBlock(nullptr), - isExplicit(isExplicit_), - parent(parent_) - { - } - - virtual ~ProducerBase() { } - - template - inline bool dequeue(U& element) - { - if (isExplicit) { - return static_cast(this)->dequeue(element); - } - else { - return static_cast(this)->dequeue(element); - } - } - - template - inline size_t dequeue_bulk(It& itemFirst, size_t max) - { - if (isExplicit) { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - else { - return static_cast(this)->dequeue_bulk(itemFirst, max); - } - } - - inline ProducerBase* next_prod() const { return static_cast(next); } - - inline size_t size_approx() const - { - auto tail = tailIndex.load(std::memory_order_relaxed); - auto head = headIndex.load(std::memory_order_relaxed); - return details::circular_less_than(head, tail) ? static_cast(tail - head) : 0; - } - - inline index_t getTail() const { return tailIndex.load(std::memory_order_relaxed); } - protected: - std::atomic tailIndex; // Where to enqueue to next - std::atomic headIndex; // Where to dequeue from next - - std::atomic dequeueOptimisticCount; - std::atomic dequeueOvercommit; - - Block* tailBlock; - - public: - bool isExplicit; - ConcurrentQueue* parent; - - protected: + + /////////////////////////// + // Producer base + /////////////////////////// + + struct ProducerBase : public details::ConcurrentQueueProducerTypelessBase { + ProducerBase(ConcurrentQueue *parent_, bool isExplicit_) + : tailIndex(0), headIndex(0), dequeueOptimisticCount(0), + dequeueOvercommit(0), tailBlock(nullptr), isExplicit(isExplicit_), + parent(parent_) {} + + virtual ~ProducerBase() {} + + template inline bool dequeue(U &element) { + if (isExplicit) { + return static_cast(this)->dequeue(element); + } else { + return static_cast(this)->dequeue(element); + } + } + + template + inline size_t dequeue_bulk(It &itemFirst, size_t max) { + if (isExplicit) { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } else { + return static_cast(this)->dequeue_bulk(itemFirst, + max); + } + } + + inline ProducerBase *next_prod() const { + return static_cast(next); + } + + inline size_t size_approx() const { + auto tail = tailIndex.load(std::memory_order_relaxed); + auto head = headIndex.load(std::memory_order_relaxed); + return details::circular_less_than(head, tail) + ? 
static_cast(tail - head) + : 0; + } + + inline index_t getTail() const { + return tailIndex.load(std::memory_order_relaxed); + } + + protected: + std::atomic tailIndex; // Where to enqueue to next + std::atomic headIndex; // Where to dequeue from next + + std::atomic dequeueOptimisticCount; + std::atomic dequeueOvercommit; + + Block *tailBlock; + + public: + bool isExplicit; + ConcurrentQueue *parent; + + protected: #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - /////////////////////////// - // Explicit queue - /////////////////////////// - - struct ExplicitProducer : public ProducerBase - { - explicit ExplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, true), - blockIndex(nullptr), - pr_blockIndexSlotsUsed(0), - pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), - pr_blockIndexFront(0), - pr_blockIndexEntries(nullptr), - pr_blockIndexRaw(nullptr) - { - size_t poolBasedIndexSize = details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; - if (poolBasedIndexSize > pr_blockIndexSize) { - pr_blockIndexSize = poolBasedIndexSize; - } - - new_block_index(0); // This creates an index with double the number of current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE - } - - ~ExplicitProducer() - { - // Destruct any elements not yet dequeued. - // Since we're in the destructor, we can assume all elements - // are either completely dequeued or completely not (no halfways). - if (this->tailBlock != nullptr) { // Note this means there must be a block index too - // First find the block that's partially dequeued, if any - Block* halfDequeuedBlock = nullptr; - if ((this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) != 0) { - // The head's not on a block boundary, meaning a block somewhere is partially dequeued - // (or the head block is the tail block and was fully dequeued, but the head/tail are still not on a boundary) - size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & (pr_blockIndexSize - 1); - while (details::circular_less_than(pr_blockIndexEntries[i].base + BLOCK_SIZE, this->headIndex.load(std::memory_order_relaxed))) { - i = (i + 1) & (pr_blockIndexSize - 1); - } - assert(details::circular_less_than(pr_blockIndexEntries[i].base, this->headIndex.load(std::memory_order_relaxed))); - halfDequeuedBlock = pr_blockIndexEntries[i].block; - } - - // Start at the head block (note the first line in the loop gives us the head from the tail on the first iteration) - auto block = this->tailBlock; - do { - block = block->next; - if (block->ConcurrentQueue::Block::template is_empty()) { - continue; - } - - size_t i = 0; // Offset into block - if (block == halfDequeuedBlock) { - i = static_cast(this->headIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - } - - // Walk through all the items in the block; if this is the tail block, we need to stop when we reach the tail index - auto lastValidIndex = (this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)) == 0 ? 
BLOCK_SIZE : static_cast(this->tailIndex.load(std::memory_order_relaxed) & static_cast(BLOCK_SIZE - 1)); - while (i != BLOCK_SIZE && (block != this->tailBlock || i != lastValidIndex)) { - (*block)[i++]->~T(); - } - } while (block != this->tailBlock); - } - - // Destroy all blocks that we own - if (this->tailBlock != nullptr) { - auto block = this->tailBlock; - do { - auto nextBlock = block->next; - this->parent->add_block_to_free_list(block); - block = nextBlock; - } while (block != this->tailBlock); - } - - // Destroy the block indices - auto header = static_cast(pr_blockIndexRaw); - while (header != nullptr) { - auto prev = static_cast(header->prev); - header->~BlockIndexHeader(); - (Traits::free)(header); - header = prev; - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto startBlock = this->tailBlock; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - if (this->tailBlock != nullptr && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - // We can re-use the block ahead of us, it's empty! - this->tailBlock = this->tailBlock->next; - this->tailBlock->ConcurrentQueue::Block::template reset_empty(); - - // We'll put the block on the block index (guaranteed to be room since we're conceptually removing the - // last block from it first -- except instead of removing then adding, we can just overwrite). - // Note that there must be a valid block index here, since even if allocation failed in the ctor, - // it would have been re-attempted when adding the first block to the queue; since there is such - // a block, a block index must have been successfully allocated. - } - else { - // Whatever head value we see here is >= the last value we saw here (relatively), - // and <= its current value. Since we have the most recent tail, the head must be - // <= to it. - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) - || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - // We can't enqueue in another block because there's not enough leeway -- the - // tail could surpass the head by the time the block fills up! (Or we'll exceed - // the size limit, if the second part of the condition was true.) - return false; - } - // We're going to need a new block; check that the block index has room - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize) { - // Hmm, the circular block index is already full -- we'll need - // to allocate a new index. Note pr_blockIndexRaw can only be nullptr if - // the initial allocation failed in the constructor. 
- - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index(pr_blockIndexSlotsUsed)) { - return false; - } - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - return false; - } + friend struct MemStats; +#endif + }; + + /////////////////////////// + // Explicit queue + /////////////////////////// + + struct ExplicitProducer : public ProducerBase { + explicit ExplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, true), blockIndex(nullptr), + pr_blockIndexSlotsUsed(0), + pr_blockIndexSize(EXPLICIT_INITIAL_INDEX_SIZE >> 1), + pr_blockIndexFront(0), pr_blockIndexEntries(nullptr), + pr_blockIndexRaw(nullptr) { + size_t poolBasedIndexSize = + details::ceil_to_pow_2(parent_->initialBlockPoolSize) >> 1; + if (poolBasedIndexSize > pr_blockIndexSize) { + pr_blockIndexSize = poolBasedIndexSize; + } + + new_block_index(0); // This creates an index with double the number of + // current entries, i.e. EXPLICIT_INITIAL_INDEX_SIZE + } + + ~ExplicitProducer() { + // Destruct any elements not yet dequeued. + // Since we're in the destructor, we can assume all elements + // are either completely dequeued or completely not (no halfways). + if (this->tailBlock != + nullptr) { // Note this means there must be a block index too + // First find the block that's partially dequeued, if any + Block *halfDequeuedBlock = nullptr; + if ((this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) != 0) { + // The head's not on a block boundary, meaning a block somewhere is + // partially dequeued (or the head block is the tail block and was + // fully dequeued, but the head/tail are still not on a boundary) + size_t i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & + (pr_blockIndexSize - 1); + while (details::circular_less_than( + pr_blockIndexEntries[i].base + BLOCK_SIZE, + this->headIndex.load(std::memory_order_relaxed))) { + i = (i + 1) & (pr_blockIndexSize - 1); + } + assert(details::circular_less_than( + pr_blockIndexEntries[i].base, + this->headIndex.load(std::memory_order_relaxed))); + halfDequeuedBlock = pr_blockIndexEntries[i].block; + } + + // Start at the head block (note the first line in the loop gives us the + // head from the tail on the first iteration) + auto block = this->tailBlock; + do { + block = block->next; + if (block->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + continue; + } + + size_t i = 0; // Offset into block + if (block == halfDequeuedBlock) { + i = static_cast( + this->headIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + } + + // Walk through all the items in the block; if this is the tail block, + // we need to stop when we reach the tail index + auto lastValidIndex = + (this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)) == 0 + ? 
BLOCK_SIZE + : static_cast( + this->tailIndex.load(std::memory_order_relaxed) & + static_cast(BLOCK_SIZE - 1)); + while (i != BLOCK_SIZE && + (block != this->tailBlock || i != lastValidIndex)) { + (*block)[i++]->~T(); + } + } while (block != this->tailBlock); + } + + // Destroy all blocks that we own + if (this->tailBlock != nullptr) { + auto block = this->tailBlock; + do { + auto nextBlock = block->next; + this->parent->add_block_to_free_list(block); + block = nextBlock; + } while (block != this->tailBlock); + } + + // Destroy the block indices + auto header = static_cast(pr_blockIndexRaw); + while (header != nullptr) { + auto prev = static_cast(header->prev); + header->~BlockIndexHeader(); + (Traits::free)(header); + header = prev; + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto startBlock = this->tailBlock; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + if (this->tailBlock != nullptr && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + // We can re-use the block ahead of us, it's empty! + this->tailBlock = this->tailBlock->next; + this->tailBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + + // We'll put the block on the block index (guaranteed to be room since + // we're conceptually removing the last block from it first -- except + // instead of removing then adding, we can just overwrite). Note that + // there must be a valid block index here, since even if allocation + // failed in the ctor, it would have been re-attempted when adding the + // first block to the queue; since there is such a block, a block + // index must have been successfully allocated. + } else { + // Whatever head value we see here is >= the last value we saw here + // (relatively), and <= its current value. Since we have the most + // recent tail, the head must be + // <= to it. + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + // We can't enqueue in another block because there's not enough + // leeway -- the tail could surpass the head by the time the block + // fills up! (Or we'll exceed the size limit, if the second part of + // the condition was true.) + return false; + } + // We're going to need a new block; check that the block index has + // room + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize) { + // Hmm, the circular block index is already full -- we'll need + // to allocate a new index. Note pr_blockIndexRaw can only be + // nullptr if the initial allocation failed in the constructor. 
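// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// comment above describes growing a full circular block index by allocating
// a larger one. A minimal standalone version of that idea, with hypothetical
// names, assuming the capacity is kept a power of two so wrap-around can use
// `& (capacity - 1)` instead of a modulo.
#include <cstddef>
#include <utility>
#include <vector>

struct ToyCircularIndex {
  std::size_t capacity = 4; // always a power of two
  std::size_t front = 0;    // next slot to write (like pr_blockIndexFront)
  std::size_t used = 0;     // live entries (like pr_blockIndexSlotsUsed)
  std::vector<int> entries; // stand-in for the real BlockIndexEntry records

  ToyCircularIndex() : entries(capacity) {}

  void push(int value) {
    if (used == capacity)
      grow(); // index is full -- double it before inserting
    entries[front] = value;
    front = (front + 1) & (capacity - 1);
    ++used;
  }

  void grow() {
    std::vector<int> bigger(capacity * 2);
    // The oldest live entry sits `used` slots behind `front`, modulo capacity.
    std::size_t i = (front - used) & (capacity - 1);
    for (std::size_t j = 0; j < used; ++j) {
      bigger[j] = entries[i];
      i = (i + 1) & (capacity - 1);
    }
    entries = std::move(bigger);
    capacity *= 2;
    front = used; // keep writing right after the copied entries
  }
};
// --------------------------------------------------------------------------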
+ + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; } + else if (!new_block_index(pr_blockIndexSlotsUsed)) { + return false; + } + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - ++pr_blockIndexSlotsUsed; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // The constructor may throw. We want the element not to appear in the queue in - // that case (without corrupting the queue): - MOODYCAMEL_TRY { - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - // Revert change to the current block, but leave the new block available - // for next time - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? this->tailBlock : startBlock; - MOODYCAMEL_RETHROW; - } - } - else { - (void)startBlock; - (void)originalBlockIndexSlotsUsed; - } - - // Add block to block index - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - blockIndex.load(std::memory_order_relaxed)->front.store(pr_blockIndexFront, std::memory_order_release); - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - // Might be something to dequeue, let's give it a try - - // Note that this if is purely for performance purposes in the common case when the queue is - // empty and the values are eventually consistent -- we may enter here spuriously. - - // Note that whatever the values of overcommit and tail are, they are not going to change (unless we - // change them) and must be the same value at this point (inside the if) as when the if condition was - // evaluated. - - // We insert an acquire fence here to synchronize-with the release upon incrementing dequeueOvercommit below. - // This ensures that whatever the value we got loaded into overcommit, the load of dequeueOptisticCount in - // the fetch_add below will result in a value at least as recent as that (and therefore at least as large). - // Note that I believe a compiler (signal) fence here would be sufficient due to the nature of fetch_add (all - // read-modify-write operations are guaranteed to work on the latest value in the modification order), but - // unfortunately that can't be shown to be correct using only the C++11 standard. 
- // See http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case - std::atomic_thread_fence(std::memory_order_acquire); - - // Increment optimistic counter, then check if it went over the boundary - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - - // Note that since dequeueOvercommit must be <= dequeueOptimisticCount (because dequeueOvercommit is only ever - // incremented after dequeueOptimisticCount -- this is enforced in the `else` block below), and since we now - // have a version of dequeueOptimisticCount that is at least as recent as overcommit (due to the release upon - // incrementing dequeueOvercommit and the acquire above that synchronizes with it), overcommit <= myDequeueCount. - // However, we can't assert this since both dequeueOptimisticCount and dequeueOvercommit may (independently) - // overflow; in such a case, though, the logic still holds since the difference between the two is maintained. - - // Note that we reload tail here in case it changed; it will be the same value as before or greater, since - // this load is sequenced after (happens after) the earlier load above. This is supported by read-read - // coherency (as defined in the standard), explained here: http://en.cppreference.com/w/cpp/atomic/memory_order - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - // Guaranteed to be at least one element to dequeue! - - // Get the index. Note that since there's guaranteed to be at least one element, this - // will never exceed tail. We need to do an acquire-release fence here since it's possible - // that whatever condition got us to this point was for an earlier enqueued element (that - // we already see the memory effects for), but that by the time we increment somebody else - // has incremented it, and we need to see the memory effects for *that* element, which is - // in such a case is necessarily visible on the thread that incremented it in the first - // place with the more current condition (they must have acquired a tail that is at least - // as recent). - auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - - // Determine which block the element is in - - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - // We need to be careful here about subtracting and dividing because of index wrap-around. 
- // When an index wraps, we need to preserve the sign of the offset when dividing it by the - // block size (in order to get a correct signed block count offset in all cases): - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(blockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto block = localBlockIndex->entries[(localBlockIndexHead + offset) & (localBlockIndex->size - 1)].block; - - // Dequeue - auto& el = *((*block)[index]); - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { - // Make sure the element is still fully dequeued and destroyed even if the assignment - // throws - struct Guard { - Block* block; - index_t index; - - ~Guard() - { - (*block)[index]->~T(); - block->ConcurrentQueue::Block::template set_empty(index); - } - } guard = { block, index }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - block->ConcurrentQueue::Block::template set_empty(index); - } - - return true; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); // Release so that the fetch_add on dequeueOptimisticCount is guaranteed to happen before this write - } - } - - return false; - } - - template - bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - auto originalBlockIndexFront = pr_blockIndexFront; - auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; - - Block* firstAllocatedBlock = nullptr; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { - // Allocate as many blocks as possible from ahead - while (blockBaseDiff > 0 && this->tailBlock != nullptr && this->tailBlock->next != firstAllocatedBlock && this->tailBlock->next->ConcurrentQueue::Block::template is_empty()) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - this->tailBlock = this->tailBlock->next; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? 
this->tailBlock : firstAllocatedBlock; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Now allocate as many blocks as necessary from the block pool - while (blockBaseDiff > 0) { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - if (pr_blockIndexRaw == nullptr || pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { - // Failed to allocate, undo changes (but keep injected blocks) - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - - // pr_blockIndexFront is updated inside new_block_index, so we need to - // update our fallback value too (since we keep the new index even if we - // later fail) - originalBlockIndexFront = originalBlockIndexSlotsUsed; - } - - // Insert a new block in the circular linked list - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? firstAllocatedBlock : startBlock; - return false; - } - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + ++pr_blockIndexSlotsUsed; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + // The constructor may throw. We want the element not to appear in the + // queue in that case (without corrupting the queue): + MOODYCAMEL_TRY { + new ((*this->tailBlock)[currentTailIndex]) + T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) { + // Revert change to the current block, but leave the new block + // available for next time + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
this->tailBlock : startBlock; + MOODYCAMEL_RETHROW; + } + } + else { + (void)startBlock; + (void)originalBlockIndexSlotsUsed; + } + + // Add block to block index + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + blockIndex.load(std::memory_order_relaxed) + ->front.store(pr_blockIndexFront, std::memory_order_release); + pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template bool dequeue(U &element) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + // Might be something to dequeue, let's give it a try + + // Note that this if is purely for performance purposes in the common + // case when the queue is empty and the values are eventually consistent + // -- we may enter here spuriously. + + // Note that whatever the values of overcommit and tail are, they are + // not going to change (unless we change them) and must be the same + // value at this point (inside the if) as when the if condition was + // evaluated. + + // We insert an acquire fence here to synchronize-with the release upon + // incrementing dequeueOvercommit below. This ensures that whatever the + // value we got loaded into overcommit, the load of dequeueOptisticCount + // in the fetch_add below will result in a value at least as recent as + // that (and therefore at least as large). Note that I believe a + // compiler (signal) fence here would be sufficient due to the nature of + // fetch_add (all read-modify-write operations are guaranteed to work on + // the latest value in the modification order), but unfortunately that + // can't be shown to be correct using only the C++11 standard. See + // http://stackoverflow.com/questions/18223161/what-are-the-c11-memory-ordering-guarantees-in-this-corner-case + std::atomic_thread_fence(std::memory_order_acquire); + + // Increment optimistic counter, then check if it went over the boundary + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + + // Note that since dequeueOvercommit must be <= dequeueOptimisticCount + // (because dequeueOvercommit is only ever incremented after + // dequeueOptimisticCount -- this is enforced in the `else` block + // below), and since we now have a version of dequeueOptimisticCount + // that is at least as recent as overcommit (due to the release upon + // incrementing dequeueOvercommit and the acquire above that + // synchronizes with it), overcommit <= myDequeueCount. However, we + // can't assert this since both dequeueOptimisticCount and + // dequeueOvercommit may (independently) overflow; in such a case, + // though, the logic still holds since the difference between the two is + // maintained. 
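// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// optimistic-count / overcommit protocol described in the comments above,
// reduced to a standalone gate. Names are hypothetical, and the wrap-around
// aware circular comparison is simplified to plain unsigned compares, so
// this only demonstrates the fence-and-counter choreography.
#include <atomic>
#include <cstdint>

struct ToyDequeueGate {
  std::atomic<std::uint64_t> tail{0};            // published by the producer
  std::atomic<std::uint64_t> head{0};            // slots actually consumed
  std::atomic<std::uint64_t> optimisticCount{0}; // tentative dequeue attempts
  std::atomic<std::uint64_t> overcommit{0};      // attempts that found nothing

  // Returns true and writes the claimed slot index if an element was there.
  bool try_claim(std::uint64_t &slot) {
    std::uint64_t t = tail.load(std::memory_order_relaxed);
    std::uint64_t over = overcommit.load(std::memory_order_relaxed);
    if (optimisticCount.load(std::memory_order_relaxed) - over >= t)
      return false; // looks empty -- cheap early out, may be stale

    // Pair with the release on `overcommit` below so the optimisticCount we
    // bump next is at least as recent as the `over` value we just read.
    std::atomic_thread_fence(std::memory_order_acquire);
    std::uint64_t my = optimisticCount.fetch_add(1, std::memory_order_relaxed);
    t = tail.load(std::memory_order_acquire); // reload: tail only grows
    if (my - over < t) {
      // Guaranteed at least one element: claim the next head slot.
      slot = head.fetch_add(1, std::memory_order_acq_rel);
      return true;
    }
    // Nothing there after all; make the effective count consistent again.
    overcommit.fetch_add(1, std::memory_order_release);
    return false;
  }
};
// --------------------------------------------------------------------------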
+ + // Note that we reload tail here in case it changed; it will be the same + // value as before or greater, since this load is sequenced after + // (happens after) the earlier load above. This is supported by + // read-read coherency (as defined in the standard), explained here: + // http://en.cppreference.com/w/cpp/atomic/memory_order + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + // Guaranteed to be at least one element to dequeue! + + // Get the index. Note that since there's guaranteed to be at least + // one element, this will never exceed tail. We need to do an + // acquire-release fence here since it's possible that whatever + // condition got us to this point was for an earlier enqueued element + // (that we already see the memory effects for), but that by the time + // we increment somebody else has incremented it, and we need to see + // the memory effects for *that* element, which is in such a case is + // necessarily visible on the thread that incremented it in the first + // place with the more current condition (they must have acquired a + // tail that is at least as recent). + auto index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + // We need to be careful here about subtracting and dividing because + // of index wrap-around. When an index wraps, we need to preserve the + // sign of the offset when dividing it by the block size (in order to + // get a correct signed block count offset in all cases): + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto blockBaseIndex = index & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + blockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto block = localBlockIndex + ->entries[(localBlockIndexHead + offset) & + (localBlockIndex->size - 1)] + .block; + + // Dequeue + auto &el = *((*block)[index]); + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { + // Make sure the element is still fully dequeued and destroyed even + // if the assignment throws + struct Guard { + Block *block; + index_t index; + + ~Guard() { + (*block)[index]->~T(); + block->ConcurrentQueue::Block::template set_empty< + explicit_context>(index); + } + } guard = {block, index}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + block->ConcurrentQueue::Block::template set_empty( + index); + } + + return true; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add( + 1, std::memory_order_release); // Release so that the fetch_add on + // dequeueOptimisticCount is + // guaranteed to happen before this + // write + } + } + + return false; + } + + template + bool MOODYCAMEL_NO_TSAN enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). 
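// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// pre-allocation step below sizes its work by the difference between block
// base indices before and after the `count` new items. A small worked
// version of that arithmetic (hypothetical name; the block size is assumed
// to be a power of two, as BLOCK_SIZE is).
#include <cstdint>

constexpr std::uint64_t blocks_needed(std::uint64_t startTail,
                                      std::uint64_t count,
                                      std::uint64_t blockSize) {
  // Block base just past the last new item, minus the block base of the slot
  // before the first new item, expressed in whole blocks.
  return (((startTail + count - 1) & ~(blockSize - 1)) -
          ((startTail - 1) & ~(blockSize - 1))) /
         blockSize;
}

// With 32-slot blocks: appending 5 items starting at index 30 crosses one
// block boundary; appending 40 items starting exactly at a boundary (index
// 32, whose block is not allocated yet) needs two fresh blocks.
static_assert(blocks_needed(30, 5, 32) == 1, "spills into one new block");
static_assert(blocks_needed(32, 40, 32) == 2, "fills one block, starts another");
// --------------------------------------------------------------------------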
+ index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + auto originalBlockIndexFront = pr_blockIndexFront; + auto originalBlockIndexSlotsUsed = pr_blockIndexSlotsUsed; + + Block *firstAllocatedBlock = nullptr; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { + // Allocate as many blocks as possible from ahead + while (blockBaseDiff > 0 && this->tailBlock != nullptr && + this->tailBlock->next != firstAllocatedBlock && + this->tailBlock->next->ConcurrentQueue::Block::template is_empty< + explicit_context>()) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + this->tailBlock = this->tailBlock->next; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Now allocate as many blocks as necessary from the block pool + while (blockBaseDiff > 0) { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + if (pr_blockIndexRaw == nullptr || + pr_blockIndexSlotsUsed == pr_blockIndexSize || full) { + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + else if (full || !new_block_index(originalBlockIndexSlotsUsed)) { + // Failed to allocate, undo changes (but keep injected blocks) + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + return false; + } + + // pr_blockIndexFront is updated inside new_block_index, so we need + // to update our fallback value too (since we keep the new index + // even if we later fail) + originalBlockIndexFront = originalBlockIndexSlotsUsed; + } + + // Insert a new block in the circular linked list + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? 
firstAllocatedBlock : startBlock; + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template set_all_empty(); - if (this->tailBlock == nullptr) { - newBlock->next = newBlock; - } - else { - newBlock->next = this->tailBlock->next; - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? this->tailBlock : firstAllocatedBlock; - - ++pr_blockIndexSlotsUsed; - - auto& entry = blockIndex.load(std::memory_order_relaxed)->entries[pr_blockIndexFront]; - entry.base = currentTailIndex; - entry.block = this->tailBlock; - pr_blockIndexFront = (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); - } - - // Excellent, all allocations succeeded. Reset each block's emptiness before we fill them up, and - // publish the new block index front - auto block = firstAllocatedBlock; - while (true) { - block->ConcurrentQueue::Block::template reset_empty(); - if (block == this->tailBlock) { - break; - } - block = block->next; - } - - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - auto endBlock = this->tailBlock; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - // Must use copy constructor even if move constructor is available - // because we may have to revert if there's an exception. - // Sorry about the horrible templated next line, but it was the only way - // to disable moving *at compile time*, which is important because a type - // may only define a (noexcept) move constructor, and so calls to the - // cctor will not compile, even if they are in an if branch that will never - // be executed - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) { - // Oh dear, an exception's been thrown -- destroy the elements that - // were enqueued so far and revert the entire bulk operation (we'll keep - // any allocated blocks in our linked list for later, though). - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - pr_blockIndexFront = originalBlockIndexFront; - pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; - this->tailBlock = startBlock == nullptr ? 
firstAllocatedBlock : startBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - if (firstAllocatedBlock != nullptr) - blockIndex.load(std::memory_order_relaxed)->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), std::memory_order_release); - } - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Determine which block the first element is in - auto localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto localBlockIndexHead = localBlockIndex->front.load(std::memory_order_acquire); - - auto headBase = localBlockIndex->entries[localBlockIndexHead].base; - auto firstBlockBaseIndex = firstIndex & ~static_cast(BLOCK_SIZE - 1); - auto offset = static_cast(static_cast::type>(firstBlockBaseIndex - headBase) / static_cast::type>(BLOCK_SIZE)); - auto indexIndex = (localBlockIndexHead + offset) & (localBlockIndex->size - 1); - - // Iterate the blocks and dequeue - auto index = firstIndex; - do { - auto firstIndexInBlock = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - auto block = localBlockIndex->entries[indexIndex].block; - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - // It's too late to revert the dequeue, but we can make sure that all - // the dequeued objects are properly destroyed and the block index - // (and empty count) are properly updated before we propagate the exception - do { - block = localBlockIndex->entries[indexIndex].block; - while (index != endIndex) { - (*block)[index++]->~T(); - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - - firstIndexInBlock = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - block->ConcurrentQueue::Block::template set_many_empty(firstIndexInBlock, static_cast(endIndex - firstIndexInBlock)); - indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - // Wasn't anything to dequeue after all; make the effective dequeue count eventually consistent - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - struct BlockIndexEntry - { - index_t base; - Block* block; - }; - - struct BlockIndexHeader - { - size_t size; - std::atomic front; // Current slot (not next, like pr_blockIndexFront) - BlockIndexEntry* entries; - void* prev; - }; - - - bool new_block_index(size_t numberOfFilledSlotsToExpose) - { - auto prevBlockSizeMask = pr_blockIndexSize - 1; - - // Create the new block - pr_blockIndexSize <<= 1; - auto newRawPtr = static_cast((Traits::malloc)(sizeof(BlockIndexHeader) + std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); - if (newRawPtr == nullptr) { - pr_blockIndexSize >>= 1; // Reset to allow graceful retry - return false; - } - - auto newBlockIndexEntries = reinterpret_cast(details::align_for(newRawPtr + sizeof(BlockIndexHeader))); - - // Copy in all the old indices, if any - size_t j = 0; - if (pr_blockIndexSlotsUsed != 0) { - auto i = (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; - do { - newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; - i = (i + 1) & prevBlockSizeMask; - } while (i != pr_blockIndexFront); - } - - // Update everything - auto header = new (newRawPtr) BlockIndexHeader; - header->size = pr_blockIndexSize; - header->front.store(numberOfFilledSlotsToExpose - 1, std::memory_order_relaxed); - header->entries = newBlockIndexEntries; - header->prev = pr_blockIndexRaw; // we link the new block to the old one so we can free it later - - pr_blockIndexFront = j; - pr_blockIndexEntries = newBlockIndexEntries; - pr_blockIndexRaw = newRawPtr; - blockIndex.store(header, std::memory_order_release); - - return true; - } - - private: - std::atomic blockIndex; - - // To be used by producer 
only -- consumer must use the ones in referenced by blockIndex - size_t pr_blockIndexSlotsUsed; - size_t pr_blockIndexSize; - size_t pr_blockIndexFront; // Next slot (not current) - BlockIndexEntry* pr_blockIndexEntries; - void* pr_blockIndexRaw; - + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template set_all_empty< + explicit_context>(); + if (this->tailBlock == nullptr) { + newBlock->next = newBlock; + } else { + newBlock->next = this->tailBlock->next; + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + firstAllocatedBlock = firstAllocatedBlock == nullptr + ? this->tailBlock + : firstAllocatedBlock; + + ++pr_blockIndexSlotsUsed; + + auto &entry = blockIndex.load(std::memory_order_relaxed) + ->entries[pr_blockIndexFront]; + entry.base = currentTailIndex; + entry.block = this->tailBlock; + pr_blockIndexFront = + (pr_blockIndexFront + 1) & (pr_blockIndexSize - 1); + } + + // Excellent, all allocations succeeded. Reset each block's emptiness + // before we fill them up, and publish the new block index front + auto block = firstAllocatedBlock; + while (true) { + block->ConcurrentQueue::Block::template reset_empty< + explicit_context>(); + if (block == this->tailBlock) { + break; + } + block = block->next; + } + + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + auto endBlock = this->tailBlock; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + // Must use copy constructor even if move constructor is available + // because we may have to revert if there's an exception. + // Sorry about the horrible templated next line, but it was the + // only way to disable moving *at compile time*, which is + // important because a type may only define a (noexcept) move + // constructor, and so calls to the cctor will not compile, even + // if they are in an if branch that will never be executed + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) { + // Oh dear, an exception's been thrown -- destroy the elements that + // were enqueued so far and revert the entire bulk operation (we'll + // keep any allocated blocks in our linked list for later, though). 
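// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// rollback below follows the usual "destroy the already-constructed prefix"
// pattern for bulk placement-new. A minimal standalone version, assuming
// exceptions are enabled (the real code goes through the MOODYCAMEL_TRY /
// MOODYCAMEL_CATCH macros so it also builds with exceptions disabled).
#include <cstddef>
#include <new>

template <typename T>
void construct_n_or_rollback(void *raw, const T *src, std::size_t n) {
  T *dst = static_cast<T *>(raw);
  std::size_t built = 0;
  try {
    for (; built < n; ++built)
      new (dst + built) T(src[built]); // any of these may throw
  } catch (...) {
    while (built != 0)
      dst[--built].~T(); // unwind the constructed prefix, newest first
    throw;               // then let the caller see the original exception
  }
}
// --------------------------------------------------------------------------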
+ auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + pr_blockIndexFront = originalBlockIndexFront; + pr_blockIndexSlotsUsed = originalBlockIndexSlotsUsed; + this->tailBlock = + startBlock == nullptr ? firstAllocatedBlock : startBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + if (firstAllocatedBlock != nullptr) + blockIndex.load(std::memory_order_relaxed) + ->front.store((pr_blockIndexFront - 1) & (pr_blockIndexSize - 1), + std::memory_order_release); + } + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. 
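// --------------------------------------------------------------------------
// Illustrative sketch, not part of concurrentqueue.h or of this patch: the
// bulk variant of the optimistic-claim gate sketched earlier. It asks for up
// to `max` slots in one bump, keeps however many actually exist, and hands
// the unused part of the bump back through `overcommit`. Names are
// hypothetical and circular comparisons are again simplified, here to
// signed casts.
#include <atomic>
#include <cstddef>
#include <cstdint>

inline std::size_t try_claim_bulk(std::atomic<std::uint64_t> &tail,
                                  std::atomic<std::uint64_t> &head,
                                  std::atomic<std::uint64_t> &optimisticCount,
                                  std::atomic<std::uint64_t> &overcommit,
                                  std::size_t max, std::uint64_t &firstSlot) {
  std::uint64_t over = overcommit.load(std::memory_order_relaxed);
  std::uint64_t t = tail.load(std::memory_order_relaxed);
  std::uint64_t desired =
      t - (optimisticCount.load(std::memory_order_relaxed) - over);
  if (static_cast<std::int64_t>(desired) <= 0)
    return 0; // looks empty
  if (desired > max)
    desired = max; // never claim more than the caller asked for
  std::atomic_thread_fence(std::memory_order_acquire);

  std::uint64_t my =
      optimisticCount.fetch_add(desired, std::memory_order_relaxed);
  t = tail.load(std::memory_order_acquire);
  std::uint64_t actual = t - (my - over);
  if (static_cast<std::int64_t>(actual) <= 0) {
    // Nothing left after all: return the whole bump.
    overcommit.fetch_add(desired, std::memory_order_release);
    return 0;
  }
  if (actual > desired)
    actual = desired;
  if (actual < desired) // give back only the unused part of the bump
    overcommit.fetch_add(desired - actual, std::memory_order_release);
  firstSlot = head.fetch_add(actual, std::memory_order_acq_rel);
  return static_cast<std::size_t>(actual);
}
// --------------------------------------------------------------------------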
+ auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Determine which block the first element is in + auto localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto localBlockIndexHead = + localBlockIndex->front.load(std::memory_order_acquire); + + auto headBase = localBlockIndex->entries[localBlockIndexHead].base; + auto firstBlockBaseIndex = + firstIndex & ~static_cast(BLOCK_SIZE - 1); + auto offset = static_cast( + static_cast::type>( + firstBlockBaseIndex - headBase) / + static_cast::type>( + BLOCK_SIZE)); + auto indexIndex = + (localBlockIndexHead + offset) & (localBlockIndex->size - 1); + + // Iterate the blocks and dequeue + auto index = firstIndex; + do { + auto firstIndexInBlock = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + auto block = localBlockIndex->entries[indexIndex].block; + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) { + // It's too late to revert the dequeue, but we can make sure + // that all the dequeued objects are properly destroyed and the + // block index (and empty count) are properly updated before we + // propagate the exception + do { + block = localBlockIndex->entries[indexIndex].block; + while (index != endIndex) { + (*block)[index++]->~T(); + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + + firstIndexInBlock = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? 
firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + block->ConcurrentQueue::Block::template set_many_empty< + explicit_context>( + firstIndexInBlock, + static_cast(endIndex - firstIndexInBlock)); + indexIndex = (indexIndex + 1) & (localBlockIndex->size - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + // Wasn't anything to dequeue after all; make the effective dequeue + // count eventually consistent + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + struct BlockIndexEntry { + index_t base; + Block *block; + }; + + struct BlockIndexHeader { + size_t size; + std::atomic + front; // Current slot (not next, like pr_blockIndexFront) + BlockIndexEntry *entries; + void *prev; + }; + + bool new_block_index(size_t numberOfFilledSlotsToExpose) { + auto prevBlockSizeMask = pr_blockIndexSize - 1; + + // Create the new block + pr_blockIndexSize <<= 1; + auto newRawPtr = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * pr_blockIndexSize)); + if (newRawPtr == nullptr) { + pr_blockIndexSize >>= 1; // Reset to allow graceful retry + return false; + } + + auto newBlockIndexEntries = reinterpret_cast( + details::align_for(newRawPtr + + sizeof(BlockIndexHeader))); + + // Copy in all the old indices, if any + size_t j = 0; + if (pr_blockIndexSlotsUsed != 0) { + auto i = + (pr_blockIndexFront - pr_blockIndexSlotsUsed) & prevBlockSizeMask; + do { + newBlockIndexEntries[j++] = pr_blockIndexEntries[i]; + i = (i + 1) & prevBlockSizeMask; + } while (i != pr_blockIndexFront); + } + + // Update everything + auto header = new (newRawPtr) BlockIndexHeader; + header->size = pr_blockIndexSize; + header->front.store(numberOfFilledSlotsToExpose - 1, + std::memory_order_relaxed); + header->entries = newBlockIndexEntries; + header->prev = pr_blockIndexRaw; // we link the new block to the old one + // so we can free it later + + pr_blockIndexFront = j; + pr_blockIndexEntries = newBlockIndexEntries; + pr_blockIndexRaw = newRawPtr; + blockIndex.store(header, std::memory_order_release); + + return true; + } + + private: + std::atomic blockIndex; + + // To be used by producer only -- consumer must use the ones in referenced + // by blockIndex + size_t pr_blockIndexSlotsUsed; + size_t pr_blockIndexSize; + size_t pr_blockIndexFront; // Next slot (not current) + BlockIndexEntry *pr_blockIndexEntries; + void *pr_blockIndexRaw; + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ExplicitProducer* nextExplicitProducer; - private: + public: + ExplicitProducer *nextExplicitProducer; + + private: #endif - + #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Implicit queue - ////////////////////////////////// - - struct ImplicitProducer : public ProducerBase - { - ImplicitProducer(ConcurrentQueue* parent_) : - ProducerBase(parent_, false), - nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), - blockIndex(nullptr) - { - new_block_index(); - } - - ~ImplicitProducer() - { - // Note that since we're in the destructor we can assume that all enqueue/dequeue operations - // completed already; this means that all undequeued elements are placed contiguously across - // contiguous blocks, and that only the first and last remaining blocks can be only partially - // empty (all other remaining blocks must be completely full). 
- + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Implicit queue + ////////////////////////////////// + + struct ImplicitProducer : public ProducerBase { + ImplicitProducer(ConcurrentQueue *parent_) + : ProducerBase(parent_, false), + nextBlockIndexCapacity(IMPLICIT_INITIAL_INDEX_SIZE), + blockIndex(nullptr) { + new_block_index(); + } + + ~ImplicitProducer() { + // Note that since we're in the destructor we can assume that all + // enqueue/dequeue operations completed already; this means that all + // undequeued elements are placed contiguously across contiguous blocks, + // and that only the first and last remaining blocks can be only partially + // empty (all other remaining blocks must be completely full). + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - // Unregister ourselves for thread termination notification - if (!this->inactive.load(std::memory_order_relaxed)) { - details::ThreadExitNotifier::unsubscribe(&threadExitListener); - } -#endif - - // Destroy all remaining elements! - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto index = this->headIndex.load(std::memory_order_relaxed); - Block* block = nullptr; - assert(index == tail || details::circular_less_than(index, tail)); - bool forceFreeLastBlock = index != tail; // If we enter the loop, then the last (tail) block will not be freed - while (index != tail) { - if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || block == nullptr) { - if (block != nullptr) { - // Free the old block - this->parent->add_block_to_free_list(block); - } - - block = get_block_index_entry_for_index(index)->value.load(std::memory_order_relaxed); - } - - ((*block)[index])->~T(); - ++index; - } - // Even if the queue is empty, there's still one block that's not on the free list - // (unless the head index reached the end of it, in which case the tail will be poised - // to create a new block). - if (this->tailBlock != nullptr && (forceFreeLastBlock || (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { - this->parent->add_block_to_free_list(this->tailBlock); - } - - // Destroy block index - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - if (localBlockIndex != nullptr) { - for (size_t i = 0; i != localBlockIndex->capacity; ++i) { - localBlockIndex->index[i]->~BlockIndexEntry(); - } - do { - auto prev = localBlockIndex->prev; - localBlockIndex->~BlockIndexHeader(); - (Traits::free)(localBlockIndex); - localBlockIndex = prev; - } while (localBlockIndex != nullptr); - } - } - - template - inline bool enqueue(U&& element) - { - index_t currentTailIndex = this->tailIndex.load(std::memory_order_relaxed); - index_t newTailIndex = 1 + currentTailIndex; - if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - // We reached the end of a block, start a new one - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - if (!details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { - return false; - } + // Unregister ourselves for thread termination notification + if (!this->inactive.load(std::memory_order_relaxed)) { + details::ThreadExitNotifier::unsubscribe(&threadExitListener); + } +#endif + + // Destroy all remaining elements! 
+ auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto index = this->headIndex.load(std::memory_order_relaxed); + Block *block = nullptr; + assert(index == tail || details::circular_less_than(index, tail)); + bool forceFreeLastBlock = + index != tail; // If we enter the loop, then the last (tail) block + // will not be freed + while (index != tail) { + if ((index & static_cast(BLOCK_SIZE - 1)) == 0 || + block == nullptr) { + if (block != nullptr) { + // Free the old block + this->parent->add_block_to_free_list(block); + } + + block = get_block_index_entry_for_index(index)->value.load( + std::memory_order_relaxed); + } + + ((*block)[index])->~T(); + ++index; + } + // Even if the queue is empty, there's still one block that's not on the + // free list (unless the head index reached the end of it, in which case + // the tail will be poised to create a new block). + if (this->tailBlock != nullptr && + (forceFreeLastBlock || + (tail & static_cast(BLOCK_SIZE - 1)) != 0)) { + this->parent->add_block_to_free_list(this->tailBlock); + } + + // Destroy block index + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + if (localBlockIndex != nullptr) { + for (size_t i = 0; i != localBlockIndex->capacity; ++i) { + localBlockIndex->index[i]->~BlockIndexEntry(); + } + do { + auto prev = localBlockIndex->prev; + localBlockIndex->~BlockIndexHeader(); + (Traits::free)(localBlockIndex); + localBlockIndex = prev; + } while (localBlockIndex != nullptr); + } + } + + template + inline bool enqueue(U &&element) { + index_t currentTailIndex = + this->tailIndex.load(std::memory_order_relaxed); + index_t newTailIndex = 1 + currentTailIndex; + if ((currentTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { + // We reached the end of a block, start a new one + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + if (!details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head))) { + return false; + } #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry; - if (!insert_block_index_entry(idxEntry, currentTailIndex)) { - return false; - } - - // Get ahold of a new block - auto newBlock = this->parent->ConcurrentQueue::template requisition_block(); - if (newBlock == nullptr) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - return false; - } + debug::DebugLock lock(mutex); +#endif + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry; + if (!insert_block_index_entry(idxEntry, currentTailIndex)) { + return false; + } + + // Get ahold of a new block + auto newBlock = + this->parent + ->ConcurrentQueue::template requisition_block(); + if (newBlock == nullptr) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + return false; + } #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - // May throw, try to insert now before we publish the fact that we have this new block - MOODYCAMEL_TRY { - new ((*newBlock)[currentTailIndex]) 
T(std::forward(element)); - } - MOODYCAMEL_CATCH (...) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(newBlock); - MOODYCAMEL_RETHROW; - } - } - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - this->tailBlock = newBlock; - - MOODYCAMEL_CONSTEXPR_IF (!MOODYCAMEL_NOEXCEPT_CTOR(T, U, new (static_cast(nullptr)) T(std::forward(element)))) { - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - } - - // Enqueue - new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); - - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } - - template - bool dequeue(U& element) - { - // See ExplicitProducer::dequeue for rationale and explanation - index_t tail = this->tailIndex.load(std::memory_order_relaxed); - index_t overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - if (details::circular_less_than(this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit, tail)) { - std::atomic_thread_fence(std::memory_order_acquire); - - index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add(1, std::memory_order_relaxed); - tail = this->tailIndex.load(std::memory_order_acquire); - if ((details::likely)(details::circular_less_than(myDequeueCount - overcommit, tail))) { - index_t index = this->headIndex.fetch_add(1, std::memory_order_acq_rel); - - // Determine which block the element is in - auto entry = get_block_index_entry_for_index(index); - - // Dequeue - auto block = entry->value.load(std::memory_order_relaxed); - auto& el = *((*block)[index]); - - if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, element = std::move(el))) { + newBlock->owner = this; +#endif + newBlock + ->ConcurrentQueue::Block::template reset_empty(); + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + // May throw, try to insert now before we publish the fact that we + // have this new block + MOODYCAMEL_TRY { + new ((*newBlock)[currentTailIndex]) T(std::forward(element)); + } + MOODYCAMEL_CATCH(...) 
{ + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(newBlock); + MOODYCAMEL_RETHROW; + } + } + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + this->tailBlock = newBlock; + + MOODYCAMEL_CONSTEXPR_IF(!MOODYCAMEL_NOEXCEPT_CTOR( + T, U, + new (static_cast(nullptr)) T(std::forward(element)))) { + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + } + + // Enqueue + new ((*this->tailBlock)[currentTailIndex]) T(std::forward(element)); + + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } + + template bool dequeue(U &element) { + // See ExplicitProducer::dequeue for rationale and explanation + index_t tail = this->tailIndex.load(std::memory_order_relaxed); + index_t overcommit = + this->dequeueOvercommit.load(std::memory_order_relaxed); + if (details::circular_less_than( + this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit, + tail)) { + std::atomic_thread_fence(std::memory_order_acquire); + + index_t myDequeueCount = this->dequeueOptimisticCount.fetch_add( + 1, std::memory_order_relaxed); + tail = this->tailIndex.load(std::memory_order_acquire); + if ((details::likely)(details::circular_less_than( + myDequeueCount - overcommit, tail))) { + index_t index = + this->headIndex.fetch_add(1, std::memory_order_acq_rel); + + // Determine which block the element is in + auto entry = get_block_index_entry_for_index(index); + + // Dequeue + auto block = entry->value.load(std::memory_order_relaxed); + auto &el = *((*block)[index]); + + if (!MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, element = std::move(el))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - // Note: Acquiring the mutex with every dequeue instead of only when a block - // is released is very sub-optimal, but it is, after all, purely debug code. - debug::DebugLock lock(producer->mutex); -#endif - struct Guard { - Block* block; - index_t index; - BlockIndexEntry* entry; - ConcurrentQueue* parent; - - ~Guard() - { - (*block)[index]->~T(); - if (block->ConcurrentQueue::Block::template set_empty(index)) { - entry->value.store(nullptr, std::memory_order_relaxed); - parent->add_block_to_free_list(block); - } - } - } guard = { block, index, entry, this->parent }; - - element = std::move(el); // NOLINT - } - else { - element = std::move(el); // NOLINT - el.~T(); // NOLINT - - if (block->ConcurrentQueue::Block::template set_empty(index)) { - { + // Note: Acquiring the mutex with every dequeue instead of only when + // a block is released is very sub-optimal, but it is, after all, + // purely debug code. 
+ debug::DebugLock lock(producer->mutex); +#endif + struct Guard { + Block *block; + index_t index; + BlockIndexEntry *entry; + ConcurrentQueue *parent; + + ~Guard() { + (*block)[index]->~T(); + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + entry->value.store(nullptr, std::memory_order_relaxed); + parent->add_block_to_free_list(block); + } + } + } guard = {block, index, entry, this->parent}; + + element = std::move(el); // NOLINT + } else { + element = std::move(el); // NOLINT + el.~T(); // NOLINT + + if (block->ConcurrentQueue::Block::template set_empty< + implicit_context>(index)) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Add the block back into the global free pool (and remove from block index) - entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - } - - return true; - } - else { - this->dequeueOvercommit.fetch_add(1, std::memory_order_release); - } - } - - return false; - } - + debug::DebugLock lock(mutex); +#endif + // Add the block back into the global free pool (and remove from + // block index) + entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + } + + return true; + } else { + this->dequeueOvercommit.fetch_add(1, std::memory_order_release); + } + } + + return false; + } + #ifdef _MSC_VER #pragma warning(push) -#pragma warning(disable: 4706) // assignment within conditional expression -#endif - template - bool enqueue_bulk(It itemFirst, size_t count) - { - // First, we need to make sure we have enough room to enqueue all of the elements; - // this means pre-allocating blocks and putting them in the block index (but only if - // all the allocations succeeded). - - // Note that the tailBlock we start off with may not be owned by us any more; - // this happens if it was filled up exactly to the top (setting tailIndex to - // the first index of the next block which is not yet allocated), then dequeued - // completely (putting it on the free list) before we enqueue again. - - index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); - auto startBlock = this->tailBlock; - Block* firstAllocatedBlock = nullptr; - auto endBlock = this->tailBlock; - - // Figure out how many blocks we'll need to allocate, and do so - size_t blockBaseDiff = ((startTailIndex + count - 1) & ~static_cast(BLOCK_SIZE - 1)) - ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); - index_t currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - if (blockBaseDiff > 0) { +#pragma warning(disable : 4706) // assignment within conditional expression +#endif + template + bool enqueue_bulk(It itemFirst, size_t count) { + // First, we need to make sure we have enough room to enqueue all of the + // elements; this means pre-allocating blocks and putting them in the + // block index (but only if all the allocations succeeded). + + // Note that the tailBlock we start off with may not be owned by us any + // more; this happens if it was filled up exactly to the top (setting + // tailIndex to the first index of the next block which is not yet + // allocated), then dequeued completely (putting it on the free list) + // before we enqueue again. 
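
// The Guard object in the dequeue path above exists purely for exception
// safety: if the element's move-assignment throws, the slot must still be
// destroyed and the block bookkeeping must still run. The same idiom in
// isolation; ScopeGuard and the cleanup lambda are illustrative stand-ins,
// and `slot` is assumed to live in manually managed storage (as block slots
// do), so destroying it by hand is correct here.
#include <utility>

template <typename F>
struct ScopeGuard {
  F cleanup;
  ~ScopeGuard() { cleanup(); }   // runs on normal exit and during unwinding
};
template <typename F> ScopeGuard(F) -> ScopeGuard<F>;

template <typename T>
void move_out(T &dst, T &slot) {
  ScopeGuard guard{[&] {
    slot.~T();                   // always destroy the consumed slot
    // ...the real Guard also marks the slot empty and, if the whole block
    // just became empty, hands it back to the free list...
  }};
  dst = std::move(slot);         // may throw; the guard above still fires
}
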
+ + index_t startTailIndex = this->tailIndex.load(std::memory_order_relaxed); + auto startBlock = this->tailBlock; + Block *firstAllocatedBlock = nullptr; + auto endBlock = this->tailBlock; + + // Figure out how many blocks we'll need to allocate, and do so + size_t blockBaseDiff = + ((startTailIndex + count - 1) & + ~static_cast(BLOCK_SIZE - 1)) - + ((startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1)); + index_t currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + if (blockBaseDiff > 0) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - do { - blockBaseDiff -= static_cast(BLOCK_SIZE); - currentTailIndex += static_cast(BLOCK_SIZE); - - // Find out where we'll be inserting this block in the block index - BlockIndexEntry* idxEntry = nullptr; // initialization here unnecessary but compiler can't always tell - Block* newBlock; - bool indexInserted = false; - auto head = this->headIndex.load(std::memory_order_relaxed); - assert(!details::circular_less_than(currentTailIndex, head)); - bool full = !details::circular_less_than(head, currentTailIndex + BLOCK_SIZE) || (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && (MAX_SUBQUEUE_SIZE == 0 || MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); - - if (full || !(indexInserted = insert_block_index_entry(idxEntry, currentTailIndex)) || (newBlock = this->parent->ConcurrentQueue::template requisition_block()) == nullptr) { - // Index allocation or block allocation failed; revert any other allocations - // and index insertions done so far for this operation - if (indexInserted) { - rewind_block_index_tail(); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - } - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - - return false; - } - + debug::DebugLock lock(mutex); +#endif + do { + blockBaseDiff -= static_cast(BLOCK_SIZE); + currentTailIndex += static_cast(BLOCK_SIZE); + + // Find out where we'll be inserting this block in the block index + BlockIndexEntry *idxEntry = + nullptr; // initialization here unnecessary but compiler can't + // always tell + Block *newBlock; + bool indexInserted = false; + auto head = this->headIndex.load(std::memory_order_relaxed); + assert(!details::circular_less_than(currentTailIndex, head)); + bool full = + !details::circular_less_than(head, currentTailIndex + + BLOCK_SIZE) || + (MAX_SUBQUEUE_SIZE != details::const_numeric_max::value && + (MAX_SUBQUEUE_SIZE == 0 || + MAX_SUBQUEUE_SIZE - BLOCK_SIZE < currentTailIndex - head)); + + if (full || + !(indexInserted = insert_block_index_entry( + idxEntry, currentTailIndex)) || + (newBlock = + this->parent->ConcurrentQueue::template requisition_block< + allocMode>()) == nullptr) { + // Index allocation or block allocation failed; revert any other + // allocations and index insertions done so far for this operation + if (indexInserted) { + rewind_block_index_tail(); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + } + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + 
currentTailIndex += static_cast(BLOCK_SIZE); + idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + + return false; + } + #ifdef MCDBGQ_TRACKMEM - newBlock->owner = this; -#endif - newBlock->ConcurrentQueue::Block::template reset_empty(); - newBlock->next = nullptr; - - // Insert the new block into the index - idxEntry->value.store(newBlock, std::memory_order_relaxed); - - // Store the chain of blocks so that we can undo if later allocations fail, - // and so that we can find the blocks when we do the actual enqueueing - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr) { - assert(this->tailBlock != nullptr); - this->tailBlock->next = newBlock; - } - this->tailBlock = newBlock; - endBlock = newBlock; - firstAllocatedBlock = firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; - } while (blockBaseDiff > 0); - } - - // Enqueue, one block at a time - index_t newTailIndex = startTailIndex + static_cast(count); - currentTailIndex = startTailIndex; - this->tailBlock = startBlock; - assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || firstAllocatedBlock != nullptr || count == 0); - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && firstAllocatedBlock != nullptr) { - this->tailBlock = firstAllocatedBlock; - } - while (true) { - index_t stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(newTailIndex, stopIndex)) { - stopIndex = newTailIndex; - } - MOODYCAMEL_CONSTEXPR_IF (MOODYCAMEL_NOEXCEPT_CTOR(T, decltype(*itemFirst), new (static_cast(nullptr)) T(details::deref_noexcept(itemFirst)))) { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); - } - } - else { - MOODYCAMEL_TRY { - while (currentTailIndex != stopIndex) { - new ((*this->tailBlock)[currentTailIndex]) T(details::nomove_if(nullptr)) T(details::deref_noexcept(itemFirst)))>::eval(*itemFirst)); - ++currentTailIndex; - ++itemFirst; - } - } - MOODYCAMEL_CATCH (...) 
{ - auto constructedStopIndex = currentTailIndex; - auto lastBlockEnqueued = this->tailBlock; - - if (!details::is_trivially_destructible::value) { - auto block = startBlock; - if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0) { - block = firstAllocatedBlock; - } - currentTailIndex = startTailIndex; - while (true) { - stopIndex = (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - if (details::circular_less_than(constructedStopIndex, stopIndex)) { - stopIndex = constructedStopIndex; - } - while (currentTailIndex != stopIndex) { - (*block)[currentTailIndex++]->~T(); - } - if (block == lastBlockEnqueued) { - break; - } - block = block->next; - } - } - - currentTailIndex = (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); - for (auto block = firstAllocatedBlock; block != nullptr; block = block->next) { - currentTailIndex += static_cast(BLOCK_SIZE); - auto idxEntry = get_block_index_entry_for_index(currentTailIndex); - idxEntry->value.store(nullptr, std::memory_order_relaxed); - rewind_block_index_tail(); - } - this->parent->add_blocks_to_free_list(firstAllocatedBlock); - this->tailBlock = startBlock; - MOODYCAMEL_RETHROW; - } - } - - if (this->tailBlock == endBlock) { - assert(currentTailIndex == newTailIndex); - break; - } - this->tailBlock = this->tailBlock->next; - } - this->tailIndex.store(newTailIndex, std::memory_order_release); - return true; - } + newBlock->owner = this; +#endif + newBlock->ConcurrentQueue::Block::template reset_empty< + implicit_context>(); + newBlock->next = nullptr; + + // Insert the new block into the index + idxEntry->value.store(newBlock, std::memory_order_relaxed); + + // Store the chain of blocks so that we can undo if later allocations + // fail, and so that we can find the blocks when we do the actual + // enqueueing + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr) { + assert(this->tailBlock != nullptr); + this->tailBlock->next = newBlock; + } + this->tailBlock = newBlock; + endBlock = newBlock; + firstAllocatedBlock = + firstAllocatedBlock == nullptr ? newBlock : firstAllocatedBlock; + } while (blockBaseDiff > 0); + } + + // Enqueue, one block at a time + index_t newTailIndex = startTailIndex + static_cast(count); + currentTailIndex = startTailIndex; + this->tailBlock = startBlock; + assert((startTailIndex & static_cast(BLOCK_SIZE - 1)) != 0 || + firstAllocatedBlock != nullptr || count == 0); + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == 0 && + firstAllocatedBlock != nullptr) { + this->tailBlock = firstAllocatedBlock; + } + while (true) { + index_t stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(newTailIndex, stopIndex)) { + stopIndex = newTailIndex; + } + MOODYCAMEL_CONSTEXPR_IF(MOODYCAMEL_NOEXCEPT_CTOR( + T, decltype(*itemFirst), + new (static_cast(nullptr)) + T(details::deref_noexcept(itemFirst)))) { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex++]) T(*itemFirst++); + } + } + else { + MOODYCAMEL_TRY { + while (currentTailIndex != stopIndex) { + new ((*this->tailBlock)[currentTailIndex]) T( + details::nomove_if(nullptr)) T(details::deref_noexcept( + itemFirst)))>::eval(*itemFirst)); + ++currentTailIndex; + ++itemFirst; + } + } + MOODYCAMEL_CATCH(...) 
{ + auto constructedStopIndex = currentTailIndex; + auto lastBlockEnqueued = this->tailBlock; + + if (!details::is_trivially_destructible::value) { + auto block = startBlock; + if ((startTailIndex & static_cast(BLOCK_SIZE - 1)) == + 0) { + block = firstAllocatedBlock; + } + currentTailIndex = startTailIndex; + while (true) { + stopIndex = + (currentTailIndex & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + if (details::circular_less_than(constructedStopIndex, + stopIndex)) { + stopIndex = constructedStopIndex; + } + while (currentTailIndex != stopIndex) { + (*block)[currentTailIndex++]->~T(); + } + if (block == lastBlockEnqueued) { + break; + } + block = block->next; + } + } + + currentTailIndex = + (startTailIndex - 1) & ~static_cast(BLOCK_SIZE - 1); + for (auto block = firstAllocatedBlock; block != nullptr; + block = block->next) { + currentTailIndex += static_cast(BLOCK_SIZE); + auto idxEntry = get_block_index_entry_for_index(currentTailIndex); + idxEntry->value.store(nullptr, std::memory_order_relaxed); + rewind_block_index_tail(); + } + this->parent->add_blocks_to_free_list(firstAllocatedBlock); + this->tailBlock = startBlock; + MOODYCAMEL_RETHROW; + } + } + + if (this->tailBlock == endBlock) { + assert(currentTailIndex == newTailIndex); + break; + } + this->tailBlock = this->tailBlock->next; + } + this->tailIndex.store(newTailIndex, std::memory_order_release); + return true; + } #ifdef _MSC_VER #pragma warning(pop) #endif - - template - size_t dequeue_bulk(It& itemFirst, size_t max) - { - auto tail = this->tailIndex.load(std::memory_order_relaxed); - auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); - auto desiredCount = static_cast(tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - overcommit)); - if (details::circular_less_than(0, desiredCount)) { - desiredCount = desiredCount < max ? desiredCount : max; - std::atomic_thread_fence(std::memory_order_acquire); - - auto myDequeueCount = this->dequeueOptimisticCount.fetch_add(desiredCount, std::memory_order_relaxed); - - tail = this->tailIndex.load(std::memory_order_acquire); - auto actualCount = static_cast(tail - (myDequeueCount - overcommit)); - if (details::circular_less_than(0, actualCount)) { - actualCount = desiredCount < actualCount ? desiredCount : actualCount; - if (actualCount < desiredCount) { - this->dequeueOvercommit.fetch_add(desiredCount - actualCount, std::memory_order_release); - } - - // Get the first index. Note that since there's guaranteed to be at least actualCount elements, this - // will never exceed tail. - auto firstIndex = this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); - - // Iterate the blocks and dequeue - auto index = firstIndex; - BlockIndexHeader* localBlockIndex; - auto indexIndex = get_block_index_index_for_index(index, localBlockIndex); - do { - auto blockStartIndex = index; - index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? 
firstIndex + static_cast(actualCount) : endIndex; - - auto entry = localBlockIndex->index[indexIndex]; - auto block = entry->value.load(std::memory_order_relaxed); - if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T&&, details::deref_noexcept(itemFirst) = std::move((*(*block)[index])))) { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst++ = std::move(el); - el.~T(); - ++index; - } - } - else { - MOODYCAMEL_TRY { - while (index != endIndex) { - auto& el = *((*block)[index]); - *itemFirst = std::move(el); - ++itemFirst; - el.~T(); - ++index; - } - } - MOODYCAMEL_CATCH (...) { - do { - entry = localBlockIndex->index[indexIndex]; - block = entry->value.load(std::memory_order_relaxed); - while (index != endIndex) { - (*block)[index++]->~T(); - } - - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { + + template size_t dequeue_bulk(It &itemFirst, size_t max) { + auto tail = this->tailIndex.load(std::memory_order_relaxed); + auto overcommit = this->dequeueOvercommit.load(std::memory_order_relaxed); + auto desiredCount = static_cast( + tail - (this->dequeueOptimisticCount.load(std::memory_order_relaxed) - + overcommit)); + if (details::circular_less_than(0, desiredCount)) { + desiredCount = desiredCount < max ? desiredCount : max; + std::atomic_thread_fence(std::memory_order_acquire); + + auto myDequeueCount = this->dequeueOptimisticCount.fetch_add( + desiredCount, std::memory_order_relaxed); + + tail = this->tailIndex.load(std::memory_order_acquire); + auto actualCount = + static_cast(tail - (myDequeueCount - overcommit)); + if (details::circular_less_than(0, actualCount)) { + actualCount = desiredCount < actualCount ? desiredCount : actualCount; + if (actualCount < desiredCount) { + this->dequeueOvercommit.fetch_add(desiredCount - actualCount, + std::memory_order_release); + } + + // Get the first index. Note that since there's guaranteed to be at + // least actualCount elements, this will never exceed tail. + auto firstIndex = + this->headIndex.fetch_add(actualCount, std::memory_order_acq_rel); + + // Iterate the blocks and dequeue + auto index = firstIndex; + BlockIndexHeader *localBlockIndex; + auto indexIndex = + get_block_index_index_for_index(index, localBlockIndex); + do { + auto blockStartIndex = index; + index_t endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + + auto entry = localBlockIndex->index[indexIndex]; + auto block = entry->value.load(std::memory_order_relaxed); + if (MOODYCAMEL_NOEXCEPT_ASSIGN(T, T &&, + details::deref_noexcept(itemFirst) = + std::move((*(*block)[index])))) { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst++ = std::move(el); + el.~T(); + ++index; + } + } else { + MOODYCAMEL_TRY { + while (index != endIndex) { + auto &el = *((*block)[index]); + *itemFirst = std::move(el); + ++itemFirst; + el.~T(); + ++index; + } + } + MOODYCAMEL_CATCH(...) 
{ + do { + entry = localBlockIndex->index[indexIndex]; + block = entry->value.load(std::memory_order_relaxed); + while (index != endIndex) { + (*block)[index++]->~T(); + } + + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - entry->value.store(nullptr, std::memory_order_relaxed); - this->parent->add_block_to_free_list(block); - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - - blockStartIndex = index; - endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + static_cast(BLOCK_SIZE); - endIndex = details::circular_less_than(firstIndex + static_cast(actualCount), endIndex) ? firstIndex + static_cast(actualCount) : endIndex; - } while (index != firstIndex + actualCount); - - MOODYCAMEL_RETHROW; - } - } - if (block->ConcurrentQueue::Block::template set_many_empty(blockStartIndex, static_cast(endIndex - blockStartIndex))) { - { + debug::DebugLock lock(mutex); +#endif + entry->value.store(nullptr, std::memory_order_relaxed); + this->parent->add_block_to_free_list(block); + } + indexIndex = + (indexIndex + 1) & (localBlockIndex->capacity - 1); + + blockStartIndex = index; + endIndex = (index & ~static_cast(BLOCK_SIZE - 1)) + + static_cast(BLOCK_SIZE); + endIndex = + details::circular_less_than( + firstIndex + static_cast(actualCount), + endIndex) + ? firstIndex + static_cast(actualCount) + : endIndex; + } while (index != firstIndex + actualCount); + + MOODYCAMEL_RETHROW; + } + } + if (block->ConcurrentQueue::Block::template set_many_empty< + implicit_context>( + blockStartIndex, + static_cast(endIndex - blockStartIndex))) { + { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - // Note that the set_many_empty above did a release, meaning that anybody who acquires the block - // we're about to free can use it safely since our writes (and reads!) will have happened-before then. 
- entry->value.store(nullptr, std::memory_order_relaxed); - } - this->parent->add_block_to_free_list(block); // releases the above store - } - indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); - } while (index != firstIndex + actualCount); - - return actualCount; - } - else { - this->dequeueOvercommit.fetch_add(desiredCount, std::memory_order_release); - } - } - - return 0; - } - - private: - // The block size must be > 1, so any number with the low bit set is an invalid block base index - static const index_t INVALID_BLOCK_BASE = 1; - - struct BlockIndexEntry - { - std::atomic key; - std::atomic value; - }; - - struct BlockIndexHeader - { - size_t capacity; - std::atomic tail; - BlockIndexEntry* entries; - BlockIndexEntry** index; - BlockIndexHeader* prev; - }; - - template - inline bool insert_block_index_entry(BlockIndexEntry*& idxEntry, index_t blockStartIndex) - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); // We're the only writer thread, relaxed is OK - if (localBlockIndex == nullptr) { - return false; // this can happen if new_block_index failed in the constructor - } - size_t newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || - idxEntry->value.load(std::memory_order_relaxed) == nullptr) { - - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - - // No room in the old block index, try to allocate another one! - MOODYCAMEL_CONSTEXPR_IF (allocMode == CannotAlloc) { - return false; - } - else if (!new_block_index()) { - return false; - } - else { - localBlockIndex = blockIndex.load(std::memory_order_relaxed); - newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & (localBlockIndex->capacity - 1); - idxEntry = localBlockIndex->index[newTail]; - assert(idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE); - idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); - localBlockIndex->tail.store(newTail, std::memory_order_release); - return true; - } - } - - inline void rewind_block_index_tail() - { - auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); - localBlockIndex->tail.store((localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & (localBlockIndex->capacity - 1), std::memory_order_relaxed); - } - - inline BlockIndexEntry* get_block_index_entry_for_index(index_t index) const - { - BlockIndexHeader* localBlockIndex; - auto idx = get_block_index_index_for_index(index, localBlockIndex); - return localBlockIndex->index[idx]; - } - - inline size_t get_block_index_index_for_index(index_t index, BlockIndexHeader*& localBlockIndex) const - { + debug::DebugLock lock(mutex); +#endif + // Note that the set_many_empty above did a release, meaning + // that anybody who acquires the block we're about to free can + // use it safely since our writes (and reads!) will have + // happened-before then. 
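
// Nearly every index comparison above goes through details::circular_less_than
// rather than a plain `<`, because the tail/head indices are allowed to wrap.
// A sketch of the usual formulation of that helper (treat `a` as behind `b`
// when the wrapped difference a - b lands in the upper half of the range);
// shown for illustration, not copied from this header:
#include <climits>
#include <type_traits>

template <typename T>
bool circular_less_than(T a, T b) {
  static_assert(std::is_unsigned<T>::value, "meant for unsigned index types");
  return static_cast<T>(a - b) >
         static_cast<T>(T(1) << (sizeof(T) * CHAR_BIT - 1));
}
// e.g. with 8-bit indices, circular_less_than<unsigned char>(250, 3) is true:
// 3 is "ahead of" 250 once the counter has wrapped past 255.
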
+ entry->value.store(nullptr, std::memory_order_relaxed); + } + this->parent->add_block_to_free_list( + block); // releases the above store + } + indexIndex = (indexIndex + 1) & (localBlockIndex->capacity - 1); + } while (index != firstIndex + actualCount); + + return actualCount; + } else { + this->dequeueOvercommit.fetch_add(desiredCount, + std::memory_order_release); + } + } + + return 0; + } + + private: + // The block size must be > 1, so any number with the low bit set is an + // invalid block base index + static const index_t INVALID_BLOCK_BASE = 1; + + struct BlockIndexEntry { + std::atomic key; + std::atomic value; + }; + + struct BlockIndexHeader { + size_t capacity; + std::atomic tail; + BlockIndexEntry *entries; + BlockIndexEntry **index; + BlockIndexHeader *prev; + }; + + template + inline bool insert_block_index_entry(BlockIndexEntry *&idxEntry, + index_t blockStartIndex) { + auto localBlockIndex = + blockIndex.load(std::memory_order_relaxed); // We're the only writer + // thread, relaxed is OK + if (localBlockIndex == nullptr) { + return false; // this can happen if new_block_index failed in the + // constructor + } + size_t newTail = + (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + if (idxEntry->key.load(std::memory_order_relaxed) == INVALID_BLOCK_BASE || + idxEntry->value.load(std::memory_order_relaxed) == nullptr) { + + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + + // No room in the old block index, try to allocate another one! + MOODYCAMEL_CONSTEXPR_IF(allocMode == CannotAlloc) { return false; } + else if (!new_block_index()) { + return false; + } + else { + localBlockIndex = blockIndex.load(std::memory_order_relaxed); + newTail = (localBlockIndex->tail.load(std::memory_order_relaxed) + 1) & + (localBlockIndex->capacity - 1); + idxEntry = localBlockIndex->index[newTail]; + assert(idxEntry->key.load(std::memory_order_relaxed) == + INVALID_BLOCK_BASE); + idxEntry->key.store(blockStartIndex, std::memory_order_relaxed); + localBlockIndex->tail.store(newTail, std::memory_order_release); + return true; + } + } + + inline void rewind_block_index_tail() { + auto localBlockIndex = blockIndex.load(std::memory_order_relaxed); + localBlockIndex->tail.store( + (localBlockIndex->tail.load(std::memory_order_relaxed) - 1) & + (localBlockIndex->capacity - 1), + std::memory_order_relaxed); + } + + inline BlockIndexEntry * + get_block_index_entry_for_index(index_t index) const { + BlockIndexHeader *localBlockIndex; + auto idx = get_block_index_index_for_index(index, localBlockIndex); + return localBlockIndex->index[idx]; + } + + inline size_t + get_block_index_index_for_index(index_t index, + BlockIndexHeader *&localBlockIndex) const { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - debug::DebugLock lock(mutex); -#endif - index &= ~static_cast(BLOCK_SIZE - 1); - localBlockIndex = blockIndex.load(std::memory_order_acquire); - auto tail = localBlockIndex->tail.load(std::memory_order_acquire); - auto tailBase = localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); - assert(tailBase != INVALID_BLOCK_BASE); - // Note: Must use division instead of shift because the index may wrap around, causing a negative - // offset, whose negativity we want to preserve - auto offset = static_cast(static_cast::type>(index - tailBase) / static_cast::type>(BLOCK_SIZE)); - size_t idx = (tail + 
offset) & (localBlockIndex->capacity - 1); - assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == index && localBlockIndex->index[idx]->value.load(std::memory_order_relaxed) != nullptr); - return idx; - } - - bool new_block_index() - { - auto prev = blockIndex.load(std::memory_order_relaxed); - size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; - auto entryCount = prev == nullptr ? nextBlockIndexCapacity : prevCapacity; - auto raw = static_cast((Traits::malloc)( - sizeof(BlockIndexHeader) + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry) * entryCount + - std::alignment_of::value - 1 + sizeof(BlockIndexEntry*) * nextBlockIndexCapacity)); - if (raw == nullptr) { - return false; - } - - auto header = new (raw) BlockIndexHeader; - auto entries = reinterpret_cast(details::align_for(raw + sizeof(BlockIndexHeader))); - auto index = reinterpret_cast(details::align_for(reinterpret_cast(entries) + sizeof(BlockIndexEntry) * entryCount)); - if (prev != nullptr) { - auto prevTail = prev->tail.load(std::memory_order_relaxed); - auto prevPos = prevTail; - size_t i = 0; - do { - prevPos = (prevPos + 1) & (prev->capacity - 1); - index[i++] = prev->index[prevPos]; - } while (prevPos != prevTail); - assert(i == prevCapacity); - } - for (size_t i = 0; i != entryCount; ++i) { - new (entries + i) BlockIndexEntry; - entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); - index[prevCapacity + i] = entries + i; - } - header->prev = prev; - header->entries = entries; - header->index = index; - header->capacity = nextBlockIndexCapacity; - header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), std::memory_order_relaxed); - - blockIndex.store(header, std::memory_order_release); - - nextBlockIndexCapacity <<= 1; - - return true; - } - - private: - size_t nextBlockIndexCapacity; - std::atomic blockIndex; + debug::DebugLock lock(mutex); +#endif + index &= ~static_cast(BLOCK_SIZE - 1); + localBlockIndex = blockIndex.load(std::memory_order_acquire); + auto tail = localBlockIndex->tail.load(std::memory_order_acquire); + auto tailBase = + localBlockIndex->index[tail]->key.load(std::memory_order_relaxed); + assert(tailBase != INVALID_BLOCK_BASE); + // Note: Must use division instead of shift because the index may wrap + // around, causing a negative offset, whose negativity we want to preserve + auto offset = static_cast( + static_cast::type>(index - + tailBase) / + static_cast::type>(BLOCK_SIZE)); + size_t idx = (tail + offset) & (localBlockIndex->capacity - 1); + assert(localBlockIndex->index[idx]->key.load(std::memory_order_relaxed) == + index && + localBlockIndex->index[idx]->value.load( + std::memory_order_relaxed) != nullptr); + return idx; + } + + bool new_block_index() { + auto prev = blockIndex.load(std::memory_order_relaxed); + size_t prevCapacity = prev == nullptr ? 0 : prev->capacity; + auto entryCount = prev == nullptr ? 
nextBlockIndexCapacity : prevCapacity; + auto raw = static_cast((Traits::malloc)( + sizeof(BlockIndexHeader) + std::alignment_of::value - + 1 + sizeof(BlockIndexEntry) * entryCount + + std::alignment_of::value - 1 + + sizeof(BlockIndexEntry *) * nextBlockIndexCapacity)); + if (raw == nullptr) { + return false; + } + + auto header = new (raw) BlockIndexHeader; + auto entries = reinterpret_cast( + details::align_for(raw + sizeof(BlockIndexHeader))); + auto index = reinterpret_cast( + details::align_for( + reinterpret_cast(entries) + + sizeof(BlockIndexEntry) * entryCount)); + if (prev != nullptr) { + auto prevTail = prev->tail.load(std::memory_order_relaxed); + auto prevPos = prevTail; + size_t i = 0; + do { + prevPos = (prevPos + 1) & (prev->capacity - 1); + index[i++] = prev->index[prevPos]; + } while (prevPos != prevTail); + assert(i == prevCapacity); + } + for (size_t i = 0; i != entryCount; ++i) { + new (entries + i) BlockIndexEntry; + entries[i].key.store(INVALID_BLOCK_BASE, std::memory_order_relaxed); + index[prevCapacity + i] = entries + i; + } + header->prev = prev; + header->entries = entries; + header->index = index; + header->capacity = nextBlockIndexCapacity; + header->tail.store((prevCapacity - 1) & (nextBlockIndexCapacity - 1), + std::memory_order_relaxed); + + blockIndex.store(header, std::memory_order_release); + + nextBlockIndexCapacity <<= 1; + + return true; + } + + private: + size_t nextBlockIndexCapacity; + std::atomic blockIndex; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - public: - details::ThreadExitListener threadExitListener; - private: + public: + details::ThreadExitListener threadExitListener; + + private: #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - public: - ImplicitProducer* nextImplicitProducer; - private: + public: + ImplicitProducer *nextImplicitProducer; + + private: #endif #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODBLOCKINDEX - mutable debug::DebugMutex mutex; + mutable debug::DebugMutex mutex; #endif #ifdef MCDBGQ_TRACKMEM - friend struct MemStats; -#endif - }; - - - ////////////////////////////////// - // Block pool manipulation - ////////////////////////////////// - - void populate_initial_block_list(size_t blockCount) - { - initialBlockPoolSize = blockCount; - if (initialBlockPoolSize == 0) { - initialBlockPool = nullptr; - return; - } - - initialBlockPool = create_array(blockCount); - if (initialBlockPool == nullptr) { - initialBlockPoolSize = 0; - } - for (size_t i = 0; i < initialBlockPoolSize; ++i) { - initialBlockPool[i].dynamicallyAllocated = false; - } - } - - inline Block* try_get_block_from_initial_pool() - { - if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= initialBlockPoolSize) { - return nullptr; - } - - auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); - - return index < initialBlockPoolSize ? 
(initialBlockPool + index) : nullptr; - } - - inline void add_block_to_free_list(Block* block) - { + friend struct MemStats; +#endif + }; + + ////////////////////////////////// + // Block pool manipulation + ////////////////////////////////// + + void populate_initial_block_list(size_t blockCount) { + initialBlockPoolSize = blockCount; + if (initialBlockPoolSize == 0) { + initialBlockPool = nullptr; + return; + } + + initialBlockPool = create_array(blockCount); + if (initialBlockPool == nullptr) { + initialBlockPoolSize = 0; + } + for (size_t i = 0; i < initialBlockPoolSize; ++i) { + initialBlockPool[i].dynamicallyAllocated = false; + } + } + + inline Block *try_get_block_from_initial_pool() { + if (initialBlockPoolIndex.load(std::memory_order_relaxed) >= + initialBlockPoolSize) { + return nullptr; + } + + auto index = initialBlockPoolIndex.fetch_add(1, std::memory_order_relaxed); + + return index < initialBlockPoolSize ? (initialBlockPool + index) : nullptr; + } + + inline void add_block_to_free_list(Block *block) { #ifdef MCDBGQ_TRACKMEM - block->owner = nullptr; -#endif - if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { - destroy(block); - } - else { - freeList.add(block); - } - } - - inline void add_blocks_to_free_list(Block* block) - { - while (block != nullptr) { - auto next = block->next; - add_block_to_free_list(block); - block = next; - } - } - - inline Block* try_get_block_from_free_list() - { - return freeList.try_get(); - } - - // Gets a free block from one of the memory pools, or allocates a new one (if applicable) - template - Block* requisition_block() - { - auto block = try_get_block_from_initial_pool(); - if (block != nullptr) { - return block; - } - - block = try_get_block_from_free_list(); - if (block != nullptr) { - return block; - } - - MOODYCAMEL_CONSTEXPR_IF (canAlloc == CanAlloc) { - return create(); - } - else { - return nullptr; - } - } - + block->owner = nullptr; +#endif + if (!Traits::RECYCLE_ALLOCATED_BLOCKS && block->dynamicallyAllocated) { + destroy(block); + } else { + freeList.add(block); + } + } + + inline void add_blocks_to_free_list(Block *block) { + while (block != nullptr) { + auto next = block->next; + add_block_to_free_list(block); + block = next; + } + } + + inline Block *try_get_block_from_free_list() { return freeList.try_get(); } + + // Gets a free block from one of the memory pools, or allocates a new one (if + // applicable) + template Block *requisition_block() { + auto block = try_get_block_from_initial_pool(); + if (block != nullptr) { + return block; + } + + block = try_get_block_from_free_list(); + if (block != nullptr) { + return block; + } + + MOODYCAMEL_CONSTEXPR_IF(canAlloc == CanAlloc) { return create(); } + else { + return nullptr; + } + } #ifdef MCDBGQ_TRACKMEM - public: - struct MemStats { - size_t allocatedBlocks; - size_t usedBlocks; - size_t freeBlocks; - size_t ownedBlocksExplicit; - size_t ownedBlocksImplicit; - size_t implicitProducers; - size_t explicitProducers; - size_t elementsEnqueued; - size_t blockClassBytes; - size_t queueClassBytes; - size_t implicitBlockIndexBytes; - size_t explicitBlockIndexBytes; - - friend class ConcurrentQueue; - - private: - static MemStats getFor(ConcurrentQueue* q) - { - MemStats stats = { 0 }; - - stats.elementsEnqueued = q->size_approx(); - - auto block = q->freeList.head_unsafe(); - while (block != nullptr) { - ++stats.allocatedBlocks; - ++stats.freeBlocks; - block = block->freeListNext.load(std::memory_order_relaxed); - } - - for (auto ptr = 
q->producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - bool implicit = dynamic_cast(ptr) != nullptr; - stats.implicitProducers += implicit ? 1 : 0; - stats.explicitProducers += implicit ? 0 : 1; - - if (implicit) { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ImplicitProducer); - auto head = prod->headIndex.load(std::memory_order_relaxed); - auto tail = prod->tailIndex.load(std::memory_order_relaxed); - auto hash = prod->blockIndex.load(std::memory_order_relaxed); - if (hash != nullptr) { - for (size_t i = 0; i != hash->capacity; ++i) { - if (hash->index[i]->key.load(std::memory_order_relaxed) != ImplicitProducer::INVALID_BLOCK_BASE && hash->index[i]->value.load(std::memory_order_relaxed) != nullptr) { - ++stats.allocatedBlocks; - ++stats.ownedBlocksImplicit; - } - } - stats.implicitBlockIndexBytes += hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry); - for (; hash != nullptr; hash = hash->prev) { - stats.implicitBlockIndexBytes += sizeof(typename ImplicitProducer::BlockIndexHeader) + hash->capacity * sizeof(typename ImplicitProducer::BlockIndexEntry*); - } - } - for (; details::circular_less_than(head, tail); head += BLOCK_SIZE) { - //auto block = prod->get_block_index_entry_for_index(head); - ++stats.usedBlocks; - } - } - else { - auto prod = static_cast(ptr); - stats.queueClassBytes += sizeof(ExplicitProducer); - auto tailBlock = prod->tailBlock; - bool wasNonEmpty = false; - if (tailBlock != nullptr) { - auto block = tailBlock; - do { - ++stats.allocatedBlocks; - if (!block->ConcurrentQueue::Block::template is_empty() || wasNonEmpty) { - ++stats.usedBlocks; - wasNonEmpty = wasNonEmpty || block != tailBlock; - } - ++stats.ownedBlocksExplicit; - block = block->next; - } while (block != tailBlock); - } - auto index = prod->blockIndex.load(std::memory_order_relaxed); - while (index != nullptr) { - stats.explicitBlockIndexBytes += sizeof(typename ExplicitProducer::BlockIndexHeader) + index->size * sizeof(typename ExplicitProducer::BlockIndexEntry); - index = static_cast(index->prev); - } - } - } - - auto freeOnInitialPool = q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= q->initialBlockPoolSize ? 0 : q->initialBlockPoolSize - q->initialBlockPoolIndex.load(std::memory_order_relaxed); - stats.allocatedBlocks += freeOnInitialPool; - stats.freeBlocks += freeOnInitialPool; - - stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; - stats.queueClassBytes += sizeof(ConcurrentQueue); - - return stats; - } - }; - - // For debugging only. Not thread-safe. 
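
// requisition_block() a little further up tries three tiers in order: the
// pre-sized initial pool, then the lock-free free list of recycled blocks, and
// only then a fresh allocation when the allocation mode permits it. The same
// shape reduced to stubs; Block, pool_take, freelist_take and heap_new are
// placeholders for illustration, not names from this header.
struct Block {};                                    // stand-in for the real block type

inline Block *pool_take() { return nullptr; }       // stub: initial pool exhausted
inline Block *freelist_take() { return nullptr; }   // stub: nothing recycled yet
inline Block *heap_new() { return new Block{}; }    // stub: plain heap allocation

enum AllocationMode { CanAlloc, CannotAlloc };

template <AllocationMode canAlloc>
Block *requisition_block_sketch() {
  if (Block *b = pool_take()) return b;             // tier 1: bump-index initial pool
  if (Block *b = freelist_take()) return b;         // tier 2: lock-free free list
  if (canAlloc == CanAlloc) return heap_new();      // tier 3: allocate, if permitted
  return nullptr;                                   // callers treat this as "full"
}
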
- MemStats getMemStats() - { - return MemStats::getFor(this); - } - private: - friend struct MemStats; -#endif - - - ////////////////////////////////// - // Producer list manipulation - ////////////////////////////////// - - ProducerBase* recycle_or_create_producer(bool isExplicit) - { +public: + struct MemStats { + size_t allocatedBlocks; + size_t usedBlocks; + size_t freeBlocks; + size_t ownedBlocksExplicit; + size_t ownedBlocksImplicit; + size_t implicitProducers; + size_t explicitProducers; + size_t elementsEnqueued; + size_t blockClassBytes; + size_t queueClassBytes; + size_t implicitBlockIndexBytes; + size_t explicitBlockIndexBytes; + + friend class ConcurrentQueue; + + private: + static MemStats getFor(ConcurrentQueue *q) { + MemStats stats = {0}; + + stats.elementsEnqueued = q->size_approx(); + + auto block = q->freeList.head_unsafe(); + while (block != nullptr) { + ++stats.allocatedBlocks; + ++stats.freeBlocks; + block = block->freeListNext.load(std::memory_order_relaxed); + } + + for (auto ptr = q->producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + bool implicit = dynamic_cast(ptr) != nullptr; + stats.implicitProducers += implicit ? 1 : 0; + stats.explicitProducers += implicit ? 0 : 1; + + if (implicit) { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ImplicitProducer); + auto head = prod->headIndex.load(std::memory_order_relaxed); + auto tail = prod->tailIndex.load(std::memory_order_relaxed); + auto hash = prod->blockIndex.load(std::memory_order_relaxed); + if (hash != nullptr) { + for (size_t i = 0; i != hash->capacity; ++i) { + if (hash->index[i]->key.load(std::memory_order_relaxed) != + ImplicitProducer::INVALID_BLOCK_BASE && + hash->index[i]->value.load(std::memory_order_relaxed) != + nullptr) { + ++stats.allocatedBlocks; + ++stats.ownedBlocksImplicit; + } + } + stats.implicitBlockIndexBytes += + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry); + for (; hash != nullptr; hash = hash->prev) { + stats.implicitBlockIndexBytes += + sizeof(typename ImplicitProducer::BlockIndexHeader) + + hash->capacity * + sizeof(typename ImplicitProducer::BlockIndexEntry *); + } + } + for (; details::circular_less_than(head, tail); + head += BLOCK_SIZE) { + // auto block = prod->get_block_index_entry_for_index(head); + ++stats.usedBlocks; + } + } else { + auto prod = static_cast(ptr); + stats.queueClassBytes += sizeof(ExplicitProducer); + auto tailBlock = prod->tailBlock; + bool wasNonEmpty = false; + if (tailBlock != nullptr) { + auto block = tailBlock; + do { + ++stats.allocatedBlocks; + if (!block->ConcurrentQueue::Block::template is_empty< + explicit_context>() || + wasNonEmpty) { + ++stats.usedBlocks; + wasNonEmpty = wasNonEmpty || block != tailBlock; + } + ++stats.ownedBlocksExplicit; + block = block->next; + } while (block != tailBlock); + } + auto index = prod->blockIndex.load(std::memory_order_relaxed); + while (index != nullptr) { + stats.explicitBlockIndexBytes += + sizeof(typename ExplicitProducer::BlockIndexHeader) + + index->size * + sizeof(typename ExplicitProducer::BlockIndexEntry); + index = static_cast( + index->prev); + } + } + } + + auto freeOnInitialPool = + q->initialBlockPoolIndex.load(std::memory_order_relaxed) >= + q->initialBlockPoolSize + ? 
0 + : q->initialBlockPoolSize - + q->initialBlockPoolIndex.load(std::memory_order_relaxed); + stats.allocatedBlocks += freeOnInitialPool; + stats.freeBlocks += freeOnInitialPool; + + stats.blockClassBytes = sizeof(Block) * stats.allocatedBlocks; + stats.queueClassBytes += sizeof(ConcurrentQueue); + + return stats; + } + }; + + // For debugging only. Not thread-safe. + MemStats getMemStats() { return MemStats::getFor(this); } + +private: + friend struct MemStats; +#endif + + ////////////////////////////////// + // Producer list manipulation + ////////////////////////////////// + + ProducerBase *recycle_or_create_producer(bool isExplicit) { #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - // Try to re-use one first - for (auto ptr = producerListTail.load(std::memory_order_acquire); ptr != nullptr; ptr = ptr->next_prod()) { - if (ptr->inactive.load(std::memory_order_relaxed) && ptr->isExplicit == isExplicit) { - bool expected = true; - if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, std::memory_order_acquire, std::memory_order_relaxed)) { - // We caught one! It's been marked as activated, the caller can have it - return ptr; - } - } - } - - return add_producer(isExplicit ? static_cast(create(this)) : create(this)); - } - - ProducerBase* add_producer(ProducerBase* producer) - { - // Handle failed memory allocation - if (producer == nullptr) { - return nullptr; - } - - producerCount.fetch_add(1, std::memory_order_relaxed); - - // Add it to the lock-free list - auto prevTail = producerListTail.load(std::memory_order_relaxed); - do { - producer->next = prevTail; - } while (!producerListTail.compare_exchange_weak(prevTail, producer, std::memory_order_release, std::memory_order_relaxed)); - + debug::DebugLock lock(implicitProdMutex); +#endif + // Try to re-use one first + for (auto ptr = producerListTail.load(std::memory_order_acquire); + ptr != nullptr; ptr = ptr->next_prod()) { + if (ptr->inactive.load(std::memory_order_relaxed) && + ptr->isExplicit == isExplicit) { + bool expected = true; + if (ptr->inactive.compare_exchange_strong(expected, /* desired */ false, + std::memory_order_acquire, + std::memory_order_relaxed)) { + // We caught one! It's been marked as activated, the caller can have + // it + return ptr; + } + } + } + + return add_producer( + isExplicit ? 
static_cast(create(this)) + : create(this)); + } + + ProducerBase *add_producer(ProducerBase *producer) { + // Handle failed memory allocation + if (producer == nullptr) { + return nullptr; + } + + producerCount.fetch_add(1, std::memory_order_relaxed); + + // Add it to the lock-free list + auto prevTail = producerListTail.load(std::memory_order_relaxed); + do { + producer->next = prevTail; + } while (!producerListTail.compare_exchange_weak( + prevTail, producer, std::memory_order_release, + std::memory_order_relaxed)); + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - if (producer->isExplicit) { - auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextExplicitProducer = prevTailExplicit; - } while (!explicitProducers.compare_exchange_weak(prevTailExplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } - else { - auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); - do { - static_cast(producer)->nextImplicitProducer = prevTailImplicit; - } while (!implicitProducers.compare_exchange_weak(prevTailImplicit, static_cast(producer), std::memory_order_release, std::memory_order_relaxed)); - } -#endif - - return producer; - } - - void reown_producers() - { - // After another instance is moved-into/swapped-with this one, all the - // producers we stole still think their parents are the other queue. - // So fix them up! - for (auto ptr = producerListTail.load(std::memory_order_relaxed); ptr != nullptr; ptr = ptr->next_prod()) { - ptr->parent = this; - } - } - - - ////////////////////////////////// - // Implicit producer hash - ////////////////////////////////// - - struct ImplicitProducerKVP - { - std::atomic key; - ImplicitProducer* value; // No need for atomicity since it's only read by the thread that sets it in the first place - - ImplicitProducerKVP() : value(nullptr) { } - - ImplicitProducerKVP(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - key.store(other.key.load(std::memory_order_relaxed), std::memory_order_relaxed); - value = other.value; - } - - inline ImplicitProducerKVP& operator=(ImplicitProducerKVP&& other) MOODYCAMEL_NOEXCEPT - { - swap(other); - return *this; - } - - inline void swap(ImplicitProducerKVP& other) MOODYCAMEL_NOEXCEPT - { - if (this != &other) { - details::swap_relaxed(key, other.key); - std::swap(value, other.value); - } - } - }; - - template - friend void moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP&, typename ConcurrentQueue::ImplicitProducerKVP&) MOODYCAMEL_NOEXCEPT; - - struct ImplicitProducerHash - { - size_t capacity; - ImplicitProducerKVP* entries; - ImplicitProducerHash* prev; - }; - - inline void populate_initial_implicit_producer_hash() - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - implicitProducerHashCount.store(0, std::memory_order_relaxed); - auto hash = &initialImplicitProducerHash; - hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; - hash->entries = &initialImplicitProducerHashEntries[0]; - for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { - initialImplicitProducerHashEntries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - hash->prev = nullptr; - implicitProducerHash.store(hash, std::memory_order_relaxed); - } - } - - void swap_implicit_producer_hashes(ConcurrentQueue& other) - { - MOODYCAMEL_CONSTEXPR_IF (INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { - return; - } - else { - // Swap (assumes our implicit producer hash is 
initialized) - initialImplicitProducerHashEntries.swap(other.initialImplicitProducerHashEntries); - initialImplicitProducerHash.entries = &initialImplicitProducerHashEntries[0]; - other.initialImplicitProducerHash.entries = &other.initialImplicitProducerHashEntries[0]; - - details::swap_relaxed(implicitProducerHashCount, other.implicitProducerHashCount); - - details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); - if (implicitProducerHash.load(std::memory_order_relaxed) == &other.initialImplicitProducerHash) { - implicitProducerHash.store(&initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &other.initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &initialImplicitProducerHash; - } - if (other.implicitProducerHash.load(std::memory_order_relaxed) == &initialImplicitProducerHash) { - other.implicitProducerHash.store(&other.initialImplicitProducerHash, std::memory_order_relaxed); - } - else { - ImplicitProducerHash* hash; - for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); hash->prev != &initialImplicitProducerHash; hash = hash->prev) { - continue; - } - hash->prev = &other.initialImplicitProducerHash; - } - } - } - - // Only fails (returns nullptr) if memory allocation fails - ImplicitProducer* get_or_add_implicit_producer() - { - // Note that since the data is essentially thread-local (key is thread ID), - // there's a reduced need for fences (memory ordering is already consistent - // for any individual thread), except for the current table itself. - - // Start by looking for the thread ID in the current and all previous hash tables. - // If it's not found, it must not be in there yet, since this same thread would - // have added it previously to one of the tables that we traversed. - - // Code and algorithm adapted from http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table - + if (producer->isExplicit) { + auto prevTailExplicit = explicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextExplicitProducer = + prevTailExplicit; + } while (!explicitProducers.compare_exchange_weak( + prevTailExplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } else { + auto prevTailImplicit = implicitProducers.load(std::memory_order_relaxed); + do { + static_cast(producer)->nextImplicitProducer = + prevTailImplicit; + } while (!implicitProducers.compare_exchange_weak( + prevTailImplicit, static_cast(producer), + std::memory_order_release, std::memory_order_relaxed)); + } +#endif + + return producer; + } + + void reown_producers() { + // After another instance is moved-into/swapped-with this one, all the + // producers we stole still think their parents are the other queue. + // So fix them up! 
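
// add_producer() above links a new producer into a lock-free singly linked
// list with a compare_exchange_weak loop (and recycle_or_create_producer
// first tries to reclaim an inactive producer via compare_exchange_strong on
// its flag). The push pattern in isolation; Node and list_head are
// illustrative stand-ins for ProducerBase and producerListTail.
#include <atomic>

struct Node {
  Node *next = nullptr;
};

inline void push(std::atomic<Node *> &list_head, Node *node) {
  Node *prev = list_head.load(std::memory_order_relaxed);
  do {
    node->next = prev;   // re-point at the freshly observed head on every retry
  } while (!list_head.compare_exchange_weak(prev, node,
                                            std::memory_order_release,
                                            std::memory_order_relaxed));
}
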
+ for (auto ptr = producerListTail.load(std::memory_order_relaxed); + ptr != nullptr; ptr = ptr->next_prod()) { + ptr->parent = this; + } + } + + ////////////////////////////////// + // Implicit producer hash + ////////////////////////////////// + + struct ImplicitProducerKVP { + std::atomic key; + ImplicitProducer *value; // No need for atomicity since it's only read by + // the thread that sets it in the first place + + ImplicitProducerKVP() : value(nullptr) {} + + ImplicitProducerKVP(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT { + key.store(other.key.load(std::memory_order_relaxed), + std::memory_order_relaxed); + value = other.value; + } + + inline ImplicitProducerKVP & + operator=(ImplicitProducerKVP &&other) MOODYCAMEL_NOEXCEPT { + swap(other); + return *this; + } + + inline void swap(ImplicitProducerKVP &other) MOODYCAMEL_NOEXCEPT { + if (this != &other) { + details::swap_relaxed(key, other.key); + std::swap(value, other.value); + } + } + }; + + template + friend void + moodycamel::swap(typename ConcurrentQueue::ImplicitProducerKVP &, + typename ConcurrentQueue::ImplicitProducerKVP &) + MOODYCAMEL_NOEXCEPT; + + struct ImplicitProducerHash { + size_t capacity; + ImplicitProducerKVP *entries; + ImplicitProducerHash *prev; + }; + + inline void populate_initial_implicit_producer_hash() { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + implicitProducerHashCount.store(0, std::memory_order_relaxed); + auto hash = &initialImplicitProducerHash; + hash->capacity = INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; + hash->entries = &initialImplicitProducerHashEntries[0]; + for (size_t i = 0; i != INITIAL_IMPLICIT_PRODUCER_HASH_SIZE; ++i) { + initialImplicitProducerHashEntries[i].key.store( + details::invalid_thread_id, std::memory_order_relaxed); + } + hash->prev = nullptr; + implicitProducerHash.store(hash, std::memory_order_relaxed); + } + } + + void swap_implicit_producer_hashes(ConcurrentQueue &other) { + MOODYCAMEL_CONSTEXPR_IF(INITIAL_IMPLICIT_PRODUCER_HASH_SIZE == 0) { + return; + } + else { + // Swap (assumes our implicit producer hash is initialized) + initialImplicitProducerHashEntries.swap( + other.initialImplicitProducerHashEntries); + initialImplicitProducerHash.entries = + &initialImplicitProducerHashEntries[0]; + other.initialImplicitProducerHash.entries = + &other.initialImplicitProducerHashEntries[0]; + + details::swap_relaxed(implicitProducerHashCount, + other.implicitProducerHashCount); + + details::swap_relaxed(implicitProducerHash, other.implicitProducerHash); + if (implicitProducerHash.load(std::memory_order_relaxed) == + &other.initialImplicitProducerHash) { + implicitProducerHash.store(&initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &other.initialImplicitProducerHash; + hash = hash->prev) { + continue; + } + hash->prev = &initialImplicitProducerHash; + } + if (other.implicitProducerHash.load(std::memory_order_relaxed) == + &initialImplicitProducerHash) { + other.implicitProducerHash.store(&other.initialImplicitProducerHash, + std::memory_order_relaxed); + } else { + ImplicitProducerHash *hash; + for (hash = other.implicitProducerHash.load(std::memory_order_relaxed); + hash->prev != &initialImplicitProducerHash; hash = hash->prev) { + continue; + } + hash->prev = &other.initialImplicitProducerHash; + } + } + } + + // Only fails (returns nullptr) if memory allocation fails + ImplicitProducer 
*get_or_add_implicit_producer() { + // Note that since the data is essentially thread-local (key is thread ID), + // there's a reduced need for fences (memory ordering is already consistent + // for any individual thread), except for the current table itself. + + // Start by looking for the thread ID in the current and all previous hash + // tables. If it's not found, it must not be in there yet, since this same + // thread would have added it previously to one of the tables that we + // traversed. + + // Code and algorithm adapted from + // http://preshing.com/20130605/the-worlds-simplest-lock-free-hash-table + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - - auto mainHash = implicitProducerHash.load(std::memory_order_acquire); - assert(mainHash != nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) - for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { - // Look for the id in this hash - auto index = hashedId; - while (true) { // Not an infinite loop because at least one slot is free in the hash table - index &= hash->capacity - 1u; - - auto probedKey = hash->entries[index].key.load(std::memory_order_relaxed); - if (probedKey == id) { - // Found it! If we had to search several hashes deep, though, we should lazily add it - // to the current main hash table to avoid the extended search next time. - // Note there's guaranteed to be room in the current hash table since every subsequent - // table implicitly reserves space for all previous tables (there's only one - // implicitProducerHashCount). - auto value = hash->entries[index].value; - if (hash != mainHash) { - index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + debug::DebugLock lock(implicitProdMutex); +#endif + + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + + auto mainHash = implicitProducerHash.load(std::memory_order_acquire); + assert( + mainHash != + nullptr); // silence clang-tidy and MSVC warnings (hash cannot be null) + for (auto hash = mainHash; hash != nullptr; hash = hash->prev) { + // Look for the id in this hash + auto index = hashedId; + while (true) { // Not an infinite loop because at least one slot is free + // in the hash table + index &= hash->capacity - 1u; + + auto probedKey = + hash->entries[index].key.load(std::memory_order_relaxed); + if (probedKey == id) { + // Found it! If we had to search several hashes deep, though, we + // should lazily add it to the current main hash table to avoid the + // extended search next time. Note there's guaranteed to be room in + // the current hash table since every subsequent table implicitly + // reserves space for all previous tables (there's only one + // implicitProducerHashCount). 
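
// The lookup above is an open-addressed, linearly probed table keyed by thread
// id, with power-of-two capacity so probing is a single mask. A cut-down
// find-or-claim over one fixed table, assuming (as the real code guarantees by
// resizing) that a free slot always exists; Entry, kCapacity, the uintptr_t id
// type and the omitted id hashing are simplifications for illustration.
#include <atomic>
#include <cstddef>
#include <cstdint>

constexpr std::uintptr_t kInvalidId = 0;   // sentinel marking an empty slot
constexpr std::size_t kCapacity = 64;      // must stay a power of two

struct Entry {
  std::atomic<std::uintptr_t> key{kInvalidId};
  void *value = nullptr;                   // only touched by the claiming thread
};

inline Entry *find_or_claim(Entry (&table)[kCapacity], std::uintptr_t id) {
  for (std::size_t index = static_cast<std::size_t>(id);; ++index) {
    Entry &e = table[index & (kCapacity - 1)];
    auto probed = e.key.load(std::memory_order_relaxed);
    if (probed == id)
      return &e;                           // this thread already owns a slot
    if (probed == kInvalidId) {
      auto expected = kInvalidId;          // empty: try to claim it for ourselves
      if (e.key.compare_exchange_strong(expected, id, std::memory_order_seq_cst,
                                        std::memory_order_relaxed))
        return &e;
      // lost the race to another thread; fall through and keep probing
    }
  }
}
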
+ auto value = hash->entries[index].value; + if (hash != mainHash) { + index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed) || - mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed) || + mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { #else - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { -#endif - mainHash->entries[index].value = value; - break; - } - ++index; - } - } - - return value; - } - if (probedKey == details::invalid_thread_id) { - break; // Not in this hash table - } - ++index; - } - } - - // Insert! - auto newCount = 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); - while (true) { - // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) - if (newCount >= (mainHash->capacity >> 1) && !implicitProducerHashResizeInProgress.test_and_set(std::memory_order_acquire)) { - // We've acquired the resize lock, try to allocate a bigger hash table. - // Note the acquire fence synchronizes with the release fence at the end of this block, and hence when - // we reload implicitProducerHash it must be the most recent version (it only gets changed within this - // locked block). 
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - if (newCount >= (mainHash->capacity >> 1)) { - size_t newCapacity = mainHash->capacity << 1; - while (newCount >= (newCapacity >> 1)) { - newCapacity <<= 1; - } - auto raw = static_cast((Traits::malloc)(sizeof(ImplicitProducerHash) + std::alignment_of::value - 1 + sizeof(ImplicitProducerKVP) * newCapacity)); - if (raw == nullptr) { - // Allocation failed - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - implicitProducerHashResizeInProgress.clear(std::memory_order_relaxed); - return nullptr; - } - - auto newHash = new (raw) ImplicitProducerHash; - newHash->capacity = static_cast(newCapacity); - newHash->entries = reinterpret_cast(details::align_for(raw + sizeof(ImplicitProducerHash))); - for (size_t i = 0; i != newCapacity; ++i) { - new (newHash->entries + i) ImplicitProducerKVP; - newHash->entries[i].key.store(details::invalid_thread_id, std::memory_order_relaxed); - } - newHash->prev = mainHash; - implicitProducerHash.store(newHash, std::memory_order_release); - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - mainHash = newHash; - } - else { - implicitProducerHashResizeInProgress.clear(std::memory_order_release); - } - } - - // If it's < three-quarters full, add to the old one anyway so that we don't have to wait for the next table - // to finish being allocated by another thread (and if we just finished allocating above, the condition will - // always be true) - if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { - auto producer = static_cast(recycle_or_create_producer(false)); - if (producer == nullptr) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); - return nullptr; - } - + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { +#endif + mainHash->entries[index].value = value; + break; + } + ++index; + } + } + + return value; + } + if (probedKey == details::invalid_thread_id) { + break; // Not in this hash table + } + ++index; + } + } + + // Insert! + auto newCount = + 1 + implicitProducerHashCount.fetch_add(1, std::memory_order_relaxed); + while (true) { + // NOLINTNEXTLINE(clang-analyzer-core.NullDereference) + if (newCount >= (mainHash->capacity >> 1) && + !implicitProducerHashResizeInProgress.test_and_set( + std::memory_order_acquire)) { + // We've acquired the resize lock, try to allocate a bigger hash table. + // Note the acquire fence synchronizes with the release fence at the end + // of this block, and hence when we reload implicitProducerHash it must + // be the most recent version (it only gets changed within this locked + // block). 
+ mainHash = implicitProducerHash.load(std::memory_order_acquire); + if (newCount >= (mainHash->capacity >> 1)) { + size_t newCapacity = mainHash->capacity << 1; + while (newCount >= (newCapacity >> 1)) { + newCapacity <<= 1; + } + auto raw = static_cast( + (Traits::malloc)(sizeof(ImplicitProducerHash) + + std::alignment_of::value - + 1 + sizeof(ImplicitProducerKVP) * newCapacity)); + if (raw == nullptr) { + // Allocation failed + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + implicitProducerHashResizeInProgress.clear( + std::memory_order_relaxed); + return nullptr; + } + + auto newHash = new (raw) ImplicitProducerHash; + newHash->capacity = static_cast(newCapacity); + newHash->entries = reinterpret_cast( + details::align_for( + raw + sizeof(ImplicitProducerHash))); + for (size_t i = 0; i != newCapacity; ++i) { + new (newHash->entries + i) ImplicitProducerKVP; + newHash->entries[i].key.store(details::invalid_thread_id, + std::memory_order_relaxed); + } + newHash->prev = mainHash; + implicitProducerHash.store(newHash, std::memory_order_release); + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + mainHash = newHash; + } else { + implicitProducerHashResizeInProgress.clear(std::memory_order_release); + } + } + + // If it's < three-quarters full, add to the old one anyway so that we + // don't have to wait for the next table to finish being allocated by + // another thread (and if we just finished allocating above, the condition + // will always be true) + if (newCount < (mainHash->capacity >> 1) + (mainHash->capacity >> 2)) { + auto producer = + static_cast(recycle_or_create_producer(false)); + if (producer == nullptr) { + implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); + return nullptr; + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - producer->threadExitListener.callback = &ConcurrentQueue::implicit_producer_thread_exited_callback; - producer->threadExitListener.userData = producer; - details::ThreadExitNotifier::subscribe(&producer->threadExitListener); -#endif - - auto index = hashedId; - while (true) { - index &= mainHash->capacity - 1u; - auto empty = details::invalid_thread_id; + producer->threadExitListener.callback = + &ConcurrentQueue::implicit_producer_thread_exited_callback; + producer->threadExitListener.userData = producer; + details::ThreadExitNotifier::subscribe(&producer->threadExitListener); +#endif + + auto index = hashedId; + while (true) { + index &= mainHash->capacity - 1u; + auto empty = details::invalid_thread_id; #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - auto reusable = details::invalid_thread_id2; - if (mainHash->entries[index].key.compare_exchange_strong(reusable, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - implicitProducerHashCount.fetch_sub(1, std::memory_order_relaxed); // already counted as a used slot - mainHash->entries[index].value = producer; - break; - } -#endif - if (mainHash->entries[index].key.compare_exchange_strong(empty, id, std::memory_order_seq_cst, std::memory_order_relaxed)) { - mainHash->entries[index].value = producer; - break; - } - ++index; - } - return producer; - } - - // Hmm, the old hash is quite full and somebody else is busy allocating a new one. - // We need to wait for the allocating thread to finish (if it succeeds, we add, if not, - // we try to allocate ourselves). 
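A minimal sketch (not part of the patch) of the resize policy visible in this hunk: the table grows once the logical count reaches half the capacity, the new capacity is doubled until the count drops back below half, and inserts still go to the old table while it is under three-quarters full. `next_capacity` and `can_insert_without_waiting` are hypothetical helper names.

#include <cassert>
#include <cstddef>

// Pick the next power-of-two capacity so that `count` stays below half of it.
inline std::size_t next_capacity(std::size_t current, std::size_t count) {
  assert(current != 0); // expected to be a nonzero power of two
  std::size_t newCapacity = current << 1;
  while (count >= (newCapacity >> 1))
    newCapacity <<= 1;
  return newCapacity;
}

// True while the current table is below the three-quarters-full threshold,
// i.e. an insert does not have to wait for a pending resize to finish.
inline bool can_insert_without_waiting(std::size_t capacity,
                                       std::size_t count) {
  return count < (capacity >> 1) + (capacity >> 2);
}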
- mainHash = implicitProducerHash.load(std::memory_order_acquire); - } - } - + auto reusable = details::invalid_thread_id2; + if (mainHash->entries[index].key.compare_exchange_strong( + reusable, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + implicitProducerHashCount.fetch_sub( + 1, std::memory_order_relaxed); // already counted as a used slot + mainHash->entries[index].value = producer; + break; + } +#endif + if (mainHash->entries[index].key.compare_exchange_strong( + empty, id, std::memory_order_seq_cst, + std::memory_order_relaxed)) { + mainHash->entries[index].value = producer; + break; + } + ++index; + } + return producer; + } + + // Hmm, the old hash is quite full and somebody else is busy allocating a + // new one. We need to wait for the allocating thread to finish (if it + // succeeds, we add, if not, we try to allocate ourselves). + mainHash = implicitProducerHash.load(std::memory_order_acquire); + } + } + #ifdef MOODYCAMEL_CPP11_THREAD_LOCAL_SUPPORTED - void implicit_producer_thread_exited(ImplicitProducer* producer) - { - // Remove from hash + void implicit_producer_thread_exited(ImplicitProducer *producer) { + // Remove from hash #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugLock lock(implicitProdMutex); -#endif - auto hash = implicitProducerHash.load(std::memory_order_acquire); - assert(hash != nullptr); // The thread exit listener is only registered if we were added to a hash in the first place - auto id = details::thread_id(); - auto hashedId = details::hash_thread_id(id); - details::thread_id_t probedKey; - - // We need to traverse all the hashes just in case other threads aren't on the current one yet and are - // trying to add an entry thinking there's a free slot (because they reused a producer) - for (; hash != nullptr; hash = hash->prev) { - auto index = hashedId; - do { - index &= hash->capacity - 1u; - probedKey = id; - if (hash->entries[index].key.compare_exchange_strong(probedKey, details::invalid_thread_id2, std::memory_order_seq_cst, std::memory_order_relaxed)) { - break; - } - ++index; - } while (probedKey != details::invalid_thread_id); // Can happen if the hash has changed but we weren't put back in it yet, or if we weren't added to this hash in the first place - } - - // Mark the queue as being recyclable - producer->inactive.store(true, std::memory_order_release); - } - - static void implicit_producer_thread_exited_callback(void* userData) - { - auto producer = static_cast(userData); - auto queue = producer->parent; - queue->implicit_producer_thread_exited(producer); - } -#endif - - ////////////////////////////////// - // Utility functions - ////////////////////////////////// - - template - static inline void* aligned_malloc(size_t size) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::malloc)(size); - else { - size_t alignment = std::alignment_of::value; - void* raw = (Traits::malloc)(size + alignment - 1 + sizeof(void*)); - if (!raw) - return nullptr; - char* ptr = details::align_for(reinterpret_cast(raw) + sizeof(void*)); - *(reinterpret_cast(ptr) - 1) = raw; - return ptr; - } - } - - template - static inline void aligned_free(void* ptr) - { - MOODYCAMEL_CONSTEXPR_IF (std::alignment_of::value <= std::alignment_of::value) - return (Traits::free)(ptr); - else - (Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); - } - - template - static inline U* create_array(size_t count) - { - assert(count > 0); - U* p = static_cast(aligned_malloc(sizeof(U) * count)); - if (p == nullptr) - return nullptr; - - for (size_t i = 0; i != count; ++i) - new (p + i) U(); - return p; - } - - template - static inline void destroy_array(U* p, size_t count) - { - if (p != nullptr) { - assert(count > 0); - for (size_t i = count; i != 0; ) - (p + --i)->~U(); - } - aligned_free(p); - } - - template - static inline U* create() - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U : nullptr; - } - - template - static inline U* create(A1&& a1) - { - void* p = aligned_malloc(sizeof(U)); - return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; - } - - template - static inline void destroy(U* p) - { - if (p != nullptr) - p->~U(); - aligned_free(p); - } + debug::DebugLock lock(implicitProdMutex); +#endif + auto hash = implicitProducerHash.load(std::memory_order_acquire); + assert(hash != nullptr); // The thread exit listener is only registered if + // we were added to a hash in the first place + auto id = details::thread_id(); + auto hashedId = details::hash_thread_id(id); + details::thread_id_t probedKey; + + // We need to traverse all the hashes just in case other threads aren't on + // the current one yet and are trying to add an entry thinking there's a + // free slot (because they reused a producer) + for (; hash != nullptr; hash = hash->prev) { + auto index = hashedId; + do { + index &= hash->capacity - 1u; + probedKey = id; + if (hash->entries[index].key.compare_exchange_strong( + probedKey, details::invalid_thread_id2, + std::memory_order_seq_cst, std::memory_order_relaxed)) { + break; + } + ++index; + } while (probedKey != + details::invalid_thread_id); // Can happen if the hash has + // changed but we weren't put back + // in it yet, or if we weren't added + // to this hash in the first place + } + + // Mark the queue as being recyclable + producer->inactive.store(true, std::memory_order_release); + } + + static void implicit_producer_thread_exited_callback(void *userData) { + auto producer = static_cast(userData); + auto queue = producer->parent; + queue->implicit_producer_thread_exited(producer); + } +#endif + + ////////////////////////////////// + // Utility functions + ////////////////////////////////// + + template static inline void *aligned_malloc(size_t size) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::malloc)(size); + else { + size_t alignment = std::alignment_of::value; + void *raw = (Traits::malloc)(size + alignment - 1 + sizeof(void *)); + if (!raw) + return nullptr; + char *ptr = details::align_for(reinterpret_cast(raw) + + sizeof(void *)); + *(reinterpret_cast(ptr) - 1) = raw; + return ptr; + } + } + + template static inline void aligned_free(void *ptr) { + MOODYCAMEL_CONSTEXPR_IF(std::alignment_of::value <= + std::alignment_of::value) + return (Traits::free)(ptr); + else(Traits::free)(ptr ? 
*(reinterpret_cast(ptr) - 1) : nullptr); + } + + template static inline U *create_array(size_t count) { + assert(count > 0); + U *p = static_cast(aligned_malloc(sizeof(U) * count)); + if (p == nullptr) + return nullptr; + + for (size_t i = 0; i != count; ++i) + new (p + i) U(); + return p; + } + + template static inline void destroy_array(U *p, size_t count) { + if (p != nullptr) { + assert(count > 0); + for (size_t i = count; i != 0;) + (p + --i)->~U(); + } + aligned_free(p); + } + + template static inline U *create() { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U : nullptr; + } + + template static inline U *create(A1 &&a1) { + void *p = aligned_malloc(sizeof(U)); + return p != nullptr ? new (p) U(std::forward(a1)) : nullptr; + } + + template static inline void destroy(U *p) { + if (p != nullptr) + p->~U(); + aligned_free(p); + } private: - std::atomic producerListTail; - std::atomic producerCount; - - std::atomic initialBlockPoolIndex; - Block* initialBlockPool; - size_t initialBlockPoolSize; - + std::atomic producerListTail; + std::atomic producerCount; + + std::atomic initialBlockPoolIndex; + Block *initialBlockPool; + size_t initialBlockPoolSize; + #ifndef MCDBGQ_USEDEBUGFREELIST - FreeList freeList; + FreeList freeList; #else - debug::DebugFreeList freeList; -#endif - - std::atomic implicitProducerHash; - std::atomic implicitProducerHashCount; // Number of slots logically used - ImplicitProducerHash initialImplicitProducerHash; - std::array initialImplicitProducerHashEntries; - std::atomic_flag implicitProducerHashResizeInProgress; - - std::atomic nextExplicitConsumerId; - std::atomic globalExplicitConsumerOffset; - + debug::DebugFreeList freeList; +#endif + + std::atomic implicitProducerHash; + std::atomic + implicitProducerHashCount; // Number of slots logically used + ImplicitProducerHash initialImplicitProducerHash; + std::array + initialImplicitProducerHashEntries; + std::atomic_flag implicitProducerHashResizeInProgress; + + std::atomic nextExplicitConsumerId; + std::atomic globalExplicitConsumerOffset; + #ifdef MCDBGQ_NOLOCKFREE_IMPLICITPRODHASH - debug::DebugMutex implicitProdMutex; + debug::DebugMutex implicitProdMutex; #endif - + #ifdef MOODYCAMEL_QUEUE_INTERNAL_DEBUG - std::atomic explicitProducers; - std::atomic implicitProducers; + std::atomic explicitProducers; + std::atomic implicitProducers; #endif }; - -template -ProducerToken::ProducerToken(ConcurrentQueue& queue) - : producer(queue.recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } +template +ProducerToken::ProducerToken(ConcurrentQueue &queue) + : producer(queue.recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } } -template -ProducerToken::ProducerToken(BlockingConcurrentQueue& queue) - : producer(reinterpret_cast*>(&queue)->recycle_or_create_producer(true)) -{ - if (producer != nullptr) { - producer->token = this; - } +template +ProducerToken::ProducerToken(BlockingConcurrentQueue &queue) + : producer(reinterpret_cast *>(&queue) + ->recycle_or_create_producer(true)) { + if (producer != nullptr) { + producer->token = this; + } } -template -ConsumerToken::ConsumerToken(ConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); +template +ConsumerToken::ConsumerToken(ConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), 
currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + queue.nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); } -template -ConsumerToken::ConsumerToken(BlockingConcurrentQueue& queue) - : itemsConsumedFromCurrent(0), currentProducer(nullptr), desiredProducer(nullptr) -{ - initialOffset = reinterpret_cast*>(&queue)->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); - lastKnownGlobalOffset = static_cast(-1); +template +ConsumerToken::ConsumerToken(BlockingConcurrentQueue &queue) + : itemsConsumedFromCurrent(0), currentProducer(nullptr), + desiredProducer(nullptr) { + initialOffset = + reinterpret_cast *>(&queue) + ->nextExplicitConsumerId.fetch_add(1, std::memory_order_release); + lastKnownGlobalOffset = static_cast(-1); } -template -inline void swap(ConcurrentQueue& a, ConcurrentQueue& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +template +inline void swap(ConcurrentQueue &a, + ConcurrentQueue &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); } -inline void swap(ProducerToken& a, ProducerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +inline void swap(ProducerToken &a, ProducerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); } -inline void swap(ConsumerToken& a, ConsumerToken& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +inline void swap(ConsumerToken &a, ConsumerToken &b) MOODYCAMEL_NOEXCEPT { + a.swap(b); } -template -inline void swap(typename ConcurrentQueue::ImplicitProducerKVP& a, typename ConcurrentQueue::ImplicitProducerKVP& b) MOODYCAMEL_NOEXCEPT -{ - a.swap(b); +template +inline void swap(typename ConcurrentQueue::ImplicitProducerKVP &a, + typename ConcurrentQueue::ImplicitProducerKVP &b) + MOODYCAMEL_NOEXCEPT { + a.swap(b); } -} +} // namespace moodycamel #if defined(_MSC_VER) && (!defined(_HAS_CXX17) || !_HAS_CXX17) #pragma warning(pop) diff --git a/src/conv-conds.cpp b/src/conv-conds.cpp index 4c9521d..74e4b0a 100644 --- a/src/conv-conds.cpp +++ b/src/conv-conds.cpp @@ -25,11 +25,9 @@ struct ccd_cond_callback { struct ccd_periodic_callback { CcdVoidFn fn; void *arg; - int pe; /* the pe that sets the callback */ + int pe; /* the pe that sets the callback */ - ccd_periodic_callback(CcdVoidFn f, void *a, int p) - : fn{f}, arg{a}, pe{p} - { } + ccd_periodic_callback(CcdVoidFn f, void *a, int p) : fn{f}, arg{a}, pe{p} {} }; /** @@ -165,7 +163,8 @@ struct ccd_heap_elem { double time; ccd_periodic_callback cb; - ccd_heap_elem(double t, CcdVoidFn fn, void *arg, int pe) : time{t}, cb{fn, arg, pe} {} + ccd_heap_elem(double t, CcdVoidFn fn, void *arg, int pe) + : time{t}, cb{fn, arg, pe} {} bool operator>(const ccd_heap_elem &rhs) const { return this->time > rhs.time; @@ -187,7 +186,8 @@ int CcdNumTimerCBs(void) { return ccd_heap.size() + _ccd_num_timed_cond_cbs; } /** * Insert a new callback into the heap */ -static inline void ccd_heap_insert(double t, CcdVoidFn fnp, void *arg, int pe = CcdIGNOREPE) { +static inline void ccd_heap_insert(double t, CcdVoidFn fnp, void *arg, + int pe = CcdIGNOREPE) { auto &h = ccd_heap; h.emplace(t, fnp, arg, pe); } @@ -259,12 +259,11 @@ void CcdCancelCallOnConditionKeep(int condnum, int idx) { * Register a callback function that will be triggered on the specified PE * after a minimum delay of deltaT */ -void CcdCallFnAfterOnPE(CcdVoidFn fnp, void *arg, double deltaT, int pe) -{ - double ctime = CmiWallTimer(); - double tcall = ctime + deltaT * (1.0/1000.0); - ccd_heap_insert(tcall, fnp, arg, pe); -} +void CcdCallFnAfterOnPE(CcdVoidFn fnp, void *arg, double deltaT, int pe) { + double ctime = 
CmiWallTimer(); + double tcall = ctime + deltaT * (1.0 / 1000.0); + ccd_heap_insert(tcall, fnp, arg, pe); +} /** * Register a callback function that will be triggered after a minimum diff --git a/src/conv-rdma.cpp b/src/conv-rdma.cpp index 4cc2694..fbb38ae 100644 --- a/src/conv-rdma.cpp +++ b/src/conv-rdma.cpp @@ -146,8 +146,8 @@ void CmiIssueRputCopyBased(NcpyOperationInfo *ncpyOpInfo) { // Invoke the source ack ncpyOpInfo->ackMode = CMK_SRC_ACK; // only invoke the source ack - // We need to ensure consistent behavior no matter what ncpyDirectAckHandlerFn actually does - // so we cannot rely on the charm layer to free the ncpyOpInfo + // We need to ensure consistent behavior no matter what ncpyDirectAckHandlerFn + // actually does so we cannot rely on the charm layer to free the ncpyOpInfo auto realFreeMe = ncpyOpInfo->freeMe; ncpyOpInfo->freeMe = CMK_DONT_FREE_NCPYOPINFO; ncpyDirectAckHandlerFn(ncpyOpInfo); @@ -367,7 +367,7 @@ void CommRputLocalHandler(comm_backend::Status status) { ncpyOpInfo->freeMe = CMK_DONT_FREE_NCPYOPINFO; ncpyDirectAckHandlerFn(ncpyOpInfo); if (realFreeMe == CMK_FREE_NCPYOPINFO) - CmiFree(ncpyOpInfo); + CmiFree(ncpyOpInfo); } // Invoked by the local completion of the Rget operation @@ -384,12 +384,12 @@ void CommRgetLocalHandler(comm_backend::Status status) { /* Perform an RDMA Get operation into the local destination address from the * remote source address*/ void CmiIssueRget(NcpyOperationInfo *ncpyOpInfo) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// // Use network RDMA for a PE on a remote host -// LrtsIssueRget(ncpyOpInfo); -// #else -// CmiIssueRgetCopyBased(ncpyOpInfo); -// #endif + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // // Use network RDMA for a PE on a remote host + // LrtsIssueRget(ncpyOpInfo); + // #else + // CmiIssueRgetCopyBased(ncpyOpInfo); + // #endif int target_node = CmiNodeOf(ncpyOpInfo->srcPe); if (target_node == CmiMyNode()) { // loopback messages @@ -415,39 +415,40 @@ void CmiIssueRget(NcpyOperationInfo *ncpyOpInfo) { /* Perform an RDMA Put operation into the remote destination address from the * local source address */ void CmiIssueRput(NcpyOperationInfo *ncpyOpInfo) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// // Use network RDMA for a PE on a remote host -// LrtsIssueRput(ncpyOpInfo); -// #else -// CmiIssueRputCopyBased(ncpyOpInfo); -// #endif -int target_node = CmiNodeOf(ncpyOpInfo->destPe); -if (target_node == CmiMyNode()) { - // loopback messages - memcpy((void *)ncpyOpInfo->destPtr, ncpyOpInfo->srcPtr, ncpyOpInfo->srcSize); - comm_backend::Status status; - status.local_buf = ncpyOpInfo->srcPtr; - status.size = ncpyOpInfo->srcSize; - status.user_context = ncpyOpInfo; - CommRputLocalHandler(status); -} else if (!CmiUseCopyBasedRDMA) { - auto mr = *(comm_backend::mr_t *)ncpyOpInfo->srcLayerInfo; - void *rmr = ncpyOpInfo->destLayerInfo + sizeof(comm_backend::mr_t); - // FIXME: we assume the offset to the registered base address is 0 here - comm_backend::issueRput(CmiNodeOf(ncpyOpInfo->destPe), ncpyOpInfo->srcPtr, - ncpyOpInfo->srcSize, mr, 0, rmr, CommRputLocalHandler, - ncpyOpInfo); -} else { - CmiIssueRputCopyBased(ncpyOpInfo); -} + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // // Use network RDMA for a PE on a remote host + // LrtsIssueRput(ncpyOpInfo); + // #else + // CmiIssueRputCopyBased(ncpyOpInfo); + // #endif + int target_node = CmiNodeOf(ncpyOpInfo->destPe); + if (target_node == CmiMyNode()) { + // loopback messages + memcpy((void *)ncpyOpInfo->destPtr, ncpyOpInfo->srcPtr, + ncpyOpInfo->srcSize); + comm_backend::Status 
status; + status.local_buf = ncpyOpInfo->srcPtr; + status.size = ncpyOpInfo->srcSize; + status.user_context = ncpyOpInfo; + CommRputLocalHandler(status); + } else if (!CmiUseCopyBasedRDMA) { + auto mr = *(comm_backend::mr_t *)ncpyOpInfo->srcLayerInfo; + void *rmr = ncpyOpInfo->destLayerInfo + sizeof(comm_backend::mr_t); + // FIXME: we assume the offset to the registered base address is 0 here + comm_backend::issueRput(CmiNodeOf(ncpyOpInfo->destPe), ncpyOpInfo->srcPtr, + ncpyOpInfo->srcSize, mr, 0, rmr, + CommRputLocalHandler, ncpyOpInfo); + } else { + CmiIssueRputCopyBased(ncpyOpInfo); + } } /* De-register registered memory for pointer */ void CmiDeregisterMem(const void *ptr, void *info, int pe, unsigned short int mode) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// LrtsDeregisterMem(ptr, info, pe, mode); -// #endif + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // LrtsDeregisterMem(ptr, info, pe, mode); + // #endif if (!CmiUseCopyBasedRDMA) { comm_backend::deregisterMemory(*(comm_backend::mr_t *)info); } @@ -455,18 +456,19 @@ void CmiDeregisterMem(const void *ptr, void *info, int pe, // FIXME: This really should be implemented in the charm layer void CmiInvokeRemoteDeregAckHandler(int pe, NcpyOperationInfo *ncpyOpInfo) { -// #if CMK_USE_LRTS && CMK_ONESIDED_IMPL -// LrtsInvokeRemoteDeregAckHandler(pe, ncpyOpInfo); -// #endif - if(ncpyOpInfo->opMode == CMK_BCAST_EM_API) + // #if CMK_USE_LRTS && CMK_ONESIDED_IMPL + // LrtsInvokeRemoteDeregAckHandler(pe, ncpyOpInfo); + // #endif + if (ncpyOpInfo->opMode == CMK_BCAST_EM_API) return; bool freeInfo; - if(ncpyOpInfo->opMode == CMK_DIRECT_API) { + if (ncpyOpInfo->opMode == CMK_DIRECT_API) { freeInfo = true; - } else if(ncpyOpInfo->opMode == CMK_EM_API) { + } else if (ncpyOpInfo->opMode == CMK_EM_API) { freeInfo = false; } else { - CmiAbort("CmiInvokeRemoteDeregAckHandler: ncpyOpInfo->opMode is not valid for dereg\n"); + CmiAbort("CmiInvokeRemoteDeregAckHandler: ncpyOpInfo->opMode is not valid " + "for dereg\n"); } int ncpyOpInfoSize = ncpyOpInfo->ncpyOpInfoSize; @@ -479,8 +481,8 @@ void CmiInvokeRemoteDeregAckHandler(int pe, NcpyOperationInfo *ncpyOpInfo) { ncpyOpInfoSize); CmiSetHandler(remoteDeregMsg, remote_dereg_handler_idx); - CmiSyncSendAndFree(pe, - sizeof(ConverseRdmaMsg) + ncpyOpInfoSize, remoteDeregMsg); + CmiSyncSendAndFree(pe, sizeof(ConverseRdmaMsg) + ncpyOpInfoSize, + remoteDeregMsg); // free original ncpyOpinfo if (freeInfo) @@ -504,7 +506,8 @@ void CmiSetRdmaBufferInfo(void *info, const void *ptr, int size, info = (char *)info + sizeof(mr); size_t info_size_left = CMK_NOCOPY_DIRECT_BYTES - sizeof(mr); size_t rmr_size = comm_backend::getRMR(mr, info, info_size_left); - CmiAssertMsg(rmr_size <= info_size_left, "CMK_NOCOPY_DIRECT_BYTES is too small"); + CmiAssertMsg(rmr_size <= info_size_left, + "CMK_NOCOPY_DIRECT_BYTES is too small"); } } diff --git a/src/conv-topology.cpp b/src/conv-topology.cpp index be577fa..add33f3 100644 --- a/src/conv-topology.cpp +++ b/src/conv-topology.cpp @@ -3,12 +3,11 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE #endif -#include -#include /* for sockaddr_in */ -#include /* for getifaddrs */ -#include /* for IFF_RUNNING */ #include - +#include /* for getifaddrs */ +#include /* for IFF_RUNNING */ +#include /* for sockaddr_in */ +#include #include #include @@ -35,59 +34,59 @@ #if 1 -# include -# include -# include #include #include - - +#include +#include +#include void CmiInitMemAffinity(char **argv) { - char *tmpstr = NULL; - int maffinity_flag = CmiGetArgFlagDesc(argv,"+maffinity", - "memory affinity"); - 
if (maffinity_flag && CmiMyPe()==0) - CmiPrintf("memory affinity is not supported, +maffinity flag disabled.\n"); - - /* consume the remaining possible arguments */ - CmiGetArgStringDesc(argv, "+memnodemap", &tmpstr, "define memory node mapping"); - CmiGetArgStringDesc(argv, "+mempol", &tmpstr, "define memory policy {bind, preferred or interleave} "); + char *tmpstr = NULL; + int maffinity_flag = CmiGetArgFlagDesc(argv, "+maffinity", "memory affinity"); + if (maffinity_flag && CmiMyPe() == 0) + CmiPrintf("memory affinity is not supported, +maffinity flag disabled.\n"); + + /* consume the remaining possible arguments */ + CmiGetArgStringDesc(argv, "+memnodemap", &tmpstr, + "define memory node mapping"); + CmiGetArgStringDesc(argv, "+mempol", &tmpstr, + "define memory policy {bind, preferred or interleave} "); } +skt_ip_t _skt_invalid_ip = {{0}}; - -skt_ip_t _skt_invalid_ip={{0}}; - -skt_ip_t skt_my_ip(void) -{ +skt_ip_t skt_my_ip(void) { char hostname[1000]; skt_ip_t ip = _skt_invalid_ip; int ifcount = 0; - /* Code snippet from Jens Alfke - * http://lists.apple.com/archives/macnetworkprog/2008/May/msg00013.html */ - struct ifaddrs *ifaces=0; - if( getifaddrs(&ifaces) == 0 ) { - struct ifaddrs *iface; - for( iface=ifaces; iface; iface=iface->ifa_next ) { - if( (iface->ifa_flags & IFF_UP) && ! (iface->ifa_flags & IFF_LOOPBACK) ) { - const struct sockaddr_in *addr = (const struct sockaddr_in*)iface->ifa_addr; - if( addr && addr->sin_family==AF_INET ) { - ifcount ++; - if ( ifcount==1 ) memcpy(&ip, &addr->sin_addr, sizeof(ip)); - } - } + /* Code snippet from Jens Alfke + * http://lists.apple.com/archives/macnetworkprog/2008/May/msg00013.html + */ + struct ifaddrs *ifaces = 0; + if (getifaddrs(&ifaces) == 0) { + struct ifaddrs *iface; + for (iface = ifaces; iface; iface = iface->ifa_next) { + if ((iface->ifa_flags & IFF_UP) && !(iface->ifa_flags & IFF_LOOPBACK)) { + const struct sockaddr_in *addr = + (const struct sockaddr_in *)iface->ifa_addr; + if (addr && addr->sin_family == AF_INET) { + ifcount++; + if (ifcount == 1) + memcpy(&ip, &addr->sin_addr, sizeof(ip)); } - freeifaddrs(ifaces); + } + } + freeifaddrs(ifaces); } - /* fprintf(stderr, "My IP is %d.%d.%d.%d\n", ip.data[0],ip.data[1],ip.data[2],ip.data[3]); */ - if (ifcount==1) return ip; + /* fprintf(stderr, "My IP is %d.%d.%d.%d\n", + * ip.data[0],ip.data[1],ip.data[2],ip.data[3]); */ + if (ifcount == 1) + return ip; return _skt_invalid_ip; } -struct _procInfo -{ +struct _procInfo { skt_ip_t ip; int pe; int ncores; @@ -95,40 +94,35 @@ struct _procInfo int nodeID; }; -typedef struct _hostnameMsg -{ +typedef struct _hostnameMsg { char core[CmiMsgHeaderSizeBytes]; int n; - _procInfo* procs; + _procInfo *procs; } hostnameMsg; -typedef struct _nodeTopoMsg -{ +typedef struct _nodeTopoMsg { char core[CmiMsgHeaderSizeBytes]; - int* nodes; + int *nodes; } nodeTopoMsg; // nodeIDs[pe] is the node number of processor pe -class CpuTopology -{ +class CpuTopology { public: - static int* nodeIDs; + static int *nodeIDs; static int numPes; static int numNodes; - static std::vector* bynodes; + static std::vector *bynodes; static int supported; - ~CpuTopology() - { + ~CpuTopology() { auto n = bynodes; bynodes = nullptr; delete[] n; } // return -1 when not supported - int numUniqNodes() - { -# if 0 + int numUniqNodes() { +#if 0 if (numNodes != 0) return numNodes; int n = 0; for (int i=0; i 0) - return numNodes; // already calculated + return numNodes; // already calculated std::vector unodes(numPes); int i; - for (i = 0; i < numPes; i++) unodes[i] = 
nodeIDs[i]; + for (i = 0; i < numPes; i++) + unodes[i] = nodeIDs[i]; std::sort(unodes.begin(), unodes.end()); int last = -1; - std::map nodemap; // nodeIDs can be out of range of [0,numNodes] - for (i = 0; i < numPes; i++) - { - if (unodes[i] != last) - { + std::map nodemap; // nodeIDs can be out of range of [0,numNodes] + for (i = 0; i < numPes; i++) { + if (unodes[i] != last) { last = unodes[i]; nodemap[unodes[i]] = numNodes; numNodes++; } } - if (numNodes == 0) - { + if (numNodes == 0) { numNodes = CmiNumNodes(); numPes = CmiNumPes(); - } - else - { + } else { // re-number nodeIDs, which may be necessary e.g. on BlueGene/P - for (i = 0; i < numPes; i++) nodeIDs[i] = nodemap[nodeIDs[i]]; + for (i = 0; i < numPes; i++) + nodeIDs[i] = nodemap[nodeIDs[i]]; CpuTopology::supported = 1; } return numNodes; -# endif +#endif } - void sort() - { + void sort() { int i; numUniqNodes(); bynodes = new std::vector[numNodes]; - if (supported) - { - for (i = 0; i < numPes; i++) - { - CmiAssert(nodeIDs[i] >= 0 && - nodeIDs[i] <= - numNodes); // Sanity check for bug that occurs on mpi-crayxt + if (supported) { + for (i = 0; i < numPes; i++) { + CmiAssert( + nodeIDs[i] >= 0 && + nodeIDs[i] <= + numNodes); // Sanity check for bug that occurs on mpi-crayxt bynodes[nodeIDs[i]].push_back(i); } - } - else - { /* not supported/enabled */ - for (i = 0; i < CmiNumPes(); i++) bynodes[CmiNodeOf(i)].push_back(i); + } else { /* not supported/enabled */ + for (i = 0; i < CmiNumPes(); i++) + bynodes[CmiNodeOf(i)].push_back(i); } } - void print() - { + void print() { int i; CmiPrintf("Charm++> Cpu topology info:\n"); CmiPrintf("PE to node map: "); - for (i = 0; i < CmiNumPes(); i++) CmiPrintf("%d ", nodeIDs[i]); + for (i = 0; i < CmiNumPes(); i++) + CmiPrintf("%d ", nodeIDs[i]); CmiPrintf("\n"); CmiPrintf("Node to PE map:\n"); - for (i = 0; i < numNodes; i++) - { + for (i = 0; i < numNodes; i++) { CmiPrintf("Chip #%d: ", i); - for (int j = 0; j < bynodes[i].size(); j++) CmiPrintf("%d ", bynodes[i][j]); + for (int j = 0; j < bynodes[i].size(); j++) + CmiPrintf("%d ", bynodes[i][j]); CmiPrintf("\n"); } } }; -int* CpuTopology::nodeIDs = NULL; +int *CpuTopology::nodeIDs = NULL; int CpuTopology::numPes = 0; int CpuTopology::numNodes = 0; -std::vector* CpuTopology::bynodes = NULL; +std::vector *CpuTopology::bynodes = NULL; int CpuTopology::supported = 0; -namespace CpuTopoDetails -{ +namespace CpuTopoDetails { -static nodeTopoMsg* topomsg = NULL; -static std::map hostTable; +static nodeTopoMsg *topomsg = NULL; +static std::map hostTable; static int cpuTopoHandlerIdx; static int cpuTopoRecvHandlerIdx; @@ -226,7 +213,7 @@ static CpuTopology cpuTopo; static int done = 0; static int _noip = 0; -} // namespace CpuTopoDetails +} // namespace CpuTopoDetails using namespace CpuTopoDetails; @@ -236,75 +223,72 @@ using namespace CpuTopoDetails; // const int ways = CmiHwlocTopologyLocal.num_pus; // if (ways > 1) // CmiPrintf( -// "Charm++> Running on %d hosts (%d sockets x %d cores x %d PUs = %d-way SMP)\n", -// numNodes, CmiHwlocTopologyLocal.num_sockets, +// "Charm++> Running on %d hosts (%d sockets x %d cores x %d PUs = +// %d-way SMP)\n", numNodes, CmiHwlocTopologyLocal.num_sockets, // CmiHwlocTopologyLocal.num_cores / CmiHwlocTopologyLocal.num_sockets, -// CmiHwlocTopologyLocal.num_pus / CmiHwlocTopologyLocal.num_cores, ways); +// CmiHwlocTopologyLocal.num_pus / CmiHwlocTopologyLocal.num_cores, +// ways); // else // CmiPrintf("Charm++> Running on %d hosts\n", numNodes); // } static std::atomic cpuTopoSyncHandlerDone{}; -# if CMK_SMP 
&& !CMK_SMP_NO_COMMTHD +#if CMK_SMP && !CMK_SMP_NO_COMMTHD extern void CommunicationServerThread(int sleepTime); static std::atomic cpuTopoSyncCommThreadDone{}; -# endif +#endif -# if CMK_SMP && !CMK_SMP_NO_COMMTHD -static void cpuTopoSyncWaitCommThread(std::atomic& done) -{ - do CommunicationServerThread(5); +#if CMK_SMP && !CMK_SMP_NO_COMMTHD +static void cpuTopoSyncWaitCommThread(std::atomic &done) { + do + CommunicationServerThread(5); while (!done.load()); CommunicationServerThread(5); } -# endif +#endif -static void cpuTopoSyncWait(std::atomic& done) -{ - do CsdSchedulePoll(); +static void cpuTopoSyncWait(std::atomic &done) { + do + CsdSchedulePoll(); while (!done.load()); CsdSchedulePoll(); } /* called on PE 0 */ -static void cpuTopoHandler(void* m) -{ - _procInfo* rec; - hostnameMsg* msg = (hostnameMsg*)m; +static void cpuTopoHandler(void *m) { + _procInfo *rec; + hostnameMsg *msg = (hostnameMsg *)m; int pe; - if (topomsg == NULL) - { + if (topomsg == NULL) { int i; - topomsg = (nodeTopoMsg*)CmiAlloc(sizeof(nodeTopoMsg) + CmiNumPes() * sizeof(int)); - CmiSetHandler((char*)topomsg, cpuTopoRecvHandlerIdx); - topomsg->nodes = (int*)((char*)topomsg + sizeof(nodeTopoMsg)); - for (i = 0; i < CmiNumPes(); i++) topomsg->nodes[i] = -1; + topomsg = (nodeTopoMsg *)CmiAlloc(sizeof(nodeTopoMsg) + + CmiNumPes() * sizeof(int)); + CmiSetHandler((char *)topomsg, cpuTopoRecvHandlerIdx); + topomsg->nodes = (int *)((char *)topomsg + sizeof(nodeTopoMsg)); + for (i = 0; i < CmiNumPes(); i++) + topomsg->nodes[i] = -1; } CmiAssert(topomsg != NULL); - msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg)); + msg->procs = (_procInfo *)((char *)msg + sizeof(hostnameMsg)); CmiAssert(msg->n == CmiNumPes()); - for (int i = 0; i < msg->n; i++) - { - _procInfo* proc = msg->procs + i; + for (int i = 0; i < msg->n; i++) { + _procInfo *proc = msg->procs + i; /* for debug skt_print_ip(str, msg->ip); printf("hostname: %d %s\n", msg->pe, str); */ - skt_ip_t& ip = proc->ip; + skt_ip_t &ip = proc->ip; pe = proc->pe; auto iter = hostTable.find(ip); - if (iter != hostTable.end()) - { + if (iter != hostTable.end()) { rec = iter->second; - } - else - { - proc->nodeID = pe; // we will compact the node ID later + } else { + proc->nodeID = pe; // we will compact the node ID later rec = proc; hostTable.emplace(ip, proc); } @@ -321,17 +305,14 @@ static void cpuTopoHandler(void* m) } /* called on each processor */ -static void cpuTopoRecvHandler(void* msg) -{ - nodeTopoMsg* m = (nodeTopoMsg*)msg; - m->nodes = (int*)((char*)m + sizeof(nodeTopoMsg)); +static void cpuTopoRecvHandler(void *msg) { + nodeTopoMsg *m = (nodeTopoMsg *)msg; + m->nodes = (int *)((char *)m + sizeof(nodeTopoMsg)); - if (cpuTopo.nodeIDs == NULL) - { + if (cpuTopo.nodeIDs == NULL) { cpuTopo.nodeIDs = m->nodes; cpuTopo.sort(); - } - else + } else CmiFree(m); done++; @@ -341,28 +322,29 @@ static void cpuTopoRecvHandler(void* msg) } // reduction function -static void* combineMessage(int* size, void* data, void** remote, int count) -{ +static void *combineMessage(int *size, void *data, void **remote, int count) { int i, j; - int nprocs = ((hostnameMsg*)data)->n; + int nprocs = ((hostnameMsg *)data)->n; if (count == 0) return data; - for (i = 0; i < count; i++) nprocs += ((hostnameMsg*)remote[i])->n; + for (i = 0; i < count; i++) + nprocs += ((hostnameMsg *)remote[i])->n; *size = sizeof(hostnameMsg) + sizeof(_procInfo) * nprocs; - hostnameMsg* msg = (hostnameMsg*)CmiAlloc(*size); - msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg)); + hostnameMsg *msg = 
(hostnameMsg *)CmiAlloc(*size); + msg->procs = (_procInfo *)((char *)msg + sizeof(hostnameMsg)); msg->n = nprocs; - CmiSetHandler((char*)msg, cpuTopoHandlerIdx); + CmiSetHandler((char *)msg, cpuTopoHandlerIdx); int n = 0; - hostnameMsg* m = (hostnameMsg*)data; - m->procs = (_procInfo*)((char*)m + sizeof(hostnameMsg)); - for (j = 0; j < m->n; j++) msg->procs[n++] = m->procs[j]; - for (i = 0; i < count; i++) - { - m = (hostnameMsg*)remote[i]; - m->procs = (_procInfo*)((char*)m + sizeof(hostnameMsg)); - for (j = 0; j < m->n; j++) msg->procs[n++] = m->procs[j]; + hostnameMsg *m = (hostnameMsg *)data; + m->procs = (_procInfo *)((char *)m + sizeof(hostnameMsg)); + for (j = 0; j < m->n; j++) + msg->procs[n++] = m->procs[j]; + for (i = 0; i < count; i++) { + m = (hostnameMsg *)remote[i]; + m->procs = (_procInfo *)((char *)m + sizeof(hostnameMsg)); + for (j = 0; j < m->n; j++) + msg->procs[n++] = m->procs[j]; } return msg; } @@ -371,9 +353,8 @@ static void* combineMessage(int* size, void* data, void** remote, int count) int LrtsCpuTopoEnabled() { return CpuTopology::supported; } -int LrtsPeOnSameNode(int pe1, int pe2) -{ - int* nodeIDs = cpuTopo.nodeIDs; +int LrtsPeOnSameNode(int pe1, int pe2) { + int *nodeIDs = cpuTopo.nodeIDs; if (!cpuTopo.supported || nodeIDs == NULL) return CmiNodeOf(pe1) == CmiNodeOf(pe2); else @@ -381,66 +362,62 @@ int LrtsPeOnSameNode(int pe1, int pe2) } // return -1 when not supported -int LrtsNumNodes() -{ +int LrtsNumNodes() { if (!cpuTopo.supported) return CmiNumNodes(); else return cpuTopo.numUniqNodes(); } -int LrtsNodeSize(int node) -{ - return !cpuTopo.supported ? CmiNodeSize(node) : (int)cpuTopo.bynodes[node].size(); +int LrtsNodeSize(int node) { + return !cpuTopo.supported ? CmiNodeSize(node) + : (int)cpuTopo.bynodes[node].size(); } // pelist points to system memory, user should not free it -void LrtsPeOnNode(int node, int** pelist, int* num) -{ +void LrtsPeOnNode(int node, int **pelist, int *num) { *num = cpuTopo.bynodes[node].size(); if (pelist != NULL && *num > 0) *pelist = cpuTopo.bynodes[node].data(); } -int LrtsRankOf(int pe) -{ +int LrtsRankOf(int pe) { if (!cpuTopo.supported) return CmiRankOf(pe); - const std::vector& v = cpuTopo.bynodes[cpuTopo.nodeIDs[pe]]; + const std::vector &v = cpuTopo.bynodes[cpuTopo.nodeIDs[pe]]; int rank = 0; int npes = v.size(); - while (rank < npes && v[rank] < pe) rank++; // already sorted + while (rank < npes && v[rank] < pe) + rank++; // already sorted CmiAssert(v[rank] == pe); return rank; } -int LrtsNodeOf(int pe) -{ +int LrtsNodeOf(int pe) { if (!cpuTopo.supported) return CmiNodeOf(pe); return cpuTopo.nodeIDs[pe]; } // the least number processor on the same physical node -int LrtsNodeFirst(int node) -{ +int LrtsNodeFirst(int node) { if (!cpuTopo.supported) return CmiNodeFirst(node); return cpuTopo.bynodes[node][0]; } -void LrtsInitCpuTopo(char** argv) -{ +void LrtsInitCpuTopo(char **argv) { static skt_ip_t myip; double startT; - int obtain_flag = 1; // default on - int show_flag = 0; // default not show topology + int obtain_flag = 1; // default on + int show_flag = 0; // default not show topology -# if __FAULT__ +#if __FAULT__ obtain_flag = 0; -# endif - if (CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", "obtain cpu topology info")) +#endif + if (CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", + "obtain cpu topology info")) obtain_flag = 1; if (CmiGetArgFlagDesc(argv, "+skip_cpu_topology", "skip the processof getting cpu topology info")) @@ -448,40 +425,37 @@ void LrtsInitCpuTopo(char** argv) if (CmiGetArgFlagDesc(argv, 
"+show_cpu_topology", "Show cpu topology info")) show_flag = 1; - CmiAssignOnce(&cpuTopoHandlerIdx, CmiRegisterHandler((CmiHandler)cpuTopoHandler)); + CmiAssignOnce(&cpuTopoHandlerIdx, + CmiRegisterHandler((CmiHandler)cpuTopoHandler)); CmiAssignOnce(&cpuTopoRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuTopoRecvHandler)); - if (!obtain_flag) - { + if (!obtain_flag) { if (CmiMyRank() == 0) cpuTopo.sort(); CmiNodeAllBarrier(); - CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks + CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks return; } - if (CmiMyPe() == 0) - { + if (CmiMyPe() == 0) { startT = CmiWallTimer(); } -# if 0 +#if 0 if (gethostname(hostname, 999)!=0) { strcpy(hostname, ""); } -# endif -# if CMK_CRAYXE || CMK_CRAYXC || CMK_CRAYEX - if (CmiMyRank() == 0) - { +#endif +#if CMK_CRAYXE || CMK_CRAYXC || CMK_CRAYEX + if (CmiMyRank() == 0) { int numPes = cpuTopo.numPes = CmiNumPes(); int numNodes = CmiNumNodes(); cpuTopo.nodeIDs = new int[numPes]; CpuTopology::supported = 1; int nid; - for (int i = 0; i < numPes; i++) - { + for (int i = 0; i < numPes; i++) { nid = getXTNodeID(CmiNodeOf(i), numNodes); cpuTopo.nodeIDs[i] = nid; } @@ -490,14 +464,11 @@ void LrtsInitCpuTopo(char** argv) // this assumes that all cores on a node have consecutive MPI rank IDs // and then changes nodeIDs to 0 to numNodes-1 - for (int i = 0; i < numPes; i++) - { - if (cpuTopo.nodeIDs[i] != prev) - { + for (int i = 0; i < numPes; i++) { + if (cpuTopo.nodeIDs[i] != prev) { prev = cpuTopo.nodeIDs[i]; cpuTopo.nodeIDs[i] = ++nid; - } - else + } else cpuTopo.nodeIDs[i] = nid; } cpuTopo.sort(); @@ -506,11 +477,10 @@ void LrtsInitCpuTopo(char** argv) } CmiNodeAllBarrier(); -# else +#else /* get my ip address */ - if (CmiMyRank() == 0) - { + if (CmiMyRank() == 0) { myip = skt_my_ip(); /* not thread safe, so only calls on rank 0 */ // fprintf(stderr, "[%d] IP is %d.%d.%d.%d\n", CmiMyPe(), // myip.data[0],myip.data[1],myip.data[2],myip.data[3]); @@ -518,27 +488,25 @@ void LrtsInitCpuTopo(char** argv) } CmiNodeAllBarrier(); - if (_noip) - { + if (_noip) { if (CmiMyRank() == 0) cpuTopo.sort(); - CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks + CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks return; } -# if CMK_SMP && !CMK_SMP_NO_COMMTHD - if (CmiInCommThread()) - { +#if CMK_SMP && !CMK_SMP_NO_COMMTHD + if (CmiInCommThread()) { cpuTopoSyncWaitCommThread(cpuTopoSyncCommThreadDone); - } - else -# endif + } else +#endif { /* prepare a msg to send */ - hostnameMsg* msg = (hostnameMsg*)CmiAlloc(sizeof(hostnameMsg) + sizeof(_procInfo)); + hostnameMsg *msg = + (hostnameMsg *)CmiAlloc(sizeof(hostnameMsg) + sizeof(_procInfo)); msg->n = 1; - msg->procs = (_procInfo*)((char*)msg + sizeof(hostnameMsg)); - CmiSetHandler((char*)msg, cpuTopoHandlerIdx); + msg->procs = (_procInfo *)((char *)msg + sizeof(hostnameMsg)); + CmiSetHandler((char *)msg, cpuTopoHandlerIdx); auto proc = &msg->procs[0]; proc->pe = CmiMyPe(); proc->ip = myip; @@ -550,44 +518,40 @@ void LrtsInitCpuTopo(char** argv) cpuTopoSyncWait(cpuTopoSyncHandlerDone); - if (CmiMyRank() == 0) - { - if (CmiMyPe() == 0) - { - CmiSyncNodeBroadcastAllAndFree(sizeof(nodeTopoMsg) + CmiNumPes() * sizeof(int), - (char*)topomsg); + if (CmiMyRank() == 0) { + if (CmiMyPe() == 0) { + CmiSyncNodeBroadcastAllAndFree( + sizeof(nodeTopoMsg) + CmiNumPes() * sizeof(int), (char *)topomsg); CsdSchedulePoll(); } -# if CMK_SMP && !CMK_SMP_NO_COMMTHD +#if CMK_SMP && !CMK_SMP_NO_COMMTHD cpuTopoSyncCommThreadDone = true; -# endif +#endif } } CmiBarrier(); - if (CmiMyPe() == 0) 
- { + if (CmiMyPe() == 0) { CmiPrintf("Charm++> cpu topology info is gathered in %.3f seconds.\n", CmiWallTimer() - startT); } -# endif +#endif // now every one should have the node info - CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks + CcdRaiseCondition(CcdTOPOLOGY_AVAIL); // call callbacks if (CmiMyPe() == 0 && show_flag) cpuTopo.print(); } #else /* not supporting cpu topology */ -extern "C" void LrtsInitCpuTopo(char** argv) -{ +extern "C" void LrtsInitCpuTopo(char **argv) { /* do nothing */ - int obtain_flag = - CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", "obtain cpu topology info"); + int obtain_flag = CmiGetArgFlagDesc(argv, "+obtain_cpu_topology", + "obtain cpu topology info"); CmiGetArgFlagDesc(argv, "+skip_cpu_topology", "skip the processof getting cpu topology info"); CmiGetArgFlagDesc(argv, "+show_cpu_topology", "Show cpu topology info"); @@ -596,14 +560,15 @@ extern "C" void LrtsInitCpuTopo(char** argv) #endif int CmiCpuTopologyEnabled() { return LrtsCpuTopoEnabled(); } -int CmiPeOnSamePhysicalNode(int pe1, int pe2) { return LrtsPeOnSameNode(pe1, pe2); } +int CmiPeOnSamePhysicalNode(int pe1, int pe2) { + return LrtsPeOnSameNode(pe1, pe2); +} int CmiNumPhysicalNodes() { return LrtsNumNodes(); } int CmiNumPesOnPhysicalNode(int node) { return LrtsNodeSize(node); } -void CmiGetPesOnPhysicalNode(int node, int** pelist, int* num) -{ +void CmiGetPesOnPhysicalNode(int node, int **pelist, int *num) { LrtsPeOnNode(node, pelist, num); } int CmiPhysicalRank(int pe) { return LrtsRankOf(pe); } int CmiPhysicalNodeID(int pe) { return LrtsNodeOf(pe); } int CmiGetFirstPeOnPhysicalNode(int node) { return LrtsNodeFirst(node); } -void CmiInitCPUTopology(char** argv) { LrtsInitCpuTopo(argv); } +void CmiInitCPUTopology(char **argv) { LrtsInitCpuTopo(argv); } diff --git a/src/conv-topology.h b/src/conv-topology.h index 6091937..12ac3c4 100644 --- a/src/conv-topology.h +++ b/src/conv-topology.h @@ -1,12 +1,11 @@ #include "converse.h" #include -typedef struct {/*IPv4 IP address*/ +typedef struct { /*IPv4 IP address*/ unsigned char data[4]; } skt_ip_t; extern skt_ip_t _skt_invalid_ip; -static inline bool operator< (const skt_ip_t & a, const skt_ip_t & b) -{ +static inline bool operator<(const skt_ip_t &a, const skt_ip_t &b) { return memcmp(&a, &b, sizeof(a)) < 0; } \ No newline at end of file diff --git a/src/convcore.cpp b/src/convcore.cpp index a040708..70f2f5b 100644 --- a/src/convcore.cpp +++ b/src/convcore.cpp @@ -10,9 +10,9 @@ #include #include #include +#include #include #include -#include // GLOBALS static char **Cmi_argv; @@ -47,7 +47,7 @@ int _replaySystem = 0; CmiNodeLock CmiMemLock_lock; CpvDeclare(int, isHelperOn); -//partition +// partition PartitionInfo _partitionInfo; int _Cmi_mype_global; int _Cmi_numpes_global; @@ -77,12 +77,10 @@ int Cmi_exitHandler; comm_backend::AmHandler g_amHandler; CpvStaticDeclare(double, clocktick); -CpvStaticDeclare(int,inittime_wallclock); -CpvStaticDeclare(int,inittime_virtual); +CpvStaticDeclare(int, inittime_wallclock); +CpvStaticDeclare(int, inittime_virtual); -void registerTraceInit(void (*fn)(char **argv)) { - CmiTraceFn = fn; -} +void registerTraceInit(void (*fn)(char **argv)) { CmiTraceFn = fn; } void CommLocalHandler(comm_backend::Status status) { CmiFree(const_cast(status.local_buf)); @@ -236,7 +234,7 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, #ifdef CMK_HAS_PARTITION CmiCreatePartitions(argv); - #else +#else _partitionInfo.type = PARTITION_SINGLETON; _partitionInfo.numPartitions = 1; 
_partitionInfo.myPartition = 0; @@ -244,7 +242,7 @@ void ConverseInit(int argc, char **argv, CmiStartFn fn, int usched, _Cmi_mynode_global = Cmi_mynode; _Cmi_numpes_global = Cmi_npes; Cmi_nodestartGlobal = _Cmi_mynode_global * Cmi_mynodesize; - #endif +#endif CmiStartThreads(); } @@ -1425,7 +1423,4 @@ void CmiCreatePartitions(char **argv) { // Since we are not implementing converse level seed balancers yet void LBTopoInit() {} -int CmiDeliverMsgs(int maxmsgs) -{ - return CsdScheduler(maxmsgs); -} +int CmiDeliverMsgs(int maxmsgs) { return CsdScheduler(maxmsgs); } diff --git a/src/converse_internal.h b/src/converse_internal.h index 32ce4c6..49b0db5 100644 --- a/src/converse_internal.h +++ b/src/converse_internal.h @@ -3,8 +3,8 @@ #ifndef CONVCORE_H #define CONVCORE_H -#include #include "converse.h" +#include #include "converse.h" #include "converse_config.h" @@ -23,8 +23,8 @@ typedef struct GroupDef_s { #define GROUPTAB_SIZE 101 -//debug -#define DEBUGF(...) //CmiPrintf(__VA_ARGS__) +// debug +#define DEBUGF(...) // CmiPrintf(__VA_ARGS__) void CmiStartThreads(char **argv); void converseRunPe(int rank); @@ -41,8 +41,8 @@ void CmiGroupHandler(void *msg); void CmiReduceHandler(void *msg); typedef struct HandlerInfo { - union{ - CmiHandler hdlr; // handler function + union { + CmiHandler hdlr; // handler function CmiHandlerEx exhdlr; // handler function with user pointer }; void *userPtr; // does this point to the mesage data itself diff --git a/src/cpuaffinity.cpp b/src/cpuaffinity.cpp index d4245c9..783368e 100644 --- a/src/cpuaffinity.cpp +++ b/src/cpuaffinity.cpp @@ -18,9 +18,10 @@ #define _GNU_SOURCE #endif -static int affMsgsRecvd = 1; // number of affinity messages received at PE0 +static int affMsgsRecvd = 1; // number of affinity messages received at PE0 #if defined(CPU_OR) -static cpu_set_t core_usage; // used to record union of CPUs used by every PE in physical node +static cpu_set_t core_usage; // used to record union of CPUs used by every PE in + // physical node #endif static int aff_is_set = 0; @@ -28,9 +29,9 @@ static std::atomic cpuPhyAffCheckDone{}; struct affMsg { char core[CmiMsgHeaderSizeBytes]; - #if defined(CPU_OR) +#if defined(CPU_OR) cpu_set_t affinity; - #endif +#endif }; CmiHwlocTopology CmiHwlocTopologyLocal; @@ -41,92 +42,101 @@ static int cpuPhyNodeAffinityRecvHandlerIdx; // CmiNumCores static hwloc_topology_t topology, legacy_topology; -int CmiNumCores(void) -{ +int CmiNumCores(void) { // PU count is the intended output here rather than literal cores return CmiHwlocTopologyLocal.total_num_pus; } -static int search_pemap(char *pecoremap, int pe) -{ - int *map = (int *)malloc(CmiNumPesGlobal()*sizeof(int)); +static int search_pemap(char *pecoremap, int pe) { + int *map = (int *)malloc(CmiNumPesGlobal() * sizeof(int)); char *ptr = NULL; int h, i, j, k, count; int plusarr[128]; char *str; - char *mapstr = (char*)malloc(strlen(pecoremap)+1); + char *mapstr = (char *)malloc(strlen(pecoremap) + 1); strcpy(mapstr, pecoremap); str = strtok_r(mapstr, ",", &ptr); count = 0; - while (str && count < CmiNumPesGlobal()) - { - int hasdash=0, hascolon=0, hasdot=0, hasstar1=0, hasstar2=0, numplus=0; - int start, end, stride=1, block=1; - int iter=1; - plusarr[0] = 0; - for (i=0; i stride) { - printf("Warning: invalid block size in \"%s\" ignored.\n", str); - block=1; + } + if (hasstar1 || hasstar2) { + if (hasstar1) + sscanf(str, "%dx", &iter); + if (hasstar2) + sscanf(str, "%dX", &iter); + while (*str != 'x' && *str != 'X') + str++; + str++; + } + if (hasdash) { + if (hascolon) { + 
if (hasdot) { + if (sscanf(str, "%d-%d:%d.%d", &start, &end, &stride, &block) != 4) + printf("Warning: Check the format of \"%s\".\n", str); + } else { + if (sscanf(str, "%d-%d:%d", &start, &end, &stride) != 3) + printf("Warning: Check the format of \"%s\".\n", str); + } + } else { + if (sscanf(str, "%d-%d", &start, &end) != 2) + printf("Warning: Check the format of \"%s\".\n", str); } - //if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: %d. plus %d \n", iter, start, end, stride, block, numplus); - for (k = 0; kend) break; - for (h=0; h<=numplus; h++) { - map[count++] = i+j+plusarr[h]; - if (count == CmiNumPesGlobal()) break; - } - if (count == CmiNumPesGlobal()) break; + } else { + sscanf(str, "%d", &start); + end = start; + } + if (block > stride) { + printf("Warning: invalid block size in \"%s\" ignored.\n", str); + block = 1; + } + // if (CmiMyPe() == 0) printf("iter: %d start: %d end: %d stride: %d, block: + // %d. plus %d \n", iter, start, end, stride, block, numplus); + for (k = 0; k < iter; k++) { + for (i = start; i <= end; i += stride) { + for (j = 0; j < block; j++) { + if (i + j > end) + break; + for (h = 0; h <= numplus; h++) { + map[count++] = i + j + plusarr[h]; + if (count == CmiNumPesGlobal()) + break; } - if (count == CmiNumPesGlobal()) break; + if (count == CmiNumPesGlobal()) + break; } - if (count == CmiNumPesGlobal()) break; + if (count == CmiNumPesGlobal()) + break; } - str = strtok_r(NULL, ",", &ptr); + if (count == CmiNumPesGlobal()) + break; + } + str = strtok_r(NULL, ",", &ptr); } i = map[pe % count]; @@ -135,8 +145,7 @@ static int search_pemap(char *pecoremap, int pe) return i; } -static void cpuAffSyncWait(std::atomic & done) -{ +static void cpuAffSyncWait(std::atomic &done) { do CsdSchedulePoll(); while (!done.load()); @@ -144,8 +153,7 @@ static void cpuAffSyncWait(std::atomic & done) CsdSchedulePoll(); } -static void cpuPhyNodeAffinityRecvHandler(void *msg) -{ +static void cpuPhyNodeAffinityRecvHandler(void *msg) { static int count = 0; affMsg *m = (affMsg *)msg; @@ -162,7 +170,8 @@ static void cpuPhyNodeAffinityRecvHandler(void *msg) #if defined(CPU_OR) int get_thread_affinity(cpu_set_t *cpuset) { CPU_ZERO(cpuset); - if ((errno = pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset))) { + if ((errno = + pthread_getaffinity_np(pthread_self(), sizeof(cpu_set_t), cpuset))) { perror("pthread_getaffinity"); return -1; } @@ -171,9 +180,7 @@ int get_thread_affinity(cpu_set_t *cpuset) { #endif #if defined(CPU_OR) -int get_affinity(cpu_set_t *cpuset) { - return get_thread_affinity(cpuset); -} +int get_affinity(cpu_set_t *cpuset) { return get_thread_affinity(cpuset); } #endif void CmiInitHwlocTopology(void) { @@ -218,12 +225,11 @@ void CmiInitHwlocTopology(void) { : 1; } -static int set_process_affinity(hwloc_cpuset_t cpuset) -{ +static int set_process_affinity(hwloc_cpuset_t cpuset) { pid_t process = getpid(); - #define PRINTF_PROCESS "%d" - if (hwloc_set_proc_cpubind(topology, process, cpuset, HWLOC_CPUBIND_PROCESS|HWLOC_CPUBIND_STRICT)) - { +#define PRINTF_PROCESS "%d" + if (hwloc_set_proc_cpubind(topology, process, cpuset, + HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT)) { char *str; int error = errno; hwloc_bitmap_asprintf(&str, cpuset); @@ -232,15 +238,13 @@ static int set_process_affinity(hwloc_cpuset_t cpuset) return -1; } return 0; - #undef PRINTF_PROCESS +#undef PRINTF_PROCESS } - -static int set_thread_affinity(hwloc_cpuset_t cpuset) -{ +static int set_thread_affinity(hwloc_cpuset_t cpuset) { pthread_t thread = 
pthread_self(); - if (hwloc_set_thread_cpubind(topology, thread, cpuset, HWLOC_CPUBIND_THREAD|HWLOC_CPUBIND_STRICT)) - { + if (hwloc_set_thread_cpubind(topology, thread, cpuset, + HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT)) { char *str; int error = errno; hwloc_bitmap_asprintf(&str, cpuset); @@ -251,61 +255,63 @@ static int set_thread_affinity(hwloc_cpuset_t cpuset) return 0; } -static void bind_process_only(hwloc_obj_type_t process_unit) -{ +static void bind_process_only(hwloc_obj_type_t process_unit) { hwloc_cpuset_t cpuset; int process_unitcount = hwloc_get_nbobjs_by_type(topology, process_unit); int process_assignment = CmiMyRank() % process_unitcount; - hwloc_obj_t process_obj = hwloc_get_obj_by_type(topology, process_unit, process_assignment); + hwloc_obj_t process_obj = + hwloc_get_obj_by_type(topology, process_unit, process_assignment); set_process_affinity(process_obj->cpuset); } -static void bind_threads_only(hwloc_obj_type_t thread_unit) -{ +static void bind_threads_only(hwloc_obj_type_t thread_unit) { hwloc_cpuset_t cpuset; int thread_unitcount = hwloc_get_nbobjs_by_type(topology, thread_unit); int thread_assignment = CmiMyRank() % thread_unitcount; - hwloc_obj_t thread_obj = hwloc_get_obj_by_type(topology, thread_unit, thread_assignment); + hwloc_obj_t thread_obj = + hwloc_get_obj_by_type(topology, thread_unit, thread_assignment); hwloc_cpuset_t thread_cpuset = hwloc_bitmap_dup(thread_obj->cpuset); hwloc_bitmap_singlify(thread_cpuset); set_thread_affinity(thread_cpuset); hwloc_bitmap_free(thread_cpuset); } -static void bind_process_and_threads(hwloc_obj_type_t process_unit, hwloc_obj_type_t thread_unit) -{ +static void bind_process_and_threads(hwloc_obj_type_t process_unit, + hwloc_obj_type_t thread_unit) { hwloc_cpuset_t cpuset; int process_unitcount = hwloc_get_nbobjs_by_type(topology, process_unit); int process_assignment = CmiMyRank() % process_unitcount; - hwloc_obj_t process_obj = hwloc_get_obj_by_type(topology, process_unit, process_assignment); + hwloc_obj_t process_obj = + hwloc_get_obj_by_type(topology, process_unit, process_assignment); set_process_affinity(process_obj->cpuset); - int thread_unitcount = hwloc_get_nbobjs_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit); + int thread_unitcount = hwloc_get_nbobjs_inside_cpuset_by_type( + topology, process_obj->cpuset, thread_unit); int thread_assignment = CmiMyRank() % thread_unitcount; - hwloc_obj_t thread_obj = hwloc_get_obj_inside_cpuset_by_type(topology, process_obj->cpuset, thread_unit, thread_assignment); + hwloc_obj_t thread_obj = hwloc_get_obj_inside_cpuset_by_type( + topology, process_obj->cpuset, thread_unit, thread_assignment); hwloc_cpuset_t thread_cpuset = hwloc_bitmap_dup(thread_obj->cpuset); hwloc_bitmap_singlify(thread_cpuset); set_thread_affinity(thread_cpuset); hwloc_bitmap_free(thread_cpuset); } -static int set_default_affinity(void){ +static int set_default_affinity(void) { char *s; int n = -1; - if ((s = getenv("CmiProcessPerSocket"))) - { + if ((s = getenv("CmiProcessPerSocket"))) { n = atoi(s); if (getenv("CmiOneWthPerCore")) bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_CORE); @@ -313,34 +319,24 @@ static int set_default_affinity(void){ bind_process_and_threads(HWLOC_OBJ_PACKAGE, HWLOC_OBJ_PU); else bind_process_only(HWLOC_OBJ_PACKAGE); - } - else if ((s = getenv("CmiProcessPerCore"))) - { + } else if ((s = getenv("CmiProcessPerCore"))) { n = atoi(s); if (getenv("CmiOneWthPerPU")) bind_process_and_threads(HWLOC_OBJ_CORE, HWLOC_OBJ_PU); else 
bind_process_only(HWLOC_OBJ_CORE); - } - else if ((s = getenv("CmiProcessPerPU"))) - { + } else if ((s = getenv("CmiProcessPerPU"))) { n = atoi(s); bind_process_only(HWLOC_OBJ_PU); - } - else // if ((s = getenv("CmiProcessPerHost"))) + } else // if ((s = getenv("CmiProcessPerHost"))) { - if (getenv("CmiOneWthPerSocket")) - { + if (getenv("CmiOneWthPerSocket")) { n = 0; bind_threads_only(HWLOC_OBJ_PACKAGE); - } - else if (getenv("CmiOneWthPerCore")) - { + } else if (getenv("CmiOneWthPerCore")) { n = 0; bind_threads_only(HWLOC_OBJ_CORE); - } - else if (getenv("CmiOneWthPerPU")) - { + } else if (getenv("CmiOneWthPerPU")) { n = 0; bind_threads_only(HWLOC_OBJ_PU); } @@ -350,59 +346,65 @@ static int set_default_affinity(void){ } void CmiInitCPUAffinity(char **argv) { - #if defined(CPU_OR) - // check for flags - int affinity_flag = CmiGetArgFlagDesc(argv,"+setcpuaffinity", "set cpu affinity"); - char *pemap = NULL; - // 0 if OS-assigned, 1 if logical hwloc assigned - // for now, stick with os-assigned only - // also no commap, we have no commthreads - int pemap_logical_flag = 0; - CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping"); - if (pemap!=NULL) affinity_flag = 1; - CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx, CmiRegisterHandler((CmiHandler)cpuPhyNodeAffinityRecvHandler)); - // setting default affinity (always needed, not the same as setting cpu affinity) - int done = 0; - CmiNodeAllBarrier(); - /* must bind the rank 0 which is the main thread first */ - /* binding the main thread seems to change binding for all threads */ - if (CmiMyRank() == 0) { - done = set_default_affinity(); - } - CmiNodeAllBarrier(); - if (CmiMyRank() != 0) { - done = set_default_affinity(); - } - if (done) { - return; - } - //set cmi affinity - if (!affinity_flag) { - if (CmiMyPe() == 0) CmiPrintf("Charm++> cpu affinity NOT enabled.\n"); - return; - } - if (CmiMyPe() == 0) { - CmiPrintf("Charm++> cpu affinity enabled. \n"); - if (pemap!=NULL) - CmiPrintf("Charm++> cpuaffinity PE-core map (%s): %s\n", - pemap_logical_flag ? 
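As a quick reference for the default-binding policy in set_default_affinity() above: the environment variables choose the hwloc granularity of the process bind and, optionally, of the per-thread bind. The standalone sketch below is a paraphrase of that dispatch for documentation only, not the actual implementation; it merely prints which binding the chain above would select.

#include <cstdio>
#include <cstdlib>

// Hypothetical paraphrase of the env-var dispatch in set_default_affinity().
int main() {
  if (std::getenv("CmiProcessPerSocket"))
    std::puts(std::getenv("CmiOneWthPerCore") ? "process->socket, thread->core"
              : std::getenv("CmiOneWthPerPU") ? "process->socket, thread->PU"
                                              : "process->socket");
  else if (std::getenv("CmiProcessPerCore"))
    std::puts(std::getenv("CmiOneWthPerPU") ? "process->core, thread->PU"
                                            : "process->core");
  else if (std::getenv("CmiProcessPerPU"))
    std::puts("process->PU");
  else if (std::getenv("CmiOneWthPerSocket"))
    std::puts("thread->socket");
  else if (std::getenv("CmiOneWthPerCore"))
    std::puts("thread->core");
  else if (std::getenv("CmiOneWthPerPU"))
    std::puts("thread->PU");
  else
    std::puts("no default binding selected");
  return 0;
}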
"logical indices" : "OS indices", pemap); - } - // if a pemap is provided - if (pemap != NULL){ - int mycore = search_pemap(pemap, CmiMyPeGlobal()); - if (CmiSetCPUAffinity(mycore) == -1) CmiAbort("CmiSetCPUAffinity failed!"); - } - // if we are just using +setcpuaffinity - else { - CmiPrintf("Charm++> +setcpuaffinity implementation in progress\n"); - } - #endif - CmiNodeAllBarrier(); +#if defined(CPU_OR) + // check for flags + int affinity_flag = + CmiGetArgFlagDesc(argv, "+setcpuaffinity", "set cpu affinity"); + char *pemap = NULL; + // 0 if OS-assigned, 1 if logical hwloc assigned + // for now, stick with os-assigned only + // also no commap, we have no commthreads + int pemap_logical_flag = 0; + CmiGetArgStringDesc(argv, "+pemap", &pemap, "define pe to core mapping"); + if (pemap != NULL) + affinity_flag = 1; + CmiAssignOnce(&cpuPhyNodeAffinityRecvHandlerIdx, + CmiRegisterHandler((CmiHandler)cpuPhyNodeAffinityRecvHandler)); + // setting default affinity (always needed, not the same as setting cpu + // affinity) + int done = 0; + CmiNodeAllBarrier(); + /* must bind the rank 0 which is the main thread first */ + /* binding the main thread seems to change binding for all threads */ + if (CmiMyRank() == 0) { + done = set_default_affinity(); + } + CmiNodeAllBarrier(); + if (CmiMyRank() != 0) { + done = set_default_affinity(); + } + if (done) { + return; + } + // set cmi affinity + if (!affinity_flag) { + if (CmiMyPe() == 0) + CmiPrintf("Charm++> cpu affinity NOT enabled.\n"); + return; + } + if (CmiMyPe() == 0) { + CmiPrintf("Charm++> cpu affinity enabled. \n"); + if (pemap != NULL) + CmiPrintf("Charm++> cpuaffinity PE-core map (%s): %s\n", + pemap_logical_flag ? "logical indices" : "OS indices", pemap); + } + // if a pemap is provided + if (pemap != NULL) { + int mycore = search_pemap(pemap, CmiMyPeGlobal()); + if (CmiSetCPUAffinity(mycore) == -1) + CmiAbort("CmiSetCPUAffinity failed!"); + } + // if we are just using +setcpuaffinity + else { + CmiPrintf("Charm++> +setcpuaffinity implementation in progress\n"); + } +#endif + CmiNodeAllBarrier(); } // Uses PU indices assigned by the OS int CmiSetCPUAffinity(int mycore) { - #if defined(CPU_OR) +#if defined(CPU_OR) int core = mycore; if (core < 0) { printf("Error with core number"); @@ -431,34 +433,34 @@ int CmiSetCPUAffinity(int mycore) { CmiMyPe(), mycore); return result; - #else +#else return -1; - #endif +#endif } -void CmiCheckAffinity(void) -{ - #if defined(CPU_OR) - if (!CmiCpuTopologyEnabled()) return; // only works if cpu topology enabled +void CmiCheckAffinity(void) { +#if defined(CPU_OR) + if (!CmiCpuTopologyEnabled()) + return; // only works if cpu topology enabled if (CmiNumPes() == 1) return; - if (CmiMyPe() == 0) - { - // wait for every PE affinity from my physical node (for now only done on phy node 0) + if (CmiMyPe() == 0) { + // wait for every PE affinity from my physical node (for now only done on + // phy node 0) cpu_set_t my_aff; - if (get_affinity(&my_aff) == -1) CmiAbort("get_affinity failed\n"); + if (get_affinity(&my_aff) == -1) + CmiAbort("get_affinity failed\n"); CPU_OR(&core_usage, &core_usage, &my_aff); // add my affinity (pe0) cpuAffSyncWait(cpuPhyAffCheckDone); - } - else if (CmiPhysicalNodeID(CmiMyPe()) == 0) - { - // send my affinity to first PE on physical node (only done on phy node 0 for now) - affMsg *m = (affMsg*)CmiAlloc(sizeof(affMsg)); + } else if (CmiPhysicalNodeID(CmiMyPe()) == 0) { + // send my affinity to first PE on physical node (only done on phy node 0 + // for now) + affMsg *m = (affMsg 
*)CmiAlloc(sizeof(affMsg)); CmiSetHandler((char *)m, cpuPhyNodeAffinityRecvHandlerIdx); if (get_affinity(&m->affinity) == -1) { // put my affinity in msg CmiFree(m); @@ -471,8 +473,7 @@ void CmiCheckAffinity(void) CmiBarrier(); - if (CmiMyPe() == 0) - { + if (CmiMyPe() == 0) { // NOTE this test is simple and may not detect every possible case of // oversubscription const int N = CmiNumPesOnPhysicalNode(0); @@ -480,15 +481,17 @@ void CmiCheckAffinity(void) // TODO suggest command line arguments? if (!aff_is_set) { CmiAbort("Multiple PEs assigned to same core. Set affinity " - "options to correct or lower the number of threads, or pass +setcpuaffinity to ignore.\n"); + "options to correct or lower the number of threads, or pass " + "+setcpuaffinity to ignore.\n"); } else { - CmiPrintf("WARNING: Multiple PEs assigned to same core, recommend " - "adjusting processor affinity or passing +CmiSleepOnIdle to reduce " - "interference.\n"); + CmiPrintf( + "WARNING: Multiple PEs assigned to same core, recommend " + "adjusting processor affinity or passing +CmiSleepOnIdle to reduce " + "interference.\n"); } } } - #endif +#endif } #else // Dummy function if RECONVERSE_ENABLE_CPU_AFFINITY not set diff --git a/src/msgmgr.cpp b/src/msgmgr.cpp index 22e8cb4..849fd8a 100644 --- a/src/msgmgr.cpp +++ b/src/msgmgr.cpp @@ -1,101 +1,112 @@ -#include #include +#include -#define CmiAlloc malloc -#define CmiFree free +#define CmiAlloc malloc +#define CmiFree free typedef struct CmmEntryStruct *CmmEntry; -struct CmmEntryStruct -{ +struct CmmEntryStruct { CmmEntry next; - void *msg; - int ntags; - int tags[1]; + void *msg; + int ntags; + int tags[1]; }; -struct CmmTableStruct -{ - CmmEntry first; +struct CmmTableStruct { + CmmEntry first; CmmEntry *lasth; }; - -CmmTable CmmNew(void) -{ +CmmTable CmmNew(void) { CmmTable result = (CmmTable)CmiAlloc(sizeof(struct CmmTableStruct)); result->first = 0; result->lasth = &(result->first); return result; } -void CmmFree(CmmTable t) -{ - if (t==NULL) return; - if (t->first!=NULL) CmiAbort("Cannot free a non-empty message table!"); +void CmmFree(CmmTable t) { + if (t == NULL) + return; + if (t->first != NULL) + CmiAbort("Cannot free a non-empty message table!"); CmiFree(t); } /* free all table entries but not the space pointed by "msg" */ -void CmmFreeAll(CmmTable t){ - CmmEntry cur; - if(t==NULL) return; - cur = t->first; - while(cur){ - CmmEntry toDel = cur; - cur = cur->next; - CmiFree(toDel); - } +void CmmFreeAll(CmmTable t) { + CmmEntry cur; + if (t == NULL) + return; + cur = t->first; + while (cur) { + CmmEntry toDel = cur; + cur = cur->next; + CmiFree(toDel); + } } -void CmmPut(CmmTable t, int ntags, int *tags, void *msg) -{ +void CmmPut(CmmTable t, int ntags, int *tags, void *msg) { int i; - CmmEntry e=(CmmEntry)CmiAlloc(sizeof(struct CmmEntryStruct)+(ntags*sizeof(int))); + CmmEntry e = + (CmmEntry)CmiAlloc(sizeof(struct CmmEntryStruct) + (ntags * sizeof(int))); e->next = 0; e->msg = msg; e->ntags = ntags; - for (i=0; itags[i] = tags[i]; + for (i = 0; i < ntags; i++) + e->tags[i] = tags[i]; *(t->lasth) = e; t->lasth = &(e->next); } -static int CmmTagsMatch(int ntags1, int *tags1, int ntags2, int *tags2) -{ +static int CmmTagsMatch(int ntags1, int *tags1, int ntags2, int *tags2) { int ntags = ntags1; - if (ntags1 != ntags2) return 0; + if (ntags1 != ntags2) + return 0; while (1) { int tag1, tag2; - if (ntags == 0) return 1; + if (ntags == 0) + return 1; ntags--; tag1 = *tags1++; tag2 = *tags2++; - if (tag1==tag2) continue; - if (tag1==CmmWildCard) continue; - if 
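For readers unfamiliar with the message-manager API being reformatted in src/msgmgr.cpp: CmmPut files a message under a tag vector and CmmFind (just below) retrieves it, with CmmWildCard matching any single tag on either side. A small usage sketch, assuming the declarations come from converse.h as in classic Converse:

#include "converse.h"

// Usage sketch only; CmmNew/CmmPut/CmmFind/CmmFree and CmmWildCard are the
// APIs shown in this file.
void cmm_example(void *msg) {
  CmmTable t = CmmNew();
  int tags[2] = {/*src*/ 3, /*seq*/ 7};
  CmmPut(t, 2, tags, msg);

  int want[2] = {3, CmmWildCard}; // any sequence number from source 3
  int got[2];
  void *found = CmmFind(t, 2, want, got, /*del=*/1); // got[] becomes {3, 7}
  // found == msg, and the entry was removed because del != 0.
  (void)found;
  CmmFree(t); // legal: the table is empty again
}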
(tag2==CmmWildCard) continue;
+    if (tag1 == tag2)
+      continue;
+    if (tag1 == CmmWildCard)
+      continue;
+    if (tag2 == CmmWildCard)
+      continue;
     return 0;
   }
 }
 
-void *CmmFind(CmmTable t, int ntags, int *tags, int *rtags, int del)
-{
-  CmmEntry *enth; CmmEntry ent; void *msg; int i;
-  /* Added by Chao Mei to handle the case where t is already freed, which happens in ~ampi() when doing out-of-core emulation for AMPI programs. */
-  if(t==NULL) return NULL;
+void *CmmFind(CmmTable t, int ntags, int *tags, int *rtags, int del) {
+  CmmEntry *enth;
+  CmmEntry ent;
+  void *msg;
+  int i;
+  /* Added by Chao Mei to handle the case where t is already freed, which
+   * happens in ~ampi() when doing out-of-core emulation for AMPI programs. */
+  if (t == NULL)
+    return NULL;
   enth = &(t->first);
   while (1) {
     ent = (*enth);
-    if (ent==0) return 0;
+    if (ent == 0)
+      return 0;
     if (CmmTagsMatch(ntags, tags, ent->ntags, ent->tags)) {
-      if (rtags) for (i=0; i<ntags; i++) rtags[i] = ent->tags[i];
+      if (rtags)
+        for (i = 0; i < ntags; i++)
+          rtags[i] = ent->tags[i];
       msg = ent->msg;
       if (del) {
-        CmmEntry next = ent->next;
-        (*enth) = next;
-        if (next == 0) t->lasth = enth;
-        CmiFree(ent);
+        CmmEntry next = ent->next;
+        (*enth) = next;
+        if (next == 0)
+          t->lasth = enth;
+        CmiFree(ent);
       }
       return msg;
     }
@@ -104,23 +115,23 @@ void *CmmFind(CmmTable t, int ntags, int *tags, int *rtags, int del)
 }
 
 /* match the first ntags tags and return the last tag */
-int CmmGetLastTag(CmmTable t, int ntags, int* tags)
-{
-  CmmEntry *enth; CmmEntry ent;
+int CmmGetLastTag(CmmTable t, int ntags, int *tags) {
+  CmmEntry *enth;
+  CmmEntry ent;
   enth = &(t->first);
   while (1) {
     ent = (*enth);
-    if (ent==0) return -1;
+    if (ent == 0)
+      return -1;
     if (CmmTagsMatch(ntags, tags, ntags, ent->tags)) {
-      return (ent->tags[ent->ntags-1]);
+      return (ent->tags[ent->ntags - 1]);
     }
     enth = &(ent->next);
   }
   return -1;
 }
 
-int CmmEntries(CmmTable t)
-{
+int CmmEntries(CmmTable t) {
   int n = 0;
   CmmEntry e = t->first;
   while (e) {
diff --git a/src/queue.h b/src/queue.h
index 2c185aa..f037ab9 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -1,144 +1,109 @@
 #ifndef QUEUE_H
 #define QUEUE_H
-#include
+#include "concurrentqueue.h"
 #include
-#include
 #include
-#include "concurrentqueue.h"
+#include
+#include
 
-template <typename MessageType>
-using QueueResult = std::optional<MessageType>;
+template <typename MessageType> using QueueResult = std::optional<MessageType>;
 
-template <typename ConcreteQ, typename MessageType>
-class MutexAccessControl {
-    ConcreteQ q;
-    std::mutex mtx;
+template <typename ConcreteQ, typename MessageType> class MutexAccessControl {
+  ConcreteQ q;
+  std::mutex mtx;
 
 public:
-    void push(MessageType message) {
-        std::lock_guard lock(mtx);
-        q.push(message);
-    }
-
-    QueueResult<MessageType> pop_result() {
-        std::lock_guard lock(mtx);
-        if (q.empty()) {
-            return std::nullopt;
-        } else {
-            MessageType val = q.front();
-            q.pop();
-            return QueueResult<MessageType>(val);
-        }
-    }
-
-
-    size_t size() {
-        std::lock_guard lock(mtx);
-        return q.size();
-    }
-
-    bool empty() {
-        std::lock_guard lock(mtx);
-        return q.empty();
-    }
+  void push(MessageType message) {
+    std::lock_guard lock(mtx);
+    q.push(message);
+  }
+
+  QueueResult<MessageType> pop_result() {
+    std::lock_guard lock(mtx);
+    if (q.empty()) {
+      return std::nullopt;
+    } else {
+      MessageType val = q.front();
+      q.pop();
+      return QueueResult<MessageType>(val);
+    }
+  }
+
+  size_t size() {
+    std::lock_guard lock(mtx);
+    return q.size();
+  }
+
+  bool empty() {
+    std::lock_guard lock(mtx);
+    return q.empty();
+  }
 };
 
-template <typename MessageType>
-class AtomicAccessControl {
-    // what default size?
-    moodycamel::ConcurrentQueue<MessageType> q{256};
+template <typename MessageType> class AtomicAccessControl {
+  // what default size?
+  moodycamel::ConcurrentQueue<MessageType> q{256};
 
-    public:
-    void push(MessageType message) {
-        q.enqueue(message);
-    }
+public:
+  void push(MessageType message) { q.enqueue(message); }
 
-    QueueResult<MessageType> pop_result() {
-        MessageType message;
-        bool success = q.try_dequeue(message);
-        return success ? QueueResult<MessageType>(message) : std::nullopt;
-    }
+  QueueResult<MessageType> pop_result() {
+    MessageType message;
+    bool success = q.try_dequeue(message);
+    return success ? QueueResult<MessageType>(message) : std::nullopt;
+  }
 
-    size_t size() {
-        return q.size_approx();
-    }
+  size_t size() { return q.size_approx(); }
 
-    bool empty() {
-        return q.size_approx() == 0;
-    }
+  bool empty() { return q.size_approx() == 0; }
 };
 
 // An MPSC queue that can be used to send messages between threads.
-template <typename MessageType, typename AccessControlPolicy>
-class MPSCQueue
-{
-    AccessControlPolicy policy;
+template <typename MessageType, typename AccessControlPolicy> class MPSCQueue {
+  AccessControlPolicy policy;
 
 public:
-    QueueResult<MessageType> pop()
-    {
-        return policy.pop_result();
-    }
+  QueueResult<MessageType> pop() { return policy.pop_result(); }
 
-    void push(MessageType message)
-    {
-        policy.push(message);
-    }
+  void push(MessageType message) { policy.push(message); }
 
-    bool empty()
-    {
-        return policy.empty();
-    }
+  bool empty() { return policy.empty(); }
 
-    size_t size()
-    {
-        return policy.size();
-    }
+  size_t size() { return policy.size(); }
 };
 
-template <typename MessageType, typename AccessControlPolicy>
-class MPMCQueue
-{
-    AccessControlPolicy policy;
+template <typename MessageType, typename AccessControlPolicy> class MPMCQueue {
+  AccessControlPolicy policy;
 
 public:
-    QueueResult<MessageType> pop()
-    {
-        return policy.pop_result();
-    }
+  QueueResult<MessageType> pop() { return policy.pop_result(); }
 
-    void push(MessageType message)
-    {
-        policy.push(message);
-    }
+  void push(MessageType message) { policy.push(message); }
 
-    bool empty()
-    {
-        return policy.empty();
-    }
+  bool empty() { return policy.empty(); }
 
-    size_t size()
-    {
-        return policy.size();
-    }
+  size_t size() { return policy.size(); }
 };
-
 #ifdef ATOMIC_QUEUE_ENABLED
 template <typename MessageType>
 using ConverseQueue = MPSCQueue<MessageType, AtomicAccessControl<MessageType>>;
 
 template <typename MessageType>
-using ConverseNodeQueue = MPMCQueue<MessageType, AtomicAccessControl<MessageType>>;
+using ConverseNodeQueue =
+    MPMCQueue<MessageType, AtomicAccessControl<MessageType>>;
 #else
 template <typename MessageType>
-using ConverseQueue = MPSCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
+using ConverseQueue =
+    MPSCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
 
 template <typename MessageType>
-using ConverseNodeQueue = MPMCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
+using ConverseNodeQueue =
+    MPMCQueue<MessageType, MutexAccessControl<std::queue<MessageType>, MessageType>>;
 #endif
-
 #endif
\ No newline at end of file
diff --git a/src/scheduler.cpp b/src/scheduler.cpp
index 8368424..332425b 100644
--- a/src/scheduler.cpp
+++ b/src/scheduler.cpp
@@ -85,7 +85,7 @@ void CsdSchedulePoll() {
   // get node level queue
   ConverseNodeQueue<void *> *nodeQueue = CmiGetNodeQueue();
 
-  while(1){
+  while (1) {
 
     CcdCallBacks();
 
@@ -124,27 +124,26 @@ void CsdSchedulePoll() {
     else {
       comm_backend::progress();
-      break; //break when queues are empty
+      break; // break when queues are empty
     }
-
   }
-
 }
 
-int CsdScheduler(int maxmsgs){
+int CsdScheduler(int maxmsgs) {
   if (maxmsgs < 0) {
-    CsdScheduler(); //equivalent to CsdScheduleForever in old converse
-  }
-  else CsdSchedulePoll(); //not implementing CsdScheduleCount
+    CsdScheduler(); // equivalent to CsdScheduleForever in old converse
+  } else
+    CsdSchedulePoll(); // not implementing CsdScheduleCount
   return 0;
-
 }
 
-void CsdEnqueueGeneral(void *Message, int strategy, int priobits, int *prioptr){
+void CsdEnqueueGeneral(void *Message, int strategy, int priobits,
+                       int *prioptr) {
   CmiPushPE(CmiMyPe(), sizeof(Message), Message);
 }
 
-void CsdNodeEnqueueGeneral(void *Message, int strategy, int priobits, unsigned int *prioptr){
+void CsdNodeEnqueueGeneral(void *Message, int strategy, int priobits,
+                           unsigned int *prioptr) {
   CmiGetNodeQueue()->push(Message);
 }
diff --git 
a/src/threads.cpp b/src/threads.cpp index 44883e9..bbc5bb9 100644 --- a/src/threads.cpp +++ b/src/threads.cpp @@ -358,17 +358,17 @@ void CthAwaken(CthThread th) { awakenfn(token, strategy, 0, 0); // If this crashes, disable ASLR. } -void CthAwakenPrio(CthThread th, int s, int pb, unsigned int *prio) -{ +void CthAwakenPrio(CthThread th, int s, int pb, unsigned int *prio) { CthAwkFn awakenfn = B(th)->awakenfn; - if (awakenfn == 0) CthNoStrategy(); + if (awakenfn == 0) + CthNoStrategy(); #if CMK_TRACE_ENABLED -#if ! CMK_TRACE_IN_CHARM - if(CpvAccess(traceOn)) +#if !CMK_TRACE_IN_CHARM + if (CpvAccess(traceOn)) traceAwaken(th); #endif #endif - CthThreadToken * token = B(th)->token; + CthThreadToken *token = B(th)->token; awakenfn(token, s, pb, prio); // If this crashes, disable ASLR. B(th)->scheduled++; } @@ -497,15 +497,14 @@ void CthRegistered(size_t maxOffset) { /* possible hack? CW */ char *CthGetData(CthThread t) { return B(t)->data; } -void CthSetEventInfo(CthThread t, int event, int srcPE) -{ +void CthSetEventInfo(CthThread t, int event, int srcPE) { B(t)->eventID = event; B(t)->srcPE = srcPE; } -void CthFree(CthThread t) -{ - if (t==NULL) return; +void CthFree(CthThread t) { + if (t == NULL) + return; if (t != CthSelf()) { CthThreadFree(t); diff --git a/tests/ping_ack/ping.cpp b/tests/ping_ack/ping.cpp index 48b40f7..308a0a6 100644 --- a/tests/ping_ack/ping.cpp +++ b/tests/ping_ack/ping.cpp @@ -1,15 +1,16 @@ -#include +#include "converse.h" #include +#include #include -#include "converse.h" CpvDeclare(int, bigmsg_index); CpvDeclare(int, ackmsg_index); CpvDeclare(int, shortmsg_index); CpvDeclare(int, msg_size); -CpvDeclare(int, trial); // increments per trial, gets set to 0 at the start of a new msg size -CpvDeclare(int, round); // increments per msg size -CpvDeclare(int, warmup_flag); // 1 when in warmup round, 0 when not +CpvDeclare(int, trial); // increments per trial, gets set to 0 at the start of a + // new msg size +CpvDeclare(int, round); // increments per msg size +CpvDeclare(int, warmup_flag); // 1 when in warmup round, 0 when not CpvDeclare(int, recv_count); CpvDeclare(int, ack_count); CpvDeclare(double, total_time); @@ -17,21 +18,20 @@ CpvDeclare(double, process_time); CpvDeclare(double, send_time); int msg_count; -#define nMSG_SIZE 3 // if the msg_sizes are hard_coded, this should be the same as the length of the hard coded array +#define nMSG_SIZE \ + 3 // if the msg_sizes are hard_coded, this should be the same as the length of + // the hard coded array #define nTRIALS_PER_SIZE 10 -#define CALCULATION_PRECISION 0.0001 // the decimal place that the output data is rounded to +#define CALCULATION_PRECISION \ + 0.0001 // the decimal place that the output data is rounded to -double total_time[nTRIALS_PER_SIZE]; // times are stored in us +double total_time[nTRIALS_PER_SIZE]; // times are stored in us double process_time[nTRIALS_PER_SIZE]; double send_time[nTRIALS_PER_SIZE]; - int msg_sizes[nMSG_SIZE] = {56, 4096, 65536}; // hard coded msg_size values - - -typedef struct myMsg -{ +typedef struct myMsg { char header[CmiMsgHeaderSizeBytes]; int payload[1]; } *message; @@ -44,9 +44,9 @@ double round_to(double val, double precision) { double get_average(double arr[]) { double tot = 0; - for (int i = 0; i < nTRIALS_PER_SIZE; ++i) tot += arr[i]; + for (int i = 0; i < nTRIALS_PER_SIZE; ++i) + tot += arr[i]; return (round_to(tot, CALCULATION_PRECISION) / nTRIALS_PER_SIZE); - } double get_stdev(double arr[]) { @@ -61,29 +61,36 @@ double get_stdev(double arr[]) { double get_max(double 
arr[]) { double max = arr[0]; for (int i = 1; i < nTRIALS_PER_SIZE; ++i) - if (arr[i] > arr[0]) max = arr[i]; - return max; + if (arr[i] > arr[0]) + max = arr[i]; + return max; } - void print_results() { if (!CpvAccess(warmup_flag)) { CmiPrintf("msg_size=%d\n", CpvAccess(msg_size)); - //for (int i = 0; i < nTRIALS_PER_SIZE; ++i) { - //DEBUG: print without trial number: - //CmiPrintf("Send time: %f, process time: %f, total time: %f\n", send_time[i], process_time[i], total_time[i]); - - //DEBUG: print with trial number: - //CmiPrintf("%d %f\n %f\n %f\n", i, send_time[i], process_time[i], total_time[i]); + // for (int i = 0; i < nTRIALS_PER_SIZE; ++i) { + // DEBUG: print without trial number: + // CmiPrintf("Send time: %f, process time: %f, total time: %f\n", + // send_time[i], process_time[i], total_time[i]); + + // DEBUG: print with trial number: + // CmiPrintf("%d %f\n %f\n %f\n", i, send_time[i], process_time[i], + // total_time[i]); //} // print data: - CmiPrintf("Format: {#PEs},{msg_size},{average send/process/total time (us)},{stdevs*3},{maxs*3}\n"); - CmiPrintf("DATA,%d,%d,%f,%f,%f,%f,%f,%f,%f,%f,%f\n", CmiNumPes(), CpvAccess(msg_size), get_average(send_time), get_average(process_time), get_average(total_time), - get_stdev(send_time), get_stdev(process_time), get_stdev(total_time), get_max(send_time), get_max(process_time), get_max(total_time)); - - + CmiPrintf("Format: {#PEs},{msg_size},{average send/process/total time " + "(us)},{stdevs*3},{maxs*3}\n"); + CmiPrintf("DATA,%d,%d,%f,%f,%f,%f,%f,%f,%f,%f,%f\n", CmiNumPes(), + CpvAccess(msg_size), get_average(send_time), + get_average(process_time), get_average(total_time), + get_stdev(send_time), get_stdev(process_time), + get_stdev(total_time), get_max(send_time), get_max(process_time), + get_max(total_time)); + } else { - if (CpvAccess(round) == nMSG_SIZE - 1) // if this is the end of the warmup round + if (CpvAccess(round) == + nMSG_SIZE - 1) // if this is the end of the warmup round CmiPrintf("Warm up Done!\n"); // DEBUG: Print what msg_size the warmup round is on @@ -95,25 +102,29 @@ void print_results() { void send_msg() { double start_time, crt_time; struct myMsg *msg; - // CmiPrintf("\nSending msg fron pe%d to pe%d\n",CmiMyPe(), CmiNumPes()/2+CmiMyPe()); + // CmiPrintf("\nSending msg fron pe%d to pe%d\n",CmiMyPe(), + // CmiNumPes()/2+CmiMyPe()); CpvAccess(process_time) = 0.0; CpvAccess(send_time) = 0.0; CpvAccess(total_time) = CmiWallTimer(); - for(int k = 0; k < msg_count; k++) { + for (int k = 0; k < msg_count; k++) { crt_time = CmiWallTimer(); msg = (message)CmiAlloc(CpvAccess(msg_size)); // Fills payload with ints - for (int i = 0; i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); ++i) msg->payload[i] = i; - + for (int i = 0; + i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); ++i) + msg->payload[i] = i; + // DEBUG: Print ints stored in payload - // for (int i = 0; i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); ++i) CmiPrintf("%d ", msg->payload[i]); - // CmiPrintf("\n"); + // for (int i = 0; i < (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / + // sizeof(int); ++i) CmiPrintf("%d ", msg->payload[i]); CmiPrintf("\n"); CmiSetHandler(msg, CpvAccess(bigmsg_index)); - CpvAccess(process_time) = CmiWallTimer() - crt_time + CpvAccess(process_time); + CpvAccess(process_time) = + CmiWallTimer() - crt_time + CpvAccess(process_time); start_time = CmiWallTimer(); - //Send from my pe-i on node-0 to q+i on node-1 + // Send from my pe-i on node-0 to q+i on node-1 CmiSyncSendAndFree(CmiNumPes() 
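One note beyond formatting: get_max above compares arr[i] against arr[0] rather than against the running maximum, so it only replaces max when an element exceeds the first entry. If a true maximum is intended, a follow-up outside the scope of this clang-format patch could look like the sketch below.

// Possible follow-up fix (not part of this patch): track the running maximum.
double get_max_fixed(double arr[]) {
  double max = arr[0];
  for (int i = 1; i < nTRIALS_PER_SIZE; ++i)
    if (arr[i] > max)
      max = arr[i];
  return max;
}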
/ 2 + CmiMyPe(), CpvAccess(msg_size), msg); CpvAccess(send_time) = CmiWallTimer() - start_time + CpvAccess(send_time); } @@ -122,18 +133,20 @@ void send_msg() { void shortmsg_handler(void *vmsg) { message smsg = (message)vmsg; CmiFree(smsg); - if (!CpvAccess(warmup_flag)) { // normal round handling - if (CpvAccess(trial) == nTRIALS_PER_SIZE) { // if we have run the current msg size for nTRIALS + if (!CpvAccess(warmup_flag)) { // normal round handling + if (CpvAccess(trial) == + nTRIALS_PER_SIZE) { // if we have run the current msg size for nTRIALS CpvAccess(round) = CpvAccess(round) + 1; CpvAccess(trial) = 0; CpvAccess(msg_size) = msg_sizes[CpvAccess(round)]; - } - } else { // warmup round handling - if (CpvAccess(round) == nMSG_SIZE - 1) { // if this is the end of the warmup round + } + } else { // warmup round handling + if (CpvAccess(round) == + nMSG_SIZE - 1) { // if this is the end of the warmup round CpvAccess(round) = 0; CpvAccess(msg_size) = msg_sizes[0]; CpvAccess(warmup_flag) = 0; - } else { // otherwise warm up the next msg size + } else { // otherwise warm up the next msg size CpvAccess(round) = CpvAccess(round) + 1; CpvAccess(msg_size) = msg_sizes[CpvAccess(round)]; } @@ -143,16 +156,14 @@ void shortmsg_handler(void *vmsg) { } void do_work(long start, long end, void *result) { - long tmp=0; - for (long i=start; i<=end; i++) { - tmp+=(long)(sqrt(1+cos(i*1.57))); + long tmp = 0; + for (long i = start; i <= end; i++) { + tmp += (long)(sqrt(1 + cos(i * 1.57))); } *(long *)result = tmp + *(long *)result; } - -void bigmsg_handler(void *vmsg) -{ +void bigmsg_handler(void *vmsg) { int i, next; message msg = (message)vmsg; // if this is a receiving PE @@ -160,25 +171,28 @@ void bigmsg_handler(void *vmsg) CpvAccess(recv_count) = 1 + CpvAccess(recv_count); long sum = 0; long result = 0; - double num_ints = (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); + double num_ints = + (CpvAccess(msg_size) - CmiMsgHeaderSizeBytes) / sizeof(int); double exp_avg = (num_ints - 1) / 2; for (i = 0; i < num_ints; ++i) { sum += msg->payload[i]; - do_work(i,sum,&result); + do_work(i, sum, &result); } - if(result < 0) { + if (result < 0) { CmiPrintf("Error! 
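The self-check in bigmsg_handler here relies on send_msg() filling the payload with 0, 1, ..., num_ints-1, so the expected mean is (num_ints - 1) / 2; for example, with a 4096-byte message, a hypothetical 16-byte header, and 4-byte ints, num_ints is 1020 and the expected average is 509.5. A compact integer-only restatement of the same check, with the header size passed in because the real value is CmiMsgHeaderSizeBytes:

#include <cassert>

// Sketch: the sum of 0..n-1 is n*(n-1)/2, so checking the mean against
// (n-1)/2 is the same as checking 2*sum == n*(n-1).
void check_payload_mean(const int *payload, int msg_size, int header_bytes) {
  int n = (msg_size - header_bytes) / (int)sizeof(int);
  long sum = 0;
  for (int i = 0; i < n; ++i)
    sum += payload[i];
  assert(sum * 2 == (long)n * (n - 1));
}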
in computation"); } double calced_avg = sum / num_ints; if (calced_avg != exp_avg) { - CmiPrintf("Calculated average of %f does not match expected value of %f, exiting\n", calced_avg, exp_avg); + CmiPrintf("Calculated average of %f does not match expected value of %f, " + "exiting\n", + calced_avg, exp_avg); CmiExit(1); - } + } // else // CmiPrintf("Calculation OK\n"); // DEBUG: Computation Check - if(CpvAccess(recv_count) == msg_count) { + if (CpvAccess(recv_count) == msg_count) { CpvAccess(recv_count) = 0; - + CmiFree(msg); msg = (message)CmiAlloc(CpvAccess(msg_size)); CmiSetHandler(msg, CpvAccess(ackmsg_index)); @@ -189,44 +203,50 @@ void bigmsg_handler(void *vmsg) CmiPrintf("\nError: Only node-1 can be receiving node!!!!\n"); } -void pe0_ack_handler(void *vmsg) -{ +void pe0_ack_handler(void *vmsg) { int pe; message msg = (message)vmsg; - //Pe-0 receives all acks + // Pe-0 receives all acks CpvAccess(ack_count) = 1 + CpvAccess(ack_count); // DEBUG: Computation Print Check - // CmiPrintf("All %d messages of size %d on trial %d OK\n", MSG_COUNT, CpvAccess(msg_size), CpvAccess(trial)); - + // CmiPrintf("All %d messages of size %d on trial %d OK\n", MSG_COUNT, + // CpvAccess(msg_size), CpvAccess(trial)); - if(CpvAccess(ack_count) == CmiNumPes()/2) { + if (CpvAccess(ack_count) == CmiNumPes() / 2) { CpvAccess(ack_count) = 0; CpvAccess(total_time) = CmiWallTimer() - CpvAccess(total_time); // DEBUG: Original Print Statement - //CmiPrintf("Received [Trial=%d, msg size=%d] ack on PE-#%d send time=%lf, process time=%lf, total time=%lf\n", - // CpvAccess(trial), CpvAccess(msg_size), CmiMyPe(), CpvAccess(send_time), CpvAccess(process_time), CpvAccess(total_time)); + // CmiPrintf("Received [Trial=%d, msg size=%d] ack on PE-#%d send time=%lf, + // process time=%lf, total time=%lf\n", + // CpvAccess(trial), CpvAccess(msg_size), CmiMyPe(), + // CpvAccess(send_time), CpvAccess(process_time), + // CpvAccess(total_time)); CmiFree(msg); // store times in arrays - send_time[CpvAccess(trial)] = CpvAccess(send_time) * 1000000.0; // convert to microsecs. + send_time[CpvAccess(trial)] = + CpvAccess(send_time) * 1000000.0; // convert to microsecs. process_time[CpvAccess(trial)] = CpvAccess(process_time) * 1000000.0; total_time[CpvAccess(trial)] = CpvAccess(total_time) * 1000000.0; CpvAccess(trial) = CpvAccess(trial) + 1; // print results - if (CpvAccess(warmup_flag) || CpvAccess(trial) == nTRIALS_PER_SIZE) print_results(); + if (CpvAccess(warmup_flag) || CpvAccess(trial) == nTRIALS_PER_SIZE) + print_results(); - // if this is not the warmup round, and we have finished the final trial, and we are on the final msg size, exit - if(!CpvAccess(warmup_flag) && CpvAccess(trial) == nTRIALS_PER_SIZE && CpvAccess(round) == nMSG_SIZE - 1) + // if this is not the warmup round, and we have finished the final trial, + // and we are on the final msg size, exit + if (!CpvAccess(warmup_flag) && CpvAccess(trial) == nTRIALS_PER_SIZE && + CpvAccess(round) == nMSG_SIZE - 1) CmiExit(0); else { // CmiPrintf("\nSending short msgs from PE-%d", CmiMyPe()); - for(pe = 0 ; pe
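For completeness, the Converse messaging idiom this test exercises end to end, reduced to its smallest form; the handler name, index variable, and helper below are illustrative only, while the calls themselves (CmiAlloc, CmiSetHandler, CmiSyncSendAndFree, CmiFree, CmiRegisterHandler) are the ones used throughout this file.

#include "converse.h"

// Handler index, assumed to be filled in during startup, e.g. via
// CmiAssignOnce(&echo_handler_idx, CmiRegisterHandler((CmiHandler)echo_handler));
static int echo_handler_idx;

static void echo_handler(void *vmsg) {
  CmiFree(vmsg); // the receiving handler owns and releases the buffer
}

static void send_one(int dest_pe, int bytes) {
  char *msg = (char *)CmiAlloc(bytes);     // buffer includes the Converse header
  CmiSetHandler(msg, echo_handler_idx);    // route to echo_handler on arrival
  CmiSyncSendAndFree(dest_pe, bytes, msg); // ownership passes to the runtime
}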