Merge branch 'release/1.1' into user/barry/fix_sm100_r1_ci

Barry-Delaney · web-flow · commit 6744876dd1f4 · 2025-10-29T08:56:54.000+08:00
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h
@@ -128,7 +128,7 @@ class BlockRange
     BaseKVCacheManager const* mManager;
     runtime::ITensor::SharedPtr mPool;
     SizeType32 mWindowSize;
-    const LlmRequest::RequestIdType mRequestId;
+    LlmRequest::RequestIdType const mRequestId;
     std::vector<SizeType32> mBlockIds;
 
     static constexpr SizeType32 kFIRST_AND_ONLY_BEAM = 0;
@@ -203,7 +203,18 @@ class BlockIterator
     {
         if (mIdx < mRange->mBlockIds.size())
         {
-            mCurrent = runtime::ITensor::slice(mRange->mPool, mRange->mBlockIds.at(mIdx), 1);
+            if (mRange->mManager != nullptr)
+            {
+                BlockPtr const& block
+                    = mRange->mManager->getBlockManager().getBlockById(mRange->mBlockIds.at(mIdx), mRange->mWindowSize);
+                TLLM_CHECK_WITH_INFO(block->isPrimary(), "cache transceiver only supports primary blocks");
+                auto const blockOffset = block->getMemoryPoolBlockIndex();
+                mCurrent = runtime::ITensor::slice(mRange->mPool, blockOffset, 1);
+            }
+            else
+            {
+                mCurrent = runtime::ITensor::slice(mRange->mPool, mRange->mBlockIds.at(mIdx), 1);
+            }
         }
     }
 
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -1952,7 +1952,7 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
             return 0;
         }
 
-        auto const numCurrTokens = mSequences.at(req.mRequestId).getNumTokens();
+        auto const numCurrTokens = getSequence(req.mRequestId).getNumTokens();
         auto const generatedTokens = numCurrTokens - req.getPromptLen();
         auto const maxTokensToAddToKVCache = req.mMaxNewTokens - generatedTokens;
         auto const tokensPerStep = req.getNumDraftTokens() + 1;
@@ -2198,7 +2198,13 @@ void KVCacheManager::addSequence(
 void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)
 {
     auto const requestId = llmRequest.mRequestId;
-    if (mSequences.find(requestId) != mSequences.end())
+    bool found = false;
+    {
+        // Take mSequencesMtx for the duration of the mSequences lookup.
+        std::scoped_lock lock(mSequencesMtx);
+        found = mSequences.find(requestId) != mSequences.end();
+    }
+    if (found)
     {
         auto& sequence = getSequence(requestId);
         if (mEnableBlockReuse && !llmRequest.isDummyRequest())