
Commit 073aef2

Merge branch 'main' into 2-model-kernels
2 parents: ce7462c + 7a552c4

1,685 files changed (+15,352 / -8,425 lines)


.github/CODEOWNERS

Lines changed: 3 additions & 0 deletions
```diff
@@ -186,6 +186,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 ## These scripts install and pin dependency versions
 /docker/common/** @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs @NVIDIA/trt-llm-oss-compliance
 
+### TAVA Architecture Diagram
+/.github/tava_architecture_diagram.md @NVIDIA/trt-llm-TAVA-design-change
+
 ### CODEOWNERS file itself
 /.github/CODEOWNERS @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs @NVIDIA/trt-llm-oss-compliance
```

.github/pull_request_template.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -49,6 +49,7 @@ Please review the following before submitting your PR:
 - Any new dependencies have been scanned for license and vulnerabilities
 - [CODEOWNERS](https://github.com/NVIDIA/TensorRT-LLM/blob/main/.github/CODEOWNERS) updated if ownership changes
 - Documentation updated as needed
+- Update [tava architecture diagram](https://github.com/NVIDIA/TensorRT-LLM/blob/main/.github/tava_architecture_diagram.md) if there is a significant design change in PR.
 - The reviewers assigned automatically/manually are appropriate for the PR.
 
```

.github/tava_architecture_diagram.md

Lines changed: 108 additions & 0 deletions

This is a new file: the TAVA architecture diagram referenced by the CODEOWNERS and pull request template changes above.
```mermaid
graph TB
    subgraph "User API & CLI Tools"
        CLI[CLI Tools]
        LLMAPI[LLM API]
        CLI --> LLMAPI
    end

    subgraph "Model Checkpoint"
        Checkpoint[Huggingface Models]
        Checkpoint --> CLI
        Checkpoint --> LLMAPI
    end

    subgraph "TensorRT_Flow"
        trtllmExecutor[trtllm.Executor]
        Engine[TensorRT Engine]
        TRTGraph[TensorRT Graph]
        Plugins[TensorRT Plugins]
        cudaKernel[CUDA Kernel]
        Executor[Executor]
        LLMAPI --> trtllmExecutor
        trtllmExecutor --> |build|Engine
        trtllmExecutor --> |compile|TRTGraph
        trtllmExecutor --> |compile|Plugins
        Engine --> Executor
        Plugins --> Executor
        TRTGraph --> Executor
        Plugins --> cudaKernel
    end

    subgraph "PyTorch_Flow"
        PyExecutor[PyExecutor]
        PyEngine[PyTorch Engine]
        CustomOps[Custom Ops]
        PyTorchOps[Pytorch Ops]
        KernelLibs[Kernel Libs]
        PyScheduler[Scheduler]
        PyDecoder[Decoder]
        CUDAKernel[CUDA Kernel]
        LLMAPI --> PyExecutor
        PyExecutor --> PyEngine
        PyEngine --> CustomOps
        PyEngine --> PyTorchOps
        PyEngine --> KernelLibs
        PyEngine --> PyScheduler
        PyEngine --> PyDecoder
        KernelLibs --> CUDAKernel
        CustomOps --> CUDAKernel
    end

    subgraph "Shared_Component"
        Shared_Decoder[Decoder]
        Shared_Scheduler[Scheduler]
        Sampling[Sampling]
        BatchManager[Batch Manager]
        KVCache[KV Cache Manager]
        PyScheduler --> |Pybind|Shared_Scheduler
        PyDecoder --> |Pybind|Shared_Decoder
        Executor --> Shared_Decoder
        Shared_Decoder --> Sampling
        Executor --> Shared_Scheduler
        Shared_Scheduler --> |In-flight Batching| BatchManager
        BatchManager --> KVCache
    end

    subgraph "Output_Results"
        Tokens[Generated Tokens]
        Stats[Performance Stats]
        Metrics[Accuracy Metrics]
    end

    %% PyTorch_Flow ~~~ TensorRT_Flow

    TensorRT_Flow --> Output_Results
    PyTorch_Flow --> Output_Results

    %% Force Output_Results to be between PyTorch_Flow and TensorRT_Flow
    PyTorch_Flow ~~~ Output_Results

    %% Model checkpoint format
    classDef checkpoint fill:#ff1,stroke:#333,stroke-width:2px;
    class Checkpoint checkpoint;

    %% CLI tools format
    classDef cli fill:#f9f,stroke:#333,stroke-width:2px;
    class CLI cli;

    %% TRT flow format
    classDef trt fill:#bbf,stroke:#333,stroke-width:2px;
    class trtllmExecutor,TRTGraph,Plugins,Engine,Executor,cudaKernel trt;

    %% PyTorch flow format
    classDef pytorch fill:#8bf,stroke:#333,stroke-width:2px;
    class PyExecutor,PyEngine,CustomOps,PyTorchOps,KernelLibs,PyScheduler,PyDecoder,CUDAKernel pytorch;

    %% Shared Component format
    classDef component fill:#fc8,stroke:#333,stroke-width:2px;
    class Shared_Decoder,Sampling,Shared_Scheduler,BatchManager,KVCache component;

    %% APIs format
    classDef api fill:#bfb,stroke:#333,stroke-width:2px;
    class PythonAPI,CppAPI,LLMAPI api;

    %% Results format
    classDef result fill:#fbb,stroke:#333,stroke-width:2px;
    class Tokens,Stats,Metrics result;
```

constraints.txt

Lines changed: 1 addition & 4 deletions
```diff
@@ -1,5 +1,2 @@
-# These vulnerabilities were inherited from the base image (pytorch:25.06-py3) and should be removed when the base image
+# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
 # is updated.
-
-# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
-protobuf>=4.25.8
```

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 21 additions & 0 deletions
```diff
@@ -871,6 +871,13 @@ class WindowBlockManager
         return mIsValidStoreForReuseSequence.at(requestId);
     }
 
+    void resetReuseState()
+    {
+        std::lock_guard<std::mutex> lock(mCachedBlocksRootMutex);
+        mCachedBlocksRoot
+            = std::make_shared<KVCacheBlock>(KVCacheBlock::kCachedBlocksRootId, tensorrt_llm::kernels::KVCacheIndex{0});
+    }
+
 private:
     //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
     void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
```
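The reset is a pointer swap rather than a tree walk: replacing `mCachedBlocksRoot` under the mutex makes all future reuse lookups start from an empty tree, while any block still referenced elsewhere stays alive through its `shared_ptr` owners. A minimal standalone sketch of that pattern, using illustrative `Node`/`ReuseTree` types rather than the TensorRT-LLM classes:

```cpp
#include <memory>
#include <mutex>

// Illustrative stand-in for a cached-block tree node; not a TensorRT-LLM type.
struct Node
{
    int id;
    explicit Node(int id_)
        : id(id_)
    {
    }
};

class ReuseTree
{
public:
    // Invalidate every cached entry by swapping in a fresh root.
    // Holders of shared_ptrs into the old tree keep their nodes alive
    // until they release them, so in-flight users are unaffected.
    void reset()
    {
        std::lock_guard<std::mutex> lock(mRootMutex);
        mRoot = std::make_shared<Node>(kRootId);
    }

private:
    static constexpr int kRootId = -1;
    std::mutex mRootMutex;
    std::shared_ptr<Node> mRoot = std::make_shared<Node>(kRootId);
};
```

`BlockManager::resetReuseState()` in the next hunk then simply fans this out to each per-window manager.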
```diff
@@ -1347,6 +1354,14 @@ class BlockManager
         return mWindowBlockManagers.at(windowSize).isSequenceValidForStoreForReuse(requestId);
     }
 
+    void resetReuseState()
+    {
+        for (auto& [windowSize, manager] : mWindowBlockManagers)
+        {
+            manager.resetReuseState();
+        }
+    }
+
 private:
     [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
     {
@@ -1533,6 +1548,7 @@ class BaseKVCacheManager
 
     virtual void refreshBlocks() = 0;
     virtual void flushIterationEvents() = 0;
+    virtual void resetReuseState() = 0;
 
     [[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
 
@@ -1913,6 +1929,11 @@ class KVCacheManager : public BaseKVCacheManager
         return mBlockManager.findBlocksInReuseTreeByBlockKey(blockKey, windowSize);
     }
 
+    void resetReuseState() override
+    {
+        mBlockManager.resetReuseState();
+    }
+
     /// @brief Finds the maximum attention window that can be used on a sequence, given some kv-cache block capacity.
     ///
     /// @param inputLength The number of input tokens in the sequence.
```
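Taken together, the new method is threaded through every level of the hierarchy: `WindowBlockManager` owns the actual root swap, `BlockManager` fans out across window sizes, `BaseKVCacheManager` declares the pure virtual, and `KVCacheManager` forwards to its `BlockManager`. A hypothetical call site is sketched below; the helper function and the weight-refresh scenario are illustrative rather than part of this commit, and the namespace is assumed from the header's usual layout:

```cpp
#include "tensorrt_llm/batch_manager/kvCacheManager.h"

using tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager;

// Hypothetical helper: after an event that makes cached KV blocks stale
// (e.g. a weight refresh), drop the reuse state so later requests rebuild
// the reuse tree from scratch instead of matching outdated blocks.
void invalidateKvReuse(BaseKVCacheManager& cacheManager)
{
    cacheManager.resetReuseState();
}
```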
