
Commit 073aef2

Merge branch 'main' into 2-model-kernels
2 parents: ce7462c + 7a552c4

1,685 files changed (+15,352 / -8,425 lines)


.github/CODEOWNERS

Lines changed: 3 additions & 0 deletions
```diff
@@ -186,6 +186,9 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers
 ## These scripts install and pin dependency versions
 /docker/common/** @NVIDIA/trt-llm-setup-infra-devs @NVIDIA/trt-llm-infra-devs @NVIDIA/trt-llm-oss-compliance
 
+### TAVA Architecture Diagram
+/.github/tava_architecture_diagram.md @NVIDIA/trt-llm-TAVA-design-change
+
 ### CODEOWNERS file itself
 /.github/CODEOWNERS @NVIDIA/trt-llm-gh-workflows-infra-devs @NVIDIA/trt-llm-infra-devs @NVIDIA/trt-llm-oss-compliance
```

.github/pull_request_template.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -49,6 +49,7 @@ Please review the following before submitting your PR:
 - Any new dependencies have been scanned for license and vulnerabilities
 - [CODEOWNERS](https://github.com/NVIDIA/TensorRT-LLM/blob/main/.github/CODEOWNERS) updated if ownership changes
 - Documentation updated as needed
+- Update [tava architecture diagram](https://github.com/NVIDIA/TensorRT-LLM/blob/main/.github/tava_architecture_diagram.md) if there is a significant design change in PR.
 - The reviewers assigned automatically/manually are appropriate for the PR.
 
```

.github/tava_architecture_diagram.md

Lines changed: 108 additions & 0 deletions

This is a new file: the TAVA architecture diagram referenced by the CODEOWNERS and pull request template changes above.
```mermaid
graph TB
    subgraph "User API & CLI Tools"
        CLI[CLI Tools]
        LLMAPI[LLM API]
        CLI --> LLMAPI
    end

    subgraph "Model Checkpoint"
        Checkpoint[Huggingface Models]
        Checkpoint --> CLI
        Checkpoint --> LLMAPI
    end

    subgraph "TensorRT_Flow"
        trtllmExecutor[trtllm.Executor]
        Engine[TensorRT Engine]
        TRTGraph[TensorRT Graph]
        Plugins[TensorRT Plugins]
        cudaKernel[CUDA Kernel]
        Executor[Executor]
        LLMAPI --> trtllmExecutor
        trtllmExecutor --> |build|Engine
        trtllmExecutor --> |compile|TRTGraph
        trtllmExecutor --> |compile|Plugins
        Engine --> Executor
        Plugins --> Executor
        TRTGraph --> Executor
        Plugins --> cudaKernel
    end

    subgraph "PyTorch_Flow"
        PyExecutor[PyExecutor]
        PyEngine[PyTorch Engine]
        CustomOps[Custom Ops]
        PyTorchOps[Pytorch Ops]
        KernelLibs[Kernel Libs]
        PyScheduler[Scheduler]
        PyDecoder[Decoder]
        CUDAKernel[CUDA Kernel]
        LLMAPI --> PyExecutor
        PyExecutor --> PyEngine
        PyEngine --> CustomOps
        PyEngine --> PyTorchOps
        PyEngine --> KernelLibs
        PyEngine --> PyScheduler
        PyEngine --> PyDecoder
        KernelLibs --> CUDAKernel
        CustomOps --> CUDAKernel
    end

    subgraph "Shared_Component"
        Shared_Decoder[Decoder]
        Shared_Scheduler[Scheduler]
        Sampling[Sampling]
        BatchManager[Batch Manager]
        KVCache[KV Cache Manager]
        PyScheduler --> |Pybind|Shared_Scheduler
        PyDecoder --> |Pybind|Shared_Decoder
        Executor --> Shared_Decoder
        Shared_Decoder --> Sampling
        Executor --> Shared_Scheduler
        Shared_Scheduler --> |In-flight Batching| BatchManager
        BatchManager --> KVCache
    end

    subgraph "Output_Results"
        Tokens[Generated Tokens]
        Stats[Performance Stats]
        Metrics[Accuracy Metrics]
    end

    %% PyTorch_Flow ~~~ TensorRT_Flow

    TensorRT_Flow --> Output_Results
    PyTorch_Flow --> Output_Results

    %% Force Output_Results to be between PyTorch_Flow and TensorRT_Flow
    PyTorch_Flow ~~~ Output_Results

    %% Model checkpoint format
    classDef checkpoint fill:#ff1,stroke:#333,stroke-width:2px;
    class Checkpoint checkpoint;

    %% CLI tools format
    classDef cli fill:#f9f,stroke:#333,stroke-width:2px;
    class CLI cli;

    %% TRT flow format
    classDef trt fill:#bbf,stroke:#333,stroke-width:2px;
    class trtllmExecutor,TRTGraph,Plugins,Engine,Executor,cudaKernel trt;

    %% PyTorch flow format
    classDef pytorch fill:#8bf,stroke:#333,stroke-width:2px;
    class PyExecutor,PyEngine,CustomOps,PyTorchOps,KernelLibs,PyScheduler,PyDecoder,CUDAKernel pytorch;

    %% Shared Component format
    classDef component fill:#fc8,stroke:#333,stroke-width:2px;
    class Shared_Decoder,Sampling,Shared_Scheduler,BatchManager,KVCache component;

    %% APIs format
    classDef api fill:#bfb,stroke:#333,stroke-width:2px;
    class PythonAPI,CppAPI,LLMAPI api;

    %% Results format
    classDef result fill:#fbb,stroke:#333,stroke-width:2px;
    class Tokens,Stats,Metrics result;
```

constraints.txt

Lines changed: 1 addition & 4 deletions
```diff
@@ -1,5 +1,2 @@
-# These vulnerabilities were inherited from the base image (pytorch:25.06-py3) and should be removed when the base image
+# These vulnerabilities were inherited from the base image (pytorch:25.10-py3) and should be removed when the base image
 # is updated.
-
-# WAR against https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
-protobuf>=4.25.8
```

cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h

Lines changed: 21 additions & 0 deletions
```diff
@@ -871,6 +871,13 @@ class WindowBlockManager
         return mIsValidStoreForReuseSequence.at(requestId);
     }
 
+    void resetReuseState()
+    {
+        std::lock_guard<std::mutex> lock(mCachedBlocksRootMutex);
+        mCachedBlocksRoot
+            = std::make_shared<KVCacheBlock>(KVCacheBlock::kCachedBlocksRootId, tensorrt_llm::kernels::KVCacheIndex{0});
+    }
+
 private:
     //! \brief Add single block to beam of sequence and mAllocatedBlocksPerSeq.
     void addBlockToBeam(BlockPtr& block, GenerationRequest& sequence, SizeType32 beamIdx);
```
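The reset is a pointer swap rather than a tree walk: replacing `mCachedBlocksRoot` under the mutex makes all future reuse lookups start from an empty tree, while any block still referenced elsewhere stays alive through its `shared_ptr` owners. A minimal standalone sketch of that pattern, using illustrative `Node`/`ReuseTree` types rather than the TensorRT-LLM classes:

```cpp
#include <memory>
#include <mutex>

// Illustrative stand-in for a cached-block tree node; not a TensorRT-LLM type.
struct Node
{
    int id;
    explicit Node(int id_)
        : id(id_)
    {
    }
};

class ReuseTree
{
public:
    // Invalidate every cached entry by swapping in a fresh root.
    // Holders of shared_ptrs into the old tree keep their nodes alive
    // until they release them, so in-flight users are unaffected.
    void reset()
    {
        std::lock_guard<std::mutex> lock(mRootMutex);
        mRoot = std::make_shared<Node>(kRootId);
    }

private:
    static constexpr int kRootId = -1;
    std::mutex mRootMutex;
    std::shared_ptr<Node> mRoot = std::make_shared<Node>(kRootId);
};
```

`BlockManager::resetReuseState()` in the next hunk then simply fans this out to each per-window manager.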
```diff
@@ -1347,6 +1354,14 @@ class BlockManager
         return mWindowBlockManagers.at(windowSize).isSequenceValidForStoreForReuse(requestId);
     }
 
+    void resetReuseState()
+    {
+        for (auto& [windowSize, manager] : mWindowBlockManagers)
+        {
+            manager.resetReuseState();
+        }
+    }
+
 private:
     [[nodiscard]] WindowBlockManager const& windowManagerByLayer(SizeType32 layerIdx) const
     {
@@ -1533,6 +1548,7 @@ class BaseKVCacheManager
 
     virtual void refreshBlocks() = 0;
     virtual void flushIterationEvents() = 0;
+    virtual void resetReuseState() = 0;
 
     [[nodiscard]] static SizeType32 getSinkBubbleLength(SizeType32 sinkTokenLen, SizeType32 tokensPerBlock);
 
@@ -1913,6 +1929,11 @@ class KVCacheManager : public BaseKVCacheManager
         return mBlockManager.findBlocksInReuseTreeByBlockKey(blockKey, windowSize);
     }
 
+    void resetReuseState() override
+    {
+        mBlockManager.resetReuseState();
+    }
+
     /// @brief Finds the maximum attention window that can be used on a sequence, given some kv-cache block capacity.
     ///
     /// @param inputLength The number of input tokens in the sequence.
```
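Taken together, the new method is threaded through every level of the hierarchy: `WindowBlockManager` owns the actual root swap, `BlockManager` fans out across window sizes, `BaseKVCacheManager` declares the pure virtual, and `KVCacheManager` forwards to its `BlockManager`. A hypothetical call site is sketched below; the helper function and the weight-refresh scenario are illustrative rather than part of this commit, and the namespace is assumed from the header's usual layout:

```cpp
#include "tensorrt_llm/batch_manager/kvCacheManager.h"

using tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager;

// Hypothetical helper: after an event that makes cached KV blocks stale
// (e.g. a weight refresh), drop the reuse state so later requests rebuild
// the reuse tree from scratch instead of matching outdated blocks.
void invalidateKvReuse(BaseKVCacheManager& cacheManager)
{
    cacheManager.resetReuseState();
}
```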
