Skip to content
This repository was archived by the owner on Apr 23, 2020. It is now read-only.

Commit c16d7ee

Browse files
committed
[MCA][LSUnit] Track loads and stores until retirement.
Before this patch, loads and stores were only tracked by their corresponding queues in the LSUnit from dispatch until execute stage. In practice we should be more conservative and assume that memory opcodes leave their queues at retirement stage. Basically, loads should leave the load queue only when they have completed and delivered their data. We conservatively assume that a load is completed when it is retired. Stores should be tracked by the store queue from dispatch until retirement. In practice, stores can only leave the store queue if their data can be written to the data cache. This is mostly a mechanical change. With this patch, the retire stage notifies the LSUnit when a memory instruction is retired. That triggers the release of LDQ/STQ entries. The only visible change is in memory tests for the bdver2 model. That is because bdver2 is the only model that defines the load/store queue size. This patch partially addresses PR39830. Differential Revision: https://reviews.llvm.org/D68266 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@374034 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent b8534ab commit c16d7ee

File tree

8 files changed

+96
-89
lines changed

8 files changed

+96
-89
lines changed

Diff for: include/llvm/MCA/HardwareUnits/LSUnit.h

+6-4
Original file line numberDiff line numberDiff line change
@@ -291,9 +291,14 @@ class LSUnitBase : public HardwareUnit {
291291
return NextGroupID++;
292292
}
293293

294-
// Instruction executed event handlers.
295294
virtual void onInstructionExecuted(const InstRef &IR);
296295

296+
// Loads are tracked by the LDQ (load queue) from dispatch until completion.
297+
// Stores are tracked by the STQ (store queue) from dispatch until commitment.
298+
// By default we conservatively assume that the LDQ receives a load at
299+
// dispatch. Loads leave the LDQ at retirement stage.
300+
virtual void onInstructionRetired(const InstRef &IR);
301+
297302
virtual void onInstructionIssued(const InstRef &IR) {
298303
unsigned GroupID = IR.getInstruction()->getLSUTokenID();
299304
Groups[GroupID]->onInstructionIssued(IR);
@@ -438,9 +443,6 @@ class LSUnit : public LSUnitBase {
438443
/// 6. A store has to wait until an older store barrier is fully executed.
439444
unsigned dispatch(const InstRef &IR) override;
440445

441-
// FIXME: For simplicity, we optimistically assume a similar behavior for
442-
// store instructions. In practice, store operations don't tend to leave the
443-
// store queue until they reach the 'Retired' stage (See PR39830).
444446
void onInstructionExecuted(const InstRef &IR) override;
445447
};
446448

Diff for: include/llvm/MCA/Stages/RetireStage.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#ifndef LLVM_MCA_RETIRE_STAGE_H
1717
#define LLVM_MCA_RETIRE_STAGE_H
1818

19+
#include "llvm/MCA/HardwareUnits/LSUnit.h"
1920
#include "llvm/MCA/HardwareUnits/RegisterFile.h"
2021
#include "llvm/MCA/HardwareUnits/RetireControlUnit.h"
2122
#include "llvm/MCA/Stages/Stage.h"
@@ -27,13 +28,14 @@ class RetireStage final : public Stage {
2728
// Owner will go away when we move listeners/eventing to the stages.
2829
RetireControlUnit &RCU;
2930
RegisterFile &PRF;
31+
LSUnitBase &LSU;
3032

3133
RetireStage(const RetireStage &Other) = delete;
3234
RetireStage &operator=(const RetireStage &Other) = delete;
3335

3436
public:
35-
RetireStage(RetireControlUnit &R, RegisterFile &F)
36-
: Stage(), RCU(R), PRF(F) {}
37+
RetireStage(RetireControlUnit &R, RegisterFile &F, LSUnitBase &LS)
38+
: Stage(), RCU(R), PRF(F), LSU(LS) {}
3739

3840
bool hasWorkToComplete() const override { return !RCU.isEmpty(); }
3941
Error cycleStart() override;

Diff for: lib/MCA/Context.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Context::createDefaultPipeline(const PipelineOptions &Opts, SourceMgr &SrcMgr) {
4444
*RCU, *PRF);
4545
auto Execute =
4646
std::make_unique<ExecuteStage>(*HWS, Opts.EnableBottleneckAnalysis);
47-
auto Retire = std::make_unique<RetireStage>(*RCU, *PRF);
47+
auto Retire = std::make_unique<RetireStage>(*RCU, *PRF, *LSU);
4848

4949
// Pass the ownership of all the hardware units to this Context.
5050
addHardwareUnit(std::move(RCU));

Diff for: lib/MCA/HardwareUnits/LSUnit.cpp

+9-7
Original file line numberDiff line numberDiff line change
@@ -160,17 +160,19 @@ LSUnit::Status LSUnit::isAvailable(const InstRef &IR) const {
160160
}
161161

162162
void LSUnitBase::onInstructionExecuted(const InstRef &IR) {
163-
const InstrDesc &Desc = IR.getInstruction()->getDesc();
164-
bool IsALoad = Desc.MayLoad;
165-
bool IsAStore = Desc.MayStore;
166-
assert((IsALoad || IsAStore) && "Expected a memory operation!");
167-
168163
unsigned GroupID = IR.getInstruction()->getLSUTokenID();
169164
auto It = Groups.find(GroupID);
165+
assert(It != Groups.end() && "Instruction not dispatched to the LS unit");
170166
It->second->onInstructionExecuted();
171-
if (It->second->isExecuted()) {
167+
if (It->second->isExecuted())
172168
Groups.erase(It);
173-
}
169+
}
170+
171+
void LSUnitBase::onInstructionRetired(const InstRef &IR) {
172+
const InstrDesc &Desc = IR.getInstruction()->getDesc();
173+
bool IsALoad = Desc.MayLoad;
174+
bool IsAStore = Desc.MayStore;
175+
assert((IsALoad || IsAStore) && "Expected a memory operation!");
174176

175177
if (IsALoad) {
176178
releaseLQSlot();

Diff for: lib/MCA/Stages/RetireStage.cpp

+4
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ void RetireStage::notifyInstructionRetired(const InstRef &IR) const {
5252
llvm::SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
5353
const Instruction &Inst = *IR.getInstruction();
5454

55+
// Release the load/store queue entries.
56+
if (Inst.isMemOp())
57+
LSU.onInstructionRetired(IR);
58+
5559
for (const WriteState &WS : Inst.getDefs())
5660
PRF.removeRegisterWrite(WS, FreedRegs);
5761
notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));

Diff for: test/tools/llvm-mca/X86/BdVer2/load-store-throughput.s

+15-15
Original file line numberDiff line numberDiff line change
@@ -507,12 +507,12 @@ movaps %xmm3, (%rbx)
507507

508508
# CHECK: Iterations: 100
509509
# CHECK-NEXT: Instructions: 400
510-
# CHECK-NEXT: Total Cycles: 593
510+
# CHECK-NEXT: Total Cycles: 554
511511
# CHECK-NEXT: Total uOps: 400
512512

513513
# CHECK: Dispatch Width: 4
514-
# CHECK-NEXT: uOps Per Cycle: 0.67
515-
# CHECK-NEXT: IPC: 0.67
514+
# CHECK-NEXT: uOps Per Cycle: 0.72
515+
# CHECK-NEXT: IPC: 0.72
516516
# CHECK-NEXT: Block RThroughput: 4.0
517517

518518
# CHECK: Instruction Info:
@@ -532,24 +532,24 @@ movaps %xmm3, (%rbx)
532532
# CHECK: Dynamic Dispatch Stall Cycles:
533533
# CHECK-NEXT: RAT - Register unavailable: 0
534534
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
535-
# CHECK-NEXT: SCHEDQ - Scheduler full: 187 (31.5%)
535+
# CHECK-NEXT: SCHEDQ - Scheduler full: 55 (9.9%)
536536
# CHECK-NEXT: LQ - Load queue full: 0
537-
# CHECK-NEXT: SQ - Store queue full: 342 (57.7%)
537+
# CHECK-NEXT: SQ - Store queue full: 437 (78.9%)
538538
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
539539

540540
# CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched:
541541
# CHECK-NEXT: [# dispatched], [# cycles]
542-
# CHECK-NEXT: 0, 403 (68.0%)
543-
# CHECK-NEXT: 1, 90 (15.2%)
544-
# CHECK-NEXT: 2, 2 (0.3%)
545-
# CHECK-NEXT: 3, 86 (14.5%)
546-
# CHECK-NEXT: 4, 12 (2.0%)
542+
# CHECK-NEXT: 0, 365 (65.9%)
543+
# CHECK-NEXT: 1, 88 (15.9%)
544+
# CHECK-NEXT: 2, 3 (0.5%)
545+
# CHECK-NEXT: 3, 86 (15.5%)
546+
# CHECK-NEXT: 4, 12 (2.2%)
547547

548548
# CHECK: Schedulers - number of cycles where we saw N micro opcodes issued:
549549
# CHECK-NEXT: [# issued], [# cycles]
550-
# CHECK-NEXT: 0, 292 (49.2%)
551-
# CHECK-NEXT: 1, 202 (34.1%)
552-
# CHECK-NEXT: 2, 99 (16.7%)
550+
# CHECK-NEXT: 0, 253 (45.7%)
551+
# CHECK-NEXT: 1, 202 (36.5%)
552+
# CHECK-NEXT: 2, 99 (17.9%)
553553

554554
# CHECK: Scheduler's queue usage:
555555
# CHECK-NEXT: [1] Resource name.
@@ -595,8 +595,8 @@ movaps %xmm3, (%rbx)
595595
# CHECK: Resource pressure by instruction:
596596
# CHECK-NEXT: [0.0] [0.1] [1] [2] [3] [4] [5] [6] [7.0] [7.1] [8.0] [8.1] [9] [10] [11] [12] [13] [14] [15] [16.0] [16.1] [17] [18] Instructions:
597597
# CHECK-NEXT: - 1.00 - - - - - - - - - - - 1.00 - - - 3.00 - - - - 1.00 movd %mm0, (%rax)
598-
# CHECK-NEXT: 0.36 2.64 - - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - movd (%rcx), %mm1
599-
# CHECK-NEXT: 2.64 0.36 - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - - movd (%rdx), %mm2
598+
# CHECK-NEXT: 1.53 1.47 - - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - movd (%rcx), %mm1
599+
# CHECK-NEXT: 1.47 1.53 - - - - - - - - 3.00 - - - 1.00 - - - - 3.00 - - - movd (%rdx), %mm2
600600
# CHECK-NEXT: 1.00 - - - - - - - - - - - - 1.00 - - 3.00 - - - - - 1.00 movd %mm3, (%rbx)
601601

602602
# CHECK: Timeline view:

Diff for: test/tools/llvm-mca/X86/BdVer2/load-throughput.s

+22-22
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ vmovaps (%rbx), %ymm3
8080
# CHECK-NEXT: RAT - Register unavailable: 0
8181
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
8282
# CHECK-NEXT: SCHEDQ - Scheduler full: 0
83-
# CHECK-NEXT: LQ - Load queue full: 353 (86.9%)
83+
# CHECK-NEXT: LQ - Load queue full: 354 (87.2%)
8484
# CHECK-NEXT: SQ - Store queue full: 0
8585
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
8686

@@ -102,9 +102,9 @@ vmovaps (%rbx), %ymm3
102102
# CHECK-NEXT: [4] Total number of buffer entries.
103103

104104
# CHECK: [1] [2] [3] [4]
105-
# CHECK-NEXT: PdEX 32 36 40
105+
# CHECK-NEXT: PdEX 31 34 40
106106
# CHECK-NEXT: PdFPU 0 0 64
107-
# CHECK-NEXT: PdLoad 37 40 40
107+
# CHECK-NEXT: PdLoad 36 40 40
108108
# CHECK-NEXT: PdStore 0 0 24
109109

110110
# CHECK: Resources:
@@ -193,7 +193,7 @@ vmovaps (%rbx), %ymm3
193193
# CHECK-NEXT: RAT - Register unavailable: 0
194194
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
195195
# CHECK-NEXT: SCHEDQ - Scheduler full: 0
196-
# CHECK-NEXT: LQ - Load queue full: 353 (86.9%)
196+
# CHECK-NEXT: LQ - Load queue full: 354 (87.2%)
197197
# CHECK-NEXT: SQ - Store queue full: 0
198198
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
199199

@@ -215,9 +215,9 @@ vmovaps (%rbx), %ymm3
215215
# CHECK-NEXT: [4] Total number of buffer entries.
216216

217217
# CHECK: [1] [2] [3] [4]
218-
# CHECK-NEXT: PdEX 32 36 40
218+
# CHECK-NEXT: PdEX 31 34 40
219219
# CHECK-NEXT: PdFPU 0 0 64
220-
# CHECK-NEXT: PdLoad 37 40 40
220+
# CHECK-NEXT: PdLoad 36 40 40
221221
# CHECK-NEXT: PdStore 0 0 24
222222

223223
# CHECK: Resources:
@@ -306,7 +306,7 @@ vmovaps (%rbx), %ymm3
306306
# CHECK-NEXT: RAT - Register unavailable: 0
307307
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
308308
# CHECK-NEXT: SCHEDQ - Scheduler full: 0
309-
# CHECK-NEXT: LQ - Load queue full: 353 (86.9%)
309+
# CHECK-NEXT: LQ - Load queue full: 354 (87.2%)
310310
# CHECK-NEXT: SQ - Store queue full: 0
311311
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
312312

@@ -328,9 +328,9 @@ vmovaps (%rbx), %ymm3
328328
# CHECK-NEXT: [4] Total number of buffer entries.
329329

330330
# CHECK: [1] [2] [3] [4]
331-
# CHECK-NEXT: PdEX 32 36 40
331+
# CHECK-NEXT: PdEX 31 34 40
332332
# CHECK-NEXT: PdFPU 0 0 64
333-
# CHECK-NEXT: PdLoad 37 40 40
333+
# CHECK-NEXT: PdLoad 36 40 40
334334
# CHECK-NEXT: PdStore 0 0 24
335335

336336
# CHECK: Resources:
@@ -419,7 +419,7 @@ vmovaps (%rbx), %ymm3
419419
# CHECK-NEXT: RAT - Register unavailable: 0
420420
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
421421
# CHECK-NEXT: SCHEDQ - Scheduler full: 0
422-
# CHECK-NEXT: LQ - Load queue full: 353 (86.9%)
422+
# CHECK-NEXT: LQ - Load queue full: 354 (87.2%)
423423
# CHECK-NEXT: SQ - Store queue full: 0
424424
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
425425

@@ -441,9 +441,9 @@ vmovaps (%rbx), %ymm3
441441
# CHECK-NEXT: [4] Total number of buffer entries.
442442

443443
# CHECK: [1] [2] [3] [4]
444-
# CHECK-NEXT: PdEX 32 36 40
444+
# CHECK-NEXT: PdEX 31 34 40
445445
# CHECK-NEXT: PdFPU 0 0 64
446-
# CHECK-NEXT: PdLoad 37 40 40
446+
# CHECK-NEXT: PdLoad 36 40 40
447447
# CHECK-NEXT: PdStore 0 0 24
448448

449449
# CHECK: Resources:
@@ -532,7 +532,7 @@ vmovaps (%rbx), %ymm3
532532
# CHECK-NEXT: RAT - Register unavailable: 0
533533
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
534534
# CHECK-NEXT: SCHEDQ - Scheduler full: 0
535-
# CHECK-NEXT: LQ - Load queue full: 532 (87.9%)
535+
# CHECK-NEXT: LQ - Load queue full: 533 (88.1%)
536536
# CHECK-NEXT: SQ - Store queue full: 0
537537
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
538538

@@ -554,8 +554,8 @@ vmovaps (%rbx), %ymm3
554554
# CHECK-NEXT: [4] Total number of buffer entries.
555555

556556
# CHECK: [1] [2] [3] [4]
557-
# CHECK-NEXT: PdEX 34 38 40
558-
# CHECK-NEXT: PdFPU 34 38 64
557+
# CHECK-NEXT: PdEX 33 36 40
558+
# CHECK-NEXT: PdFPU 33 36 64
559559
# CHECK-NEXT: PdLoad 37 40 40
560560
# CHECK-NEXT: PdStore 0 0 24
561561

@@ -646,7 +646,7 @@ vmovaps (%rbx), %ymm3
646646
# CHECK-NEXT: RAT - Register unavailable: 0
647647
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
648648
# CHECK-NEXT: SCHEDQ - Scheduler full: 0
649-
# CHECK-NEXT: LQ - Load queue full: 532 (87.9%)
649+
# CHECK-NEXT: LQ - Load queue full: 533 (88.1%)
650650
# CHECK-NEXT: SQ - Store queue full: 0
651651
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
652652

@@ -668,8 +668,8 @@ vmovaps (%rbx), %ymm3
668668
# CHECK-NEXT: [4] Total number of buffer entries.
669669

670670
# CHECK: [1] [2] [3] [4]
671-
# CHECK-NEXT: PdEX 34 38 40
672-
# CHECK-NEXT: PdFPU 34 38 64
671+
# CHECK-NEXT: PdEX 33 36 40
672+
# CHECK-NEXT: PdFPU 33 36 64
673673
# CHECK-NEXT: PdLoad 37 40 40
674674
# CHECK-NEXT: PdStore 0 0 24
675675

@@ -760,7 +760,7 @@ vmovaps (%rbx), %ymm3
760760
# CHECK-NEXT: RAT - Register unavailable: 0
761761
# CHECK-NEXT: RCU - Retire tokens unavailable: 0
762762
# CHECK-NEXT: SCHEDQ - Scheduler full: 0
763-
# CHECK-NEXT: LQ - Load queue full: 344 (56.9%)
763+
# CHECK-NEXT: LQ - Load queue full: 345 (57.0%)
764764
# CHECK-NEXT: SQ - Store queue full: 0
765765
# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 0
766766

@@ -781,9 +781,9 @@ vmovaps (%rbx), %ymm3
781781
# CHECK-NEXT: [4] Total number of buffer entries.
782782

783783
# CHECK: [1] [2] [3] [4]
784-
# CHECK-NEXT: PdEX 33 38 40
785-
# CHECK-NEXT: PdFPU 33 38 64
786-
# CHECK-NEXT: PdLoad 37 40 40
784+
# CHECK-NEXT: PdEX 33 36 40
785+
# CHECK-NEXT: PdFPU 33 36 64
786+
# CHECK-NEXT: PdLoad 36 40 40
787787
# CHECK-NEXT: PdStore 0 0 24
788788

789789
# CHECK: Resources:

0 commit comments

Comments
 (0)