[Experimental] Update deep tiled matmul to parallelize graphs on multiple NUMA nodes #153

Open · wants to merge 5 commits into base: zhicong/run_pipeline_with_tuner
Changes from 1 commit
update deeptile
ZhangYan committed Jul 9, 2024

Verified: this commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 5f56b96f0b68f13d77742be4af1eb6ed6d30e41d
24 changes: 23 additions & 1 deletion include/gc/Analysis/MatmulConfigAnalysis.h
@@ -28,11 +28,29 @@ struct SystemDesc {
// get runtime OMP_NUM_THREADS
uint32_t getNumThreads() {
char *numThreads = getenv("OMP_NUM_THREADS");
if (numThreads) {
if (!threads_limited && numThreads) {
return std::stoi(numThreads);
}
return curThreads;
}

// limit the usable threads to a single NUMA node (reads NUMA_THREADS);
// the numa_node argument is currently unused
void limitOnSingleNode(uint32_t numa_node) {
char *numaThreads = getenv("NUMA_THREADS");
if (numaThreads) {
curThreads = std::stoi(numaThreads);
threads_limited = true;
}
}

uint32_t getNumNodes() {
char *numThreads = getenv("OMP_NUM_THREADS");
if (threads_limited && numThreads) {
return std::stoi(numThreads) / curThreads;
}
return 1;
}

// get cache size by cacheLevel
size_t getCacheSize(uint8_t cacheLevel) {
if (cacheLevel == 1) {
@@ -57,6 +75,10 @@ struct SystemDesc {
SmallVector<size_t> getContractionOperationMaxVectorLength() {
return {512UL, 512UL};
}

private:
uint32_t curThreads = 1;
bool threads_limited = false;
};

struct MatmulConfig {
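
For reference, a minimal sketch (not part of this commit) of how the new thread accounting behaves. It assumes OMP_NUM_THREADS=56 and NUMA_THREADS=28 are exported before running, and the include path and namespace qualifiers for SystemDesc are assumptions:

#include "gc/Analysis/MatmulConfigAnalysis.h" // assumed include path
#include <iostream>

int main() {
  SystemDesc sysDesc;
  std::cout << sysDesc.getNumThreads() << "\n"; // 56: OMP_NUM_THREADS, not limited yet
  std::cout << sysDesc.getNumNodes() << "\n";   // 1: threads are not limited
  sysDesc.limitOnSingleNode(/*numa_node=*/0);   // numa_node is currently unused
  std::cout << sysDesc.getNumThreads() << "\n"; // 28: NUMA_THREADS
  std::cout << sysDesc.getNumNodes() << "\n";   // 2: 56 / 28
  return 0;
}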
6 changes: 6 additions & 0 deletions lib/gc/Analysis/MatmulConfigAnalysis.cpp
@@ -345,6 +345,12 @@ previous matmul
MatmulConfigAnalysis::MatmulConfigAnalysis(Operation *root) {
SystemDesc sysDesc;
if (auto linalgOp = dyn_cast<linalg::LinalgOp>(root)) {
// Check if the operation has an attribute named 'splited'
auto splitedAttr = linalgOp->getAttrOfType<IntegerAttr>("splited");
if (splitedAttr) {
sysDesc.limitOnSingleNode(splitedAttr.getInt());
llvm::outs() << "split matmul; should be allocated on NUMA node 0.\n";
}
auto oprandDimType = *getOprandDimType(linalgOp);
// get the origin M,N,K size
auto MDimTypeIdx = extractDimTypeIdx(oprandDimType[0], DimType::M);
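
A hypothetical illustration (not in this commit) of how the "splited" integer attribute could be attached to a matmul so that this analysis picks it up; the helper name and the chosen attribute width are assumptions:

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/IR/Builders.h"

// Tag a matmul so MatmulConfigAnalysis limits it to a single NUMA node.
// Note: limitOnSingleNode currently ignores the node id itself.
static void tagMatmulForSingleNode(mlir::linalg::LinalgOp matmulOp,
                                   int32_t numaNode) {
  mlir::OpBuilder b(matmulOp->getContext());
  matmulOp->setAttr("splited", b.getI32IntegerAttr(numaNode));
}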
7 changes: 6 additions & 1 deletion lib/gc/Transforms/DeepTileContractionNamedOp.cpp
@@ -471,6 +471,7 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
else
tileSizes[d] = getAsIndexOpFoldResult(b.getContext(), tile);
}

SmallVector<Range> loopRanges =
cast<TilingInterface>(currentOp.getOperation()).getIterationDomain(b);
OpBuilder::InsertionGuard guard(b);
@@ -482,7 +483,6 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
tileSizes[idx] = loopRanges[idx].size;
}
}

SmallVector<OpFoldResult> newParallelDims;
for (auto i = 0UL; i < reductionDims.size(); i++) {
newParallelDims.push_back(getAsIndexOpFoldResult(b.getContext(), i));
@@ -595,6 +595,11 @@ struct deepTileMatmul : public OpInterfaceRewritePattern<linalg::LinalgOp> {
auto NOuterBlockSize = NDimPos.size() > 1
? (cfg.NBlock - 1) / cfg.innerMostNBlock + 1
: cfg.NBlock;
// Outermost NUMA loop: split the first M dimension in half, one half per NUMA node
option.nestedTileSizes.emplace_back(
SmallVector<size_t>{uint32_t(MFirstDim / 2)});
option.loopType.emplace_back(OuterLoopGenerationOption::LoopType::ForallOp);
option.loopDim.emplace_back(SmallVector<size_t>{MDimPos[0]});
// Outer
option.nestedTileSizes.emplace_back(SmallVector<size_t>{
MParallelBlockSize, NParallelBlockSize, KParallelBlockSize});
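
Purely illustrative (not part of this commit): the per-node tile that the outermost forall above receives. The divisor 2 hard-codes two NUMA nodes; with OMP_NUM_THREADS=56 and NUMA_THREADS=28 exported, the SystemDesc::getNumNodes() added in this PR reports the same node count. The MFirstDim value, include path, and namespace qualifiers are assumptions:

#include "gc/Analysis/MatmulConfigAnalysis.h" // assumed include path
#include <cstdint>
#include <iostream>

int main() {
  uint32_t MFirstDim = 64;            // assumed outer M iteration count
  uint32_t hardCoded = MFirstDim / 2; // what the diff emplaces: 32
  SystemDesc sysDesc;
  sysDesc.limitOnSingleNode(/*numa_node=*/0);
  uint32_t viaSysDesc = MFirstDim / sysDesc.getNumNodes(); // 64 / 2 = 32
  std::cout << hardCoded << " " << viaSysDesc << "\n";
  return 0;
}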
74 changes: 64 additions & 10 deletions lib/gc/Transforms/Tiling.cpp
@@ -782,6 +782,22 @@ FailureOr<TiledLinalgOp> static tileLinalgOpImpl(
return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector, options);
}

FailureOr<TilingResult>
getTiledImplementationOnNuma(Operation *op, OpBuilder &b,
ArrayRef<OpFoldResult> offsets,
ArrayRef<OpFoldResult> sizes) {
// The NUMA level does not actually slice the op: the clone below keeps the
// full operands, and the offsets/sizes arguments are ignored.
Location loc = op->getLoc();
LinalgOp linalgOp = cast<LinalgOp>(op);
SmallVector<Value> valuesToTile = linalgOp->getOperands();

SmallVector<Type> resultTensorTypes =
getTensorOutputTypes(linalgOp, valuesToTile);
Operation *tiledOp = clone(b, linalgOp, resultTensorTypes, valuesToTile);
return TilingResult{{tiledOp}, SmallVector<Value>(tiledOp->getResults())};
}

FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
RewriterBase &b, PartialReductionOpInterface op,
ArrayRef<OpFoldResult> threadNums, ArrayRef<OpFoldResult> tileSizes,
@@ -964,6 +980,16 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
// 4.b. Clone the op and update init operands.
// We cannot use a IRMapping here because it can replace
// different OpOperands with the same value.
// Heuristic: treat this as the outermost NUMA loop when a tile size is
// given for every loop and the first dimension's tile differs from its
// full iteration range.
bool isNumaLoop = false;
if (tileSizes.size() == iterationDomain.size()) {
for (auto [idx, tile] : llvm::enumerate(tileSizes)) {
if (idx == 0 && tileSizes[idx] == iterationDomain[idx].size)
break;
if (idx > 0 && tileSizes[idx] != iterationDomain[idx].size)
break;
isNumaLoop = true;
}
}
Operation *clonedOp = b.clone(*op.getOperation());
b.modifyOpInPlace(clonedOp, [&]() {
for (auto [initOperandPtr, tiledInitValue] : llvm::zip_equal(
@@ -974,17 +1000,32 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
});
// 5. Tile the cloned op and delete the clone.
if (tileSizes.empty() || threadNums.empty()) {
FailureOr<TilingResult> tilingResult =
cast<TilingInterface>(clonedOp).getTiledImplementation(
b, tiledOffsets, tiledSizes);
if (failed(tilingResult))
return clonedOp->emitError("Failed to tile op: ");
if (tilingResult->tiledOps.size() != 1) {
return clonedOp->emitError("expected a single produced tiled op, got ")
<< tilingResult->tiledOps.size();
if (!isNumaLoop) {
FailureOr<TilingResult> tilingResult =
cast<TilingInterface>(clonedOp).getTiledImplementation(
b, tiledOffsets, tiledSizes);
if (failed(tilingResult))
return clonedOp->emitError("Failed to tile op: ");
if (tilingResult->tiledOps.size() != 1) {
return clonedOp->emitError(
"expected a single produced tiled op, got ")
<< tilingResult->tiledOps.size();
}
tiledOp = tilingResult->tiledOps.front();
tilingResults = tilingResult->tiledValues;
} else {
FailureOr<TilingResult> tilingResult = getTiledImplementationOnNuma(
cast<TilingInterface>(clonedOp), b, tiledOffsets, tiledSizes);
if (failed(tilingResult))
return clonedOp->emitError("Failed to tile op: ");
if (tilingResult->tiledOps.size() != 1) {
return clonedOp->emitError(
"expected a single produced tiled op, got ")
<< tilingResult->tiledOps.size();
}
tiledOp = tilingResult->tiledOps.front();
tilingResults = tilingResult->tiledValues;
}
tiledOp = tilingResult->tiledOps.front();
tilingResults = tilingResult->tiledValues;
} else {
LinalgTilingOptions options;
FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
@@ -1039,6 +1080,19 @@ FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
nonZeroDimIdx++;
}
}
// Hard-coded experimental adjustment: if the first result size is the
// constant 16, widen it to 32.
if (auto attr = resultSizesRank[0].dyn_cast<Attribute>()) {
if (auto intAttr = attr.dyn_cast<IntegerAttr>()) {
if (intAttr.getInt() == 16)
resultSizesRank[0] = b.getIndexAttr(32);
}
} else if (auto value = resultSizesRank[0].dyn_cast<Value>()) {
if (auto constantOp = value.getDefiningOp<arith::ConstantOp>()) {
if (auto intAttr = constantOp.getValue().dyn_cast<IntegerAttr>()) {
if (intAttr.getInt() == 16)
resultSizesRank[0] = b.getIndexAttr(32);
}
}
}
if (hasReductionThreads) {
for (auto [parallelDims, redVar] :
llvm::zip(constantNewParallelDims, reductionInductionVars)) {