Enable more aggresive trimming for very large kernels

Enable trimming of small functions, in case kernel far exceedes
threshold even after default trimming is performed, to keep compilation
time in check.
This commit is contained in:
Stefan Ilic
2025-10-23 10:01:33 +00:00
committed by igcbot
parent 66e66680f7
commit ef5a973738
5 changed files with 99 additions and 7 deletions

View File

@ -752,7 +752,7 @@ bool ProcessFuncAttributes::runOnModule(Module &M) {
if (shouldAlwaysInline) {
if ((IGC_IS_FLAG_ENABLED(ControlKernelTotalSize) || IGC_IS_FLAG_ENABLED(ControlUnitSize)) &&
efs.shouldEnableSubroutine() && efs.isTrimmedFunction(F)) {
if (IGC_IS_FLAG_ENABLED(AddNoInlineToTrimmedFunctions)) {
if (IGC_IS_FLAG_ENABLED(AddNoInlineToTrimmedFunctions) || efs.isLargeKernelThresholdExceeded()) {
SetNoInline(F);
}
} else {

View File

@ -96,6 +96,7 @@ EstimateFunctionSize::EstimateFunctionSize(AnalysisLevel AL, bool EnableStaticPr
ForceInlineStackCallWithImplArg = IGC_IS_FLAG_ENABLED(ForceInlineStackCallWithImplArg);
ControlInlineImplicitArgs = IGC_IS_FLAG_ENABLED(ControlInlineImplicitArgs);
SubroutineThreshold = IGC_GET_FLAG_VALUE(SubroutineThreshold);
LargeKernelThresholdMultiplier = IGC_GET_FLAG_VALUE(LargeKernelThresholdMultiplier);
KernelTotalSizeThreshold = IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold);
ExpandedUnitSizeThreshold = IGC_GET_FLAG_VALUE(ExpandedUnitSizeThreshold);
if (EnableStaticProfileGuidedTrimming) {
@ -1132,6 +1133,16 @@ void EstimateFunctionSize::reduceKernelSize() {
bool EstimateFunctionSize::isTrimmedFunction(llvm::Function *F) { return get<FunctionNode>(F)->isTrimmed(); }
bool EstimateFunctionSize::isLargeKernelThresholdExceeded() const {
for (auto *node : kernelEntries) {
auto *kernelNode = (FunctionNode *)node;
if (kernelNode->ExpandedSize > KernelTotalSizeThreshold * LargeKernelThresholdMultiplier) {
return true;
}
}
return false;
}
// Initialize data structures for topological traversal: FunctionsInKernel and BottomUpQueue.
// FunctionsInKernel is a map data structure where the key is FunctionNode and value is the number of edges to callee
// nodes. FunctionsInKernel is primarily used for topological traversal and also used to check whether a function is in
@ -1469,6 +1480,7 @@ void EstimateFunctionSize::reduceCompilationUnitSize() {
// Top down traverse to find and retrieve functions that meet trimming criteria
void EstimateFunctionSize::getFunctionsToTrim(llvm::Function *root, llvm::SmallVector<void *, 64> &trimming_pool,
llvm::SmallVector<void *, 64> &tiny_fn_trimming_pool,
bool ignoreStackCallBoundary, uint32_t &func_cnt) {
FunctionNode *unitHead = get<FunctionNode>(root);
std::unordered_set<void *> visit;
@ -1529,6 +1541,8 @@ void EstimateFunctionSize::getFunctionsToTrim(llvm::Function *root, llvm::SmallV
Node->dumpFuncInfo(0x4, "Good to trim (Big enough > " + std::to_string(tinySizeThreshold) + ")");
break;
case FT_TOO_TINY:
// Small functions will be trimmed in special case if kernel still far exceeds threshold
tiny_fn_trimming_pool.push_back(Node);
Node->dumpFuncInfo(0x4, "Can't trim (Too tiny < " + std::to_string(tinySizeThreshold) + ")");
break;
case FT_HIGHER_WEIGHT:
@ -1601,9 +1615,9 @@ void EstimateFunctionSize::trimCompilationUnit(llvm::SmallVector<void *, 64> &un
SmallVector<void *, 64>
trimming_pool;
SmallVector<void *, 64> tiny_fn_trimming_pool;
uint32_t func_cnt = 0;
getFunctionsToTrim(unit->F, trimming_pool, ignoreStackCallBoundary, func_cnt);
getFunctionsToTrim(unit->F, trimming_pool, tiny_fn_trimming_pool, ignoreStackCallBoundary, func_cnt);
PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " has " << trimming_pool.size()
<< " functions for trimming out of " << func_cnt) if (trimming_pool.empty()) {
PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " size " << unit->ExpandedSize
@ -1614,13 +1628,18 @@ void EstimateFunctionSize::trimCompilationUnit(llvm::SmallVector<void *, 64> &un
performGreedyTrimming(unit->F, trimming_pool, threshold, ignoreStackCallBoundary);
} else {
performTrimming(unit->F, trimming_pool, threshold, ignoreStackCallBoundary);
if (ignoreStackCallBoundary && unit->ExpandedSize > threshold * LargeKernelThresholdMultiplier) {
PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << ": Size: " << unit->ExpandedSize
<< " is much larger than threshold, trimming small functions as well.")
performTrimming(unit->F, tiny_fn_trimming_pool, threshold, ignoreStackCallBoundary);
}
}
if (unit->ExpandedSize < threshold) {
PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << ": The size becomes below threshold")
} else {
PrintTrimUnit(0x2, "Kernel / Unit "
<< unit->F->getName().str()
<< ": The size is still above threhosld even though all candidates are trimmed")
<< ": The size is still above threshold even though all candidates are trimmed")
}
PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " final size " << unit->ExpandedSize
@ -1712,6 +1731,7 @@ void EstimateFunctionSize::performGreedyTrimming(Function *head, llvm::SmallVect
PrintTrimUnit(0x8, "In total, " << total_trim_cnt << " function(s) are trimmed out of " << functions_to_trim.size());
return;
}
void EstimateFunctionSize::performTrimming(Function *head, llvm::SmallVector<void *, 64> &functions_to_trim,
uint32_t threshold, bool ignoreStackCallBoundary) {
FunctionNode *unitHead = get<FunctionNode>(head);
@ -1745,7 +1765,7 @@ void EstimateFunctionSize::performTrimming(Function *head, llvm::SmallVector<voi
functionToTrim->setTrimmed();
}
total_trim_cnt += 1;
// After trimming, update exapnded size
// After trimming, update expanded size
updateExpandedUnitSize(head, ignoreStackCallBoundary);
PrintTrimUnit(0x8, "The kernel size is reduced after trimming from " << original_expandedSize << " to "
<< unitHead->ExpandedSize);

View File

@ -63,6 +63,8 @@ public:
bool isStackCallAssigned(llvm::Function *F);
bool isLargeKernelThresholdExceeded() const;
private:
void analyze();
void checkSubroutine();
@ -92,7 +94,8 @@ private:
bool ignoreStackCallBoundary);
uint32_t getMaxUnitSize();
void getFunctionsToTrim(llvm::Function *root, llvm::SmallVector<void *, 64> &trimming_pool,
bool ignoreStackCallBoundary, uint32_t &func_cnt);
llvm::SmallVector<void *, 64> &tiny_fn_trimming_pool, bool ignoreStackCallBoundary,
uint32_t &func_cnt);
void updateStaticFuncFreq();
void estimateTotalLoopIteration(llvm::Function &F, llvm::LoopInfo *LI);
@ -162,12 +165,13 @@ private:
bool PartitionUnit;
bool StaticProfileGuidedPartitioning;
// Flags for implcit arguments and external functions
// Flags for implicit arguments and external functions
bool ForceInlineExternalFunctions;
bool ForceInlineStackCallWithImplArg;
bool ControlInlineImplicitArgs;
unsigned SubroutineThreshold;
unsigned KernelTotalSizeThreshold;
unsigned LargeKernelThresholdMultiplier;
unsigned ExpandedUnitSizeThreshold;
};

View File

@ -1375,6 +1375,8 @@ DECLARE_IGC_REGKEY(bool, AddNoInlineToTrimmedFunctions, false, "Tell late passes
DECLARE_IGC_REGKEY(bool, ForceInlineExternalFunctions, false, "not to trim functions called from multiple kernels",
true)
DECLARE_IGC_REGKEY(DWORD, KernelTotalSizeThreshold, 50000, "Trimming target of kernel total size", true)
DECLARE_IGC_REGKEY(DWORD, LargeKernelThresholdMultiplier, 13,
"Multipler to kernel threshold. When exceeded more agressive trimming will be performed", false)
DECLARE_IGC_REGKEY(bool, PartitionUnit, false, "Partition compilation unit", true)
DECLARE_IGC_REGKEY(DWORD, PrintPartitionUnit, 0, "Print information about compilation unit partitioning", true)
DECLARE_IGC_REGKEY(bool, PartitionWithFastHybridRA, false, "Enable FastRA and HybridRA when partition is enabled", true)

View File

@ -0,0 +1,66 @@
/*========================== begin_copyright_notice ============================
Copyright (C) 2025 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/
// REQUIRES: regkeys, dg2-supported
// Test checks that in case, kernel is above the large kernel kernel threshold small functions will be trimmed as well.
// RUN: ocloc compile -file %s -device dg2 -options "-igc_opts 'SubroutineThreshold=1,KernelTotalSizeThreshold=1,ControlInlineTinySize=15,PrintControlKernelTotalSize=15'" 2>&1 | FileCheck %s
// CHECK: is much larger than threshold, trimming small functions as well.
// CHECK: Trim the function, bar2
// CHECK: Trim the function, bar3
int bar3(__global int *c) {
int k = 10;
for (int i = 0 ; i < 100 ; i++) {
*c += k * i;
}
return k;
}
int bar2(__global int *b) {
int k = 10;
for (int i = 0 ; i < 100 ; i++) {
*b += k * bar3(b);
}
return k;
}
int bar1(__global int *a) {
int k = 10;
for (int i = 0 ; i < 100 ; i++) {
*a += k * bar2(a);
*a += k * bar3(a);
}
return k;
}
__kernel void foo(int __global *p) {
int a = 0;
for (int i = 0; i < 100; i++) {
a += bar1(p);
a += bar2(p);
a += bar3(p);
}
for (int i = 300; i < 500000; i++) {
a += *p;
}
for (int i = 300; i < 500000; i++) {
a += *p;
}
for (int i = 300; i < 500000; i++) {
a += *p;
}
for (int i = 300; i < 500000; i++) {
a += *p;
}
for (int i = 300; i < 500000; i++) {
a += *p;
}
*p = a;
}