mirror of
				https://github.com/intel/intel-graphics-compiler.git
				synced 2025-10-30 08:18:26 +08:00 
			
		
		
		
	Enable more aggresive trimming for very large kernels
Enable trimming of small functions, in case kernel far exceedes threshold even after default trimming is performed, to keep compilation time in check.
This commit is contained in:
		| @ -752,7 +752,7 @@ bool ProcessFuncAttributes::runOnModule(Module &M) { | ||||
|         if (shouldAlwaysInline) { | ||||
|           if ((IGC_IS_FLAG_ENABLED(ControlKernelTotalSize) || IGC_IS_FLAG_ENABLED(ControlUnitSize)) && | ||||
|               efs.shouldEnableSubroutine() && efs.isTrimmedFunction(F)) { | ||||
|             if (IGC_IS_FLAG_ENABLED(AddNoInlineToTrimmedFunctions)) { | ||||
|             if (IGC_IS_FLAG_ENABLED(AddNoInlineToTrimmedFunctions) || efs.isLargeKernelThresholdExceeded()) { | ||||
|               SetNoInline(F); | ||||
|             } | ||||
|           } else { | ||||
|  | ||||
| @ -96,6 +96,7 @@ EstimateFunctionSize::EstimateFunctionSize(AnalysisLevel AL, bool EnableStaticPr | ||||
|   ForceInlineStackCallWithImplArg = IGC_IS_FLAG_ENABLED(ForceInlineStackCallWithImplArg); | ||||
|   ControlInlineImplicitArgs = IGC_IS_FLAG_ENABLED(ControlInlineImplicitArgs); | ||||
|   SubroutineThreshold = IGC_GET_FLAG_VALUE(SubroutineThreshold); | ||||
|   LargeKernelThresholdMultiplier = IGC_GET_FLAG_VALUE(LargeKernelThresholdMultiplier); | ||||
|   KernelTotalSizeThreshold = IGC_GET_FLAG_VALUE(KernelTotalSizeThreshold); | ||||
|   ExpandedUnitSizeThreshold = IGC_GET_FLAG_VALUE(ExpandedUnitSizeThreshold); | ||||
|   if (EnableStaticProfileGuidedTrimming) { | ||||
| @ -1132,6 +1133,16 @@ void EstimateFunctionSize::reduceKernelSize() { | ||||
|  | ||||
| bool EstimateFunctionSize::isTrimmedFunction(llvm::Function *F) { return get<FunctionNode>(F)->isTrimmed(); } | ||||
|  | ||||
| bool EstimateFunctionSize::isLargeKernelThresholdExceeded() const { | ||||
|   for (auto *node : kernelEntries) { | ||||
|     auto *kernelNode = (FunctionNode *)node; | ||||
|     if (kernelNode->ExpandedSize > KernelTotalSizeThreshold * LargeKernelThresholdMultiplier) { | ||||
|       return true; | ||||
|     } | ||||
|   } | ||||
|   return false; | ||||
| } | ||||
|  | ||||
| // Initialize data structures for topological traversal: FunctionsInKernel and BottomUpQueue. | ||||
| // FunctionsInKernel is a map data structure where the key is FunctionNode and value is the number of edges to callee | ||||
| // nodes. FunctionsInKernel is primarily used for topological traversal and also used to check whether a function is in | ||||
| @ -1469,6 +1480,7 @@ void EstimateFunctionSize::reduceCompilationUnitSize() { | ||||
|  | ||||
| // Top down traverse to find and retrieve functions that meet trimming criteria | ||||
| void EstimateFunctionSize::getFunctionsToTrim(llvm::Function *root, llvm::SmallVector<void *, 64> &trimming_pool, | ||||
|                                               llvm::SmallVector<void *, 64> &tiny_fn_trimming_pool, | ||||
|                                               bool ignoreStackCallBoundary, uint32_t &func_cnt) { | ||||
|   FunctionNode *unitHead = get<FunctionNode>(root); | ||||
|   std::unordered_set<void *> visit; | ||||
| @ -1529,6 +1541,8 @@ void EstimateFunctionSize::getFunctionsToTrim(llvm::Function *root, llvm::SmallV | ||||
|       Node->dumpFuncInfo(0x4, "Good to trim (Big enough > " + std::to_string(tinySizeThreshold) + ")"); | ||||
|       break; | ||||
|     case FT_TOO_TINY: | ||||
|       // Small functions will be trimmed in special case if kernel still far exceeds threshold | ||||
|       tiny_fn_trimming_pool.push_back(Node); | ||||
|       Node->dumpFuncInfo(0x4, "Can't trim (Too tiny < " + std::to_string(tinySizeThreshold) + ")"); | ||||
|       break; | ||||
|     case FT_HIGHER_WEIGHT: | ||||
| @ -1601,9 +1615,9 @@ void EstimateFunctionSize::trimCompilationUnit(llvm::SmallVector<void *, 64> &un | ||||
|  | ||||
|         SmallVector<void *, 64> | ||||
|             trimming_pool; | ||||
|  | ||||
|     SmallVector<void *, 64> tiny_fn_trimming_pool; | ||||
|     uint32_t func_cnt = 0; | ||||
|     getFunctionsToTrim(unit->F, trimming_pool, ignoreStackCallBoundary, func_cnt); | ||||
|     getFunctionsToTrim(unit->F, trimming_pool, tiny_fn_trimming_pool, ignoreStackCallBoundary, func_cnt); | ||||
|     PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " has " << trimming_pool.size() | ||||
|                                         << " functions for trimming out of " << func_cnt) if (trimming_pool.empty()) { | ||||
|       PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " size " << unit->ExpandedSize | ||||
| @ -1614,13 +1628,18 @@ void EstimateFunctionSize::trimCompilationUnit(llvm::SmallVector<void *, 64> &un | ||||
|       performGreedyTrimming(unit->F, trimming_pool, threshold, ignoreStackCallBoundary); | ||||
|     } else { | ||||
|       performTrimming(unit->F, trimming_pool, threshold, ignoreStackCallBoundary); | ||||
|       if (ignoreStackCallBoundary && unit->ExpandedSize > threshold * LargeKernelThresholdMultiplier) { | ||||
|         PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << ": Size: " << unit->ExpandedSize | ||||
|                                             << " is much larger than threshold, trimming small functions as well.") | ||||
|             performTrimming(unit->F, tiny_fn_trimming_pool, threshold, ignoreStackCallBoundary); | ||||
|       } | ||||
|     } | ||||
|     if (unit->ExpandedSize < threshold) { | ||||
|       PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << ": The size becomes below threshold") | ||||
|     } else { | ||||
|       PrintTrimUnit(0x2, "Kernel / Unit " | ||||
|                              << unit->F->getName().str() | ||||
|                              << ": The size is still above threhosld even though all candidates are trimmed") | ||||
|                              << ": The size is still above threshold even though all candidates are trimmed") | ||||
|     } | ||||
|  | ||||
|     PrintTrimUnit(0x2, "Kernel / Unit " << unit->F->getName().str() << " final size " << unit->ExpandedSize | ||||
| @ -1712,6 +1731,7 @@ void EstimateFunctionSize::performGreedyTrimming(Function *head, llvm::SmallVect | ||||
|   PrintTrimUnit(0x8, "In total, " << total_trim_cnt << " function(s) are trimmed out of " << functions_to_trim.size()); | ||||
|   return; | ||||
| } | ||||
|  | ||||
| void EstimateFunctionSize::performTrimming(Function *head, llvm::SmallVector<void *, 64> &functions_to_trim, | ||||
|                                            uint32_t threshold, bool ignoreStackCallBoundary) { | ||||
|   FunctionNode *unitHead = get<FunctionNode>(head); | ||||
| @ -1745,7 +1765,7 @@ void EstimateFunctionSize::performTrimming(Function *head, llvm::SmallVector<voi | ||||
|       functionToTrim->setTrimmed(); | ||||
|     } | ||||
|     total_trim_cnt += 1; | ||||
|     // After trimming, update exapnded size | ||||
|     // After trimming, update expanded size | ||||
|     updateExpandedUnitSize(head, ignoreStackCallBoundary); | ||||
|     PrintTrimUnit(0x8, "The kernel size is reduced after trimming from " << original_expandedSize << " to " | ||||
|                                                                          << unitHead->ExpandedSize); | ||||
|  | ||||
| @ -63,6 +63,8 @@ public: | ||||
|  | ||||
|   bool isStackCallAssigned(llvm::Function *F); | ||||
|  | ||||
|   bool isLargeKernelThresholdExceeded() const; | ||||
|  | ||||
| private: | ||||
|   void analyze(); | ||||
|   void checkSubroutine(); | ||||
| @ -92,7 +94,8 @@ private: | ||||
|                              bool ignoreStackCallBoundary); | ||||
|   uint32_t getMaxUnitSize(); | ||||
|   void getFunctionsToTrim(llvm::Function *root, llvm::SmallVector<void *, 64> &trimming_pool, | ||||
|                           bool ignoreStackCallBoundary, uint32_t &func_cnt); | ||||
|                           llvm::SmallVector<void *, 64> &tiny_fn_trimming_pool, bool ignoreStackCallBoundary, | ||||
|                           uint32_t &func_cnt); | ||||
|   void updateStaticFuncFreq(); | ||||
|   void estimateTotalLoopIteration(llvm::Function &F, llvm::LoopInfo *LI); | ||||
|  | ||||
| @ -162,12 +165,13 @@ private: | ||||
|   bool PartitionUnit; | ||||
|   bool StaticProfileGuidedPartitioning; | ||||
|  | ||||
|   // Flags for implcit arguments and external functions | ||||
|   // Flags for implicit arguments and external functions | ||||
|   bool ForceInlineExternalFunctions; | ||||
|   bool ForceInlineStackCallWithImplArg; | ||||
|   bool ControlInlineImplicitArgs; | ||||
|   unsigned SubroutineThreshold; | ||||
|   unsigned KernelTotalSizeThreshold; | ||||
|   unsigned LargeKernelThresholdMultiplier; | ||||
|   unsigned ExpandedUnitSizeThreshold; | ||||
| }; | ||||
|  | ||||
|  | ||||
| @ -1375,6 +1375,8 @@ DECLARE_IGC_REGKEY(bool, AddNoInlineToTrimmedFunctions, false, "Tell late passes | ||||
| DECLARE_IGC_REGKEY(bool, ForceInlineExternalFunctions, false, "not to trim functions called from multiple kernels", | ||||
|                    true) | ||||
| DECLARE_IGC_REGKEY(DWORD, KernelTotalSizeThreshold, 50000, "Trimming target of kernel total size", true) | ||||
| DECLARE_IGC_REGKEY(DWORD, LargeKernelThresholdMultiplier, 13, | ||||
|                    "Multipler to kernel threshold. When exceeded more agressive trimming will be performed", false) | ||||
| DECLARE_IGC_REGKEY(bool, PartitionUnit, false, "Partition compilation unit", true) | ||||
| DECLARE_IGC_REGKEY(DWORD, PrintPartitionUnit, 0, "Print information about compilation unit partitioning", true) | ||||
| DECLARE_IGC_REGKEY(bool, PartitionWithFastHybridRA, false, "Enable FastRA and HybridRA when partition is enabled", true) | ||||
|  | ||||
							
								
								
									
										66
									
								
								IGC/ocloc_tests/features/large_kernel_trimming.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										66
									
								
								IGC/ocloc_tests/features/large_kernel_trimming.cl
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,66 @@ | ||||
| /*========================== begin_copyright_notice ============================ | ||||
|  | ||||
| Copyright (C) 2025 Intel Corporation | ||||
|  | ||||
| SPDX-License-Identifier: MIT | ||||
|  | ||||
| ============================= end_copyright_notice ===========================*/ | ||||
|  | ||||
| // REQUIRES: regkeys, dg2-supported | ||||
|  | ||||
| // Test checks that in case, kernel is above the large kernel kernel threshold small functions will be trimmed as well. | ||||
| // RUN: ocloc compile -file %s -device dg2 -options "-igc_opts 'SubroutineThreshold=1,KernelTotalSizeThreshold=1,ControlInlineTinySize=15,PrintControlKernelTotalSize=15'" 2>&1 | FileCheck %s | ||||
|  | ||||
| // CHECK: is much larger than threshold, trimming small functions as well. | ||||
| // CHECK: Trim the function, bar2 | ||||
| // CHECK: Trim the function, bar3 | ||||
|  | ||||
| int bar3(__global int *c) { | ||||
|     int k = 10; | ||||
|     for (int i = 0 ; i < 100 ; i++) { | ||||
|         *c += k * i; | ||||
|     } | ||||
|     return k; | ||||
| } | ||||
|  | ||||
| int bar2(__global int *b) { | ||||
|     int k = 10; | ||||
|     for (int i = 0 ; i < 100 ; i++) { | ||||
|         *b += k * bar3(b); | ||||
|     } | ||||
|     return k; | ||||
| } | ||||
|  | ||||
| int bar1(__global int *a) { | ||||
|     int k = 10; | ||||
|     for (int i = 0 ; i < 100 ; i++) { | ||||
|         *a += k * bar2(a); | ||||
|         *a += k * bar3(a); | ||||
|     } | ||||
|     return k; | ||||
| } | ||||
|  | ||||
| __kernel void foo(int __global *p) { | ||||
|     int a = 0; | ||||
|     for (int i = 0; i < 100; i++) { | ||||
|         a += bar1(p); | ||||
|         a += bar2(p); | ||||
|         a += bar3(p); | ||||
|     } | ||||
|     for (int i = 300; i < 500000; i++) { | ||||
|         a += *p; | ||||
|     } | ||||
|     for (int i = 300; i < 500000; i++) { | ||||
|         a += *p; | ||||
|     } | ||||
|     for (int i = 300; i < 500000; i++) { | ||||
|         a += *p; | ||||
|     } | ||||
|     for (int i = 300; i < 500000; i++) { | ||||
|         a += *p; | ||||
|     } | ||||
|     for (int i = 300; i < 500000; i++) { | ||||
|         a += *p; | ||||
|     } | ||||
|     *p = a; | ||||
| } | ||||
		Reference in New Issue
	
	Block a user
	 Stefan Ilic
					Stefan Ilic