mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-19 16:24:18 +08:00
fix: Use proper value about hw local id generations
- remove useless flag ForceNumberOfThreadsInGpgpuThreadGroup - add new flag "RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup" to restore old path without restrictions about number of threads in thread group - fix forwarding information about hw local ids generations to calculate numOfThreadsInThreadGroup correctly Related-To: NEO-7952, NEO-7982 Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
aea5f435db
commit
68d81c82a7
@@ -339,7 +339,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, kernelRequiresGenerationOfLocalIdsByRuntime);
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime);
|
||||
|
||||
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
|
||||
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;
|
||||
|
||||
@@ -120,7 +120,11 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
|
||||
}
|
||||
|
||||
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
|
||||
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
|
||||
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
|
||||
if (enableForcingOfGenerateLocalIdByHw) {
|
||||
kernelRequiresGenerationOfLocalIdsByRuntime = !forceGenerateLocalIdByHw;
|
||||
}
|
||||
}
|
||||
ze_result_t setArgBufferWithAlloc(uint32_t argIndex, uintptr_t argVal, NEO::GraphicsAllocation *allocation, NEO::SvmAllocationData *peerAllocData) override {
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
@@ -135,6 +139,8 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
|
||||
NEO::KernelInfo info;
|
||||
uint32_t printPrintfOutputCalledTimes = 0;
|
||||
bool hangDetectedPassedToPrintfOutput = false;
|
||||
bool enableForcingOfGenerateLocalIdByHw = false;
|
||||
bool forceGenerateLocalIdByHw = false;
|
||||
};
|
||||
|
||||
} // namespace ult
|
||||
|
||||
@@ -243,7 +243,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
auto &gfxCoreHelper = device.getGfxCoreHelper();
|
||||
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
|
||||
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
||||
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, localIdsGenerationByRuntime);
|
||||
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime);
|
||||
|
||||
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||
|
||||
|
||||
@@ -82,6 +82,7 @@ DECLARE_DEBUG_VARIABLE(bool, AppendMemoryPrefetchForKmdMigratedSharedAllocations
|
||||
DECLARE_DEBUG_VARIABLE(bool, ForceMemoryPrefetchForKmdMigratedSharedAllocations, false, "Force prefetch of shared memory in command queue execute command lists")
|
||||
DECLARE_DEBUG_VARIABLE(bool, ClKhrExternalMemoryExtension, false, "Enable cl_khr_external_memory extension")
|
||||
DECLARE_DEBUG_VARIABLE(bool, WaitForMemoryRelease, false, "Wait for memory release when out of memory")
|
||||
DECLARE_DEBUG_VARIABLE(bool, RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup, 0, "0 - default disabled, 1- remove restrictions on NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
|
||||
DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "Override device id in AUB/TBX mode")
|
||||
DECLARE_DEBUG_VARIABLE(std::string, FilterDeviceId, std::string("unk"), "Device id filter, adapter matching device id will be opened; ignored when unk")
|
||||
DECLARE_DEBUG_VARIABLE(std::string, FilterBdfPath, std::string("unk"), "Linux-only, BDF path filter, only matching paths will be opened; ignored when unk")
|
||||
@@ -144,7 +145,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch C
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceNumberOfThreadsInGpgpuThreadGroup, -1, "-1 - default, set NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")
|
||||
|
||||
@@ -168,7 +168,7 @@ class GfxCoreHelper {
|
||||
virtual bool isChipsetUniqueUUIDSupported() const = 0;
|
||||
virtual bool isTimestampShiftRequired() const = 0;
|
||||
virtual bool isRelaxedOrderingSupported() const = 0;
|
||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const = 0;
|
||||
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const = 0;
|
||||
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
|
||||
virtual ~GfxCoreHelper() = default;
|
||||
|
||||
@@ -381,7 +381,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
bool isChipsetUniqueUUIDSupported() const override;
|
||||
bool isTimestampShiftRequired() const override;
|
||||
bool isRelaxedOrderingSupported() const override;
|
||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const override;
|
||||
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const override;
|
||||
|
||||
~GfxCoreHelperHw() override = default;
|
||||
|
||||
|
||||
@@ -688,10 +688,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const {
|
||||
if (DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get() != -1) {
|
||||
return DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get();
|
||||
}
|
||||
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const {
|
||||
return getThreadsPerWG(simd, totalWorkItems);
|
||||
}
|
||||
} // namespace NEO
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
namespace NEO {
|
||||
|
||||
struct MockRootDeviceEnvironment : public RootDeviceEnvironment {
|
||||
using RootDeviceEnvironment::hwInfo;
|
||||
using RootDeviceEnvironment::isDummyAllocationInitialized;
|
||||
using RootDeviceEnvironment::RootDeviceEnvironment;
|
||||
~MockRootDeviceEnvironment() override = default;
|
||||
|
||||
@@ -534,8 +534,8 @@ PrintGlobalTimestampInNs = 0
|
||||
EnableDeviceStateVerification = -1
|
||||
VfBarResourceAllocationWa = 1
|
||||
EnableDynamicPostSyncAllocLayout = -1
|
||||
ForceNumberOfThreadsInGpgpuThreadGroup = -1
|
||||
PrintTimestampPacketUsage = -1
|
||||
TrackNumCsrClientsOnSyncPoints = -1
|
||||
CommandListTimestampRefreshIntervalInMilliSec = -1
|
||||
RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup = 0
|
||||
# Please don't edit below this line
|
||||
|
||||
@@ -1587,15 +1587,23 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST_F(GfxCoreHelperTest, givenFlagForceNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
|
||||
HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.set(1);
|
||||
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
|
||||
|
||||
uint32_t expectedNumThreadsPerThreadGroup = 10u;
|
||||
DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
|
||||
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
|
||||
std::array<std::array<uint32_t, 5>, 8> values = {{
|
||||
{32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Max Num of threads, Grf size, Hw local id generation
|
||||
{32u, 64u, 32u, 1, 2u},
|
||||
{32u, 128u, 256u, 1, 4u},
|
||||
{32u, 1024u, 128u, 1, 32u},
|
||||
{16u, 32u, 32u, 0, 2u},
|
||||
{16u, 64u, 256u, 0, 4u},
|
||||
{16u, 128u, 128u, 0, 8u},
|
||||
{16u, 1024u, 256u, 0, 64u},
|
||||
}};
|
||||
|
||||
expectedNumThreadsPerThreadGroup = 20u;
|
||||
DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
|
||||
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
|
||||
for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
|
||||
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user