fix: Use proper value about hw local id generations

- remove useless flag ForceNumberOfThreadsInGpgpuThreadGroup
- add new flag "RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup"
to restore old path without restrictions about number of threads in
thread group
- fix forwarding information about hw local ids generations to
calculate numOfThreadsInThreadGroup correctly

Related-To: NEO-7952, NEO-7982
Signed-off-by: Cencelewska, Katarzyna <katarzyna.cencelewska@intel.com>
This commit is contained in:
Cencelewska, Katarzyna
2023-06-23 14:26:00 +00:00
committed by Compute-Runtime-Automation
parent aea5f435db
commit 68d81c82a7
9 changed files with 31 additions and 19 deletions

View File

@@ -339,7 +339,7 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, kernelRequiresGenerationOfLocalIdsByRuntime);
simdSize, static_cast<uint32_t>(itemsInGroup), grfSize, !kernelRequiresGenerationOfLocalIdsByRuntime);
if (kernelRequiresGenerationOfLocalIdsByRuntime) {
auto grfSize = this->module->getDevice()->getHwInfo().capabilityTable.grfSize;

View File

@@ -120,7 +120,11 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
}
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {}
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {}
void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {
if (enableForcingOfGenerateLocalIdByHw) {
kernelRequiresGenerationOfLocalIdsByRuntime = !forceGenerateLocalIdByHw;
}
}
ze_result_t setArgBufferWithAlloc(uint32_t argIndex, uintptr_t argVal, NEO::GraphicsAllocation *allocation, NEO::SvmAllocationData *peerAllocData) override {
return ZE_RESULT_SUCCESS;
}
@@ -135,6 +139,8 @@ struct Mock<::L0::Kernel> : public WhiteBox<::L0::Kernel> {
NEO::KernelInfo info;
uint32_t printPrintfOutputCalledTimes = 0;
bool hangDetectedPassedToPrintfOutput = false;
bool enableForcingOfGenerateLocalIdByHw = false;
bool forceGenerateLocalIdByHw = false;
};
} // namespace ult

View File

@@ -243,7 +243,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
auto &gfxCoreHelper = device.getGfxCoreHelper();
auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, localIdsGenerationByRuntime);
auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems), grfSize, !localIdsGenerationByRuntime);
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

View File

@@ -82,6 +82,7 @@ DECLARE_DEBUG_VARIABLE(bool, AppendMemoryPrefetchForKmdMigratedSharedAllocations
DECLARE_DEBUG_VARIABLE(bool, ForceMemoryPrefetchForKmdMigratedSharedAllocations, false, "Force prefetch of shared memory in command queue execute command lists")
DECLARE_DEBUG_VARIABLE(bool, ClKhrExternalMemoryExtension, false, "Enable cl_khr_external_memory extension")
DECLARE_DEBUG_VARIABLE(bool, WaitForMemoryRelease, false, "Wait for memory release when out of memory")
DECLARE_DEBUG_VARIABLE(bool, RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup, 0, "0 - default disabled, 1- remove restrictions on NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "Override device id in AUB/TBX mode")
DECLARE_DEBUG_VARIABLE(std::string, FilterDeviceId, std::string("unk"), "Device id filter, adapter matching device id will be opened; ignored when unk")
DECLARE_DEBUG_VARIABLE(std::string, FilterBdfPath, std::string("unk"), "Linux-only, BDF path filter, only matching paths will be opened; ignored when unk")
@@ -144,7 +145,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, CFEOverDispatchControl, -1, "Set Over Dispatch C
DECLARE_DEBUG_VARIABLE(int32_t, CFELargeGRFThreadAdjustDisable, -1, "Set Large GRF thread adjust Disable field in CFE_STATE, -1 - do not set")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideNumComputeUnitsForScratch, -1, "Override number of compute units used for scratch size calculation")
DECLARE_DEBUG_VARIABLE(int32_t, ForceThreadGroupDispatchSize, -1, "Set ThreadGroupDispatchSize in INTERFACE_DESCRIPTOR_DATA, -1 - default, 0 - TG size 8, 1 - TG size 4, 2 - TG size 2, 3 - Reserved")
DECLARE_DEBUG_VARIABLE(int32_t, ForceNumberOfThreadsInGpgpuThreadGroup, -1, "-1 - default, set NumberOfThreadsInGpgpuThreadGroup in INTERFACE_DESCRIPTOR_DATA")
DECLARE_DEBUG_VARIABLE(int32_t, ForceStatelessL1CachingPolicy, -1, "-1: default, >=0 : program value for stateless L1 caching")
DECLARE_DEBUG_VARIABLE(int32_t, ForceMemoryBankIndexOverride, -1, "-1: default, 0: disable, 1:enable, Force index=1 of memory bank for XEHP")
DECLARE_DEBUG_VARIABLE(int32_t, EnablePrivateScratchSlot1, -1, "-1: default, 0: disable, 1: enable Allows using private scratch space")

View File

@@ -168,7 +168,7 @@ class GfxCoreHelper {
virtual bool isChipsetUniqueUUIDSupported() const = 0;
virtual bool isTimestampShiftRequired() const = 0;
virtual bool isRelaxedOrderingSupported() const = 0;
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const = 0;
virtual uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const = 0;
static bool isWorkaroundRequired(uint32_t lowestSteppingWithBug, uint32_t steppingWithFix, const HardwareInfo &hwInfo, const ProductHelper &productHelper);
virtual ~GfxCoreHelper() = default;
@@ -381,7 +381,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
bool isChipsetUniqueUUIDSupported() const override;
bool isTimestampShiftRequired() const override;
bool isRelaxedOrderingSupported() const override;
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const override;
uint32_t calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const override;
~GfxCoreHelperHw() override = default;

View File

@@ -688,10 +688,7 @@ uint32_t GfxCoreHelperHw<GfxFamily>::getMinimalGrfSize() const {
}
template <typename GfxFamily>
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalGeneration) const {
if (DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get() != -1) {
return DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.get();
}
uint32_t GfxCoreHelperHw<GfxFamily>::calculateNumThreadsPerThreadGroup(uint32_t simd, uint32_t totalWorkItems, uint32_t grfSize, bool isHwLocalIdGeneration) const {
return getThreadsPerWG(simd, totalWorkItems);
}
} // namespace NEO

View File

@@ -13,6 +13,7 @@
namespace NEO {
struct MockRootDeviceEnvironment : public RootDeviceEnvironment {
using RootDeviceEnvironment::hwInfo;
using RootDeviceEnvironment::isDummyAllocationInitialized;
using RootDeviceEnvironment::RootDeviceEnvironment;
~MockRootDeviceEnvironment() override = default;

View File

@@ -534,8 +534,8 @@ PrintGlobalTimestampInNs = 0
EnableDeviceStateVerification = -1
VfBarResourceAllocationWa = 1
EnableDynamicPostSyncAllocLayout = -1
ForceNumberOfThreadsInGpgpuThreadGroup = -1
PrintTimestampPacketUsage = -1
TrackNumCsrClientsOnSyncPoints = -1
CommandListTimestampRefreshIntervalInMilliSec = -1
RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup = 0
# Please don't edit below this line

View File

@@ -1587,15 +1587,23 @@ HWTEST2_F(GfxCoreHelperTest, givenParamsWhenCalculateNumThreadsPerThreadGroupThe
}
}
HWTEST_F(GfxCoreHelperTest, givenFlagForceNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
HWTEST_F(GfxCoreHelperTest, givenFlagRemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroupWhenCalculateNumThreadsPerThreadGroupThenMethodReturnProperValue) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup.set(1);
const auto &gfxCoreHelper = getHelper<GfxCoreHelper>();
uint32_t expectedNumThreadsPerThreadGroup = 10u;
DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
std::array<std::array<uint32_t, 5>, 8> values = {{
{32u, 32u, 128u, 1, 1u}, // SIMT Size, totalWorkItems, Max Num of threads, Grf size, Hw local id generation
{32u, 64u, 32u, 1, 2u},
{32u, 128u, 256u, 1, 4u},
{32u, 1024u, 128u, 1, 32u},
{16u, 32u, 32u, 0, 2u},
{16u, 64u, 256u, 0, 4u},
{16u, 128u, 128u, 0, 8u},
{16u, 1024u, 256u, 0, 64u},
}};
expectedNumThreadsPerThreadGroup = 20u;
DebugManager.flags.ForceNumberOfThreadsInGpgpuThreadGroup.set(expectedNumThreadsPerThreadGroup);
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(32u, 100u, 32u, true));
for (auto &[simtSize, totalWgSize, grfsize, isHwLocalIdGeneration, expectedNumThreadsPerThreadGroup] : values) {
EXPECT_EQ(expectedNumThreadsPerThreadGroup, gfxCoreHelper.calculateNumThreadsPerThreadGroup(simtSize, totalWgSize, grfsize, isHwLocalIdGeneration));
}
}