performance: compute work group count per tile value when setting new group size

- change the suggestMaxCooperativeGroupCount interface to accept an external group size

Related-To: NEO-12639

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
Author: Zbigniew Zdanowicz
Date: 2024-09-16 09:06:17 +00:00
Committed by: Compute-Runtime-Automation
Parent: bb3466d07a
Commit: 7e00590994
9 changed files with 129 additions and 15 deletions
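
What the change does: instead of calling suggestMaxCooperativeGroupCount() on every appendLaunchKernelWithParams(), KernelImp now computes the per-tile work-group count for each relevant engine group type once in setGroupSize() and caches it; the command list then reads the cached value through the new Kernel::getMaxWgCountPerTile(). Below is a minimal, self-contained C++ sketch of that caching pattern. It is an illustration only: KernelSketch, computeMaxGroupCount() and the capacity numbers inside it are hypothetical stand-ins rather than NEO code; only the member and accessor names mirror the diff.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>

enum class EngineGroupType { compute, renderCompute, cooperativeCompute };

class KernelSketch {
  public:
    // Called whenever the application sets a new group size; recompute and
    // cache the per-tile work-group count for each engine type up front.
    void setGroupSize(uint32_t x, uint32_t y, uint32_t z) {
        groupSize[0] = x;
        groupSize[1] = y;
        groupSize[2] = z;
        maxWgCountPerTileCcs = computeMaxGroupCount(EngineGroupType::compute);
        if (rcsAvailable) {
            maxWgCountPerTileRcs = computeMaxGroupCount(EngineGroupType::renderCompute);
        }
        if (cooperativeSupport) {
            maxWgCountPerTileCooperative = computeMaxGroupCount(EngineGroupType::cooperativeCompute);
        }
    }

    // Cheap dispatch-time lookup, analogous to Kernel::getMaxWgCountPerTile().
    uint32_t getMaxWgCountPerTile(EngineGroupType type) const {
        uint32_t value = maxWgCountPerTileCcs;
        if (type == EngineGroupType::renderCompute) {
            value = maxWgCountPerTileRcs;
        } else if (type == EngineGroupType::cooperativeCompute) {
            value = maxWgCountPerTileCooperative;
        }
        // Mirrors the DEBUG_BREAK_IF(value == 0) guard in the diff.
        assert(value != 0 && "group size was never set for this engine type");
        return value;
    }

    bool rcsAvailable = false;        // capability flags captured once at kernel init
    bool cooperativeSupport = false;

  private:
    // Hypothetical stand-in for the expensive suggestMaxCooperativeGroupCount()
    // query, which in NEO consults GfxCoreHelper, SLM usage, barriers, etc.
    uint32_t computeMaxGroupCount(EngineGroupType type) const {
        uint32_t itemsInGroup = std::max(groupSize[0] * groupSize[1] * groupSize[2], 1u);
        uint32_t threadsPerTile = 4096u;          // made-up capacity for illustration
        if (type == EngineGroupType::cooperativeCompute) {
            threadsPerTile /= 2;                  // made-up cooperative penalty
        }
        return std::max(threadsPerTile / itemsInGroup, 1u);
    }

    uint32_t groupSize[3] = {1, 1, 1};
    uint32_t maxWgCountPerTileCcs = 0;
    uint32_t maxWgCountPerTileRcs = 0;
    uint32_t maxWgCountPerTileCooperative = 0;
};

int main() {
    KernelSketch kernel;
    kernel.rcsAvailable = true;
    kernel.setGroupSize(32, 4, 1);   // the expensive query runs here, once
    // The dispatch path only reads the cached value:
    std::cout << kernel.getMaxWgCountPerTile(EngineGroupType::compute) << "\n";
    return 0;
}

In the actual change the recomputation in setGroupSize() is additionally gated by heaplessEnabled and localDispatchSupport, and the capability flags (rcsAvailable, cooperativeSupport, implicitScalingEnabled) are captured once in KernelImp::initialize() rather than set by hand as in this sketch.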

View File

@@ -186,7 +186,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions, launchParams.isIndirect);
auto maxWgCountPerTile = kernel->suggestMaxCooperativeGroupCount(this->engineGroupType, device->getNEODevice()->isEngineInstanced(), true);
auto maxWgCountPerTile = kernel->getMaxWgCountPerTile(this->engineGroupType);
NEO::EncodeDispatchKernelArgs dispatchKernelArgs{
0, // eventAddress

View File

@@ -339,7 +339,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
}
}
auto maxWgCountPerTile = kernel->suggestMaxCooperativeGroupCount(this->engineGroupType, device->getNEODevice()->isEngineInstanced(), true);
auto maxWgCountPerTile = kernel->getMaxWgCountPerTile(this->engineGroupType);
NEO::EncodeDispatchKernelArgs dispatchKernelArgs{
eventAddress, // eventAddress

View File

@@ -187,8 +187,27 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
return midThreadPreemptionDisallowedForRayTracingKernels;
}
uint32_t getMaxWgCountPerTile(NEO::EngineGroupType engineGroupType) const {
auto value = maxWgCountPerTileCcs;
if (engineGroupType == NEO::EngineGroupType::renderCompute) {
value = maxWgCountPerTileRcs;
} else if (engineGroupType == NEO::EngineGroupType::cooperativeCompute) {
value = maxWgCountPerTileCooperative;
}
DEBUG_BREAK_IF(value == 0);
return value;
}
protected:
uint32_t maxWgCountPerTileCcs = 0;
uint32_t maxWgCountPerTileRcs = 0;
uint32_t maxWgCountPerTileCooperative = 0;
bool midThreadPreemptionDisallowedForRayTracingKernels = false;
bool heaplessEnabled = false;
bool implicitScalingEnabled = false;
bool localDispatchSupport = false;
bool rcsAvailable = false;
bool cooperativeSupport = false;
};
using KernelAllocatorFn = Kernel *(*)(Module *module);

View File

@@ -16,6 +16,7 @@
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/bindless_heaps_helper.h"
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/compiler_product_helper.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/kernel_helpers.h"
@@ -370,7 +371,8 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto neoDevice = module->getDevice()->getNEODevice();
auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment();
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
simdSize, static_cast<uint32_t>(itemsInGroup), grfCount, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
@@ -414,6 +416,17 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
this->perThreadDataSizeForWholeThreadGroup = 0;
this->perThreadDataSize = 0;
}
if (this->heaplessEnabled && this->localDispatchSupport) {
auto isEngineInstanced = neoDevice->isEngineInstanced();
this->maxWgCountPerTileCcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::compute, isEngineInstanced, true);
if (this->rcsAvailable) {
this->maxWgCountPerTileRcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::renderCompute, isEngineInstanced, true);
}
if (this->cooperativeSupport) {
this->maxWgCountPerTileCooperative = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::cooperativeCompute, isEngineInstanced, true);
}
}
return ZE_RESULT_SUCCESS;
}
@@ -477,11 +490,7 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
return ZE_RESULT_SUCCESS;
}
uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, bool isEngineInstanced, bool forceSingleTileQuery) {
UNRECOVERABLE_IF(0 == groupSize[0]);
UNRECOVERABLE_IF(0 == groupSize[1]);
UNRECOVERABLE_IF(0 == groupSize[2]);
uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, uint32_t *groupSize, bool isEngineInstanced, bool forceSingleTileQuery) {
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
auto &descriptor = kernelImmData->getDescriptor();
@@ -492,10 +501,8 @@ uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineG
uint32_t numSubDevicesForExecution = 1;
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield();
if (!forceSingleTileQuery && NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
if (!forceSingleTileQuery && this->implicitScalingEnabled) {
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
}
@@ -993,6 +1000,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
auto neoDevice = module->getDevice()->getNEODevice();
const auto &productHelper = neoDevice->getProductHelper();
const auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
auto &kernelDescriptor = kernelImmData->getDescriptor();
auto ret = NEO::KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(kernelDescriptor.kernelAttributes, neoDevice);
if (ret == NEO::KernelHelper::ErrorCode::invalidKernel) {
@@ -1006,7 +1014,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
if (isaAllocation->getAllocationType() == NEO::AllocationType::kernelIsaInternal && this->kernelImmData->getIsaParentAllocation() == nullptr) {
isaAllocation->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
isaAllocation->setAubWritable(true, std::numeric_limits<uint32_t>::max());
NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(neoDevice->getRootDeviceEnvironment(), *isaAllocation),
NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *isaAllocation),
*neoDevice,
isaAllocation,
this->kernelImmData->getIsaOffsetInParentAllocation(),
@@ -1153,7 +1161,22 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
this->internalResidencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
}
const auto &hwInfo = neoDevice->getHardwareInfo();
auto deviceBitfield = neoDevice->getDeviceBitfield();
const auto &gfxHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
this->midThreadPreemptionDisallowedForRayTracingKernels = productHelper.isMidThreadPreemptionDisallowedForRayTracingKernels();
this->heaplessEnabled = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>().isHeaplessModeEnabled();
this->localDispatchSupport = productHelper.getSupportedLocalDispatchSizes(hwInfo).size() > 0;
bool platformImplicitScaling = gfxHelper.platformSupportsImplicitScaling(rootDeviceEnvironment);
this->implicitScalingEnabled = NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling);
this->rcsAvailable = gfxHelper.isRcsAvailable(hwInfo);
this->cooperativeSupport = productHelper.isCooperativeEngineSupported(hwInfo);
return ZE_RESULT_SUCCESS;
}

View File

@@ -69,7 +69,15 @@ struct KernelImp : Kernel {
ze_result_t getKernelName(size_t *pSize, char *pName) override;
uint32_t suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, bool isEngineInstanced, bool forceSingleTileQuery) override;
uint32_t suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, bool isEngineInstanced, bool forceSingleTileQuery) override {
UNRECOVERABLE_IF(0 == this->groupSize[0]);
UNRECOVERABLE_IF(0 == this->groupSize[1]);
UNRECOVERABLE_IF(0 == this->groupSize[2]);
return suggestMaxCooperativeGroupCount(engineGroupType, this->groupSize, isEngineInstanced, forceSingleTileQuery);
}
uint32_t suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, uint32_t *groupSize, bool isEngineInstanced, bool forceSingleTileQuery);
const uint8_t *getCrossThreadData() const override { return crossThreadData.get(); }
uint32_t getCrossThreadDataSize() const override { return crossThreadDataSize; }

View File

@@ -40,7 +40,7 @@ uint32_t KernelImpSuggestMaxCooperativeGroupCountFixture::getMaxWorkGroupCount()
kernel.kernelImmData = &kernelInfo;
auto module = std::make_unique<ModuleImp>(device, nullptr, ModuleType::user);
kernel.module = module.get();
kernel.implicitScalingEnabled = device->getNEODevice()->getDeviceBitfield().count() > 1;
kernel.groupSize[0] = lws[0];
kernel.groupSize[1] = lws[1];
kernel.groupSize[2] = lws[2];

View File

@@ -39,18 +39,25 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
using BaseClass = ::L0::KernelImp;
using BaseClass::BaseClass;
using ::L0::KernelImp::argumentsResidencyContainer;
using ::L0::KernelImp::cooperativeSupport;
using ::L0::KernelImp::createPrintfBuffer;
using ::L0::KernelImp::crossThreadData;
using ::L0::KernelImp::crossThreadDataSize;
using ::L0::KernelImp::dynamicStateHeapData;
using ::L0::KernelImp::dynamicStateHeapDataSize;
using ::L0::KernelImp::groupSize;
using ::L0::KernelImp::heaplessEnabled;
using ::L0::KernelImp::implicitArgsResidencyContainerIndices;
using ::L0::KernelImp::implicitScalingEnabled;
using ::L0::KernelImp::internalResidencyContainer;
using ::L0::KernelImp::isBindlessOffsetSet;
using ::L0::KernelImp::kernelHasIndirectAccess;
using ::L0::KernelImp::kernelImmData;
using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;
using ::L0::KernelImp::localDispatchSupport;
using ::L0::KernelImp::maxWgCountPerTileCcs;
using ::L0::KernelImp::maxWgCountPerTileCooperative;
using ::L0::KernelImp::maxWgCountPerTileRcs;
using ::L0::KernelImp::midThreadPreemptionDisallowedForRayTracingKernels;
using ::L0::KernelImp::module;
using ::L0::KernelImp::numThreadsPerThreadGroup;
@@ -62,6 +69,7 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup;
using ::L0::KernelImp::pImplicitArgs;
using ::L0::KernelImp::printfBuffer;
using ::L0::KernelImp::rcsAvailable;
using ::L0::KernelImp::regionGroupBarrierIndex;
using ::L0::KernelImp::requiredWorkgroupOrder;
using ::L0::KernelImp::setAssertBuffer;

View File

@@ -1103,7 +1103,7 @@ TEST_F(KernelImmutableDataTests, givenModuleWithPrivateMemoryBiggerThanGlobalMem
EXPECT_EQ(nullptr, kernel->getPrivateMemoryGraphicsAllocation());
}
TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitializedAndPatchedInImplicitArgsBuffer) {
HWTEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitializedAndPatchedInImplicitArgsBuffer) {
auto &hwInfo = *neoDevice->getRootDeviceEnvironment().getMutableHardwareInfo();
hwInfo.gtSystemInfo.IsDynamicallyPopulated = false;
hwInfo.gtSystemInfo.SliceCount = 1;

View File

@@ -894,5 +894,61 @@ TEST_F(KernelImpTest, GivenGroupSizeRequiresSwLocalIdsGenerationWhenKernelSpecif
alignedFree(testPerThreadDataBuffer);
}
TEST_F(KernelImpTest, givenHeaplessAndLocalDispatchEnabledWhenSettingGroupSizeThenGetMaxWgCountPerTileCalculated) {
Mock<Module> module(device, nullptr);
Mock<::L0::KernelImp> kernel;
kernel.module = &module;
kernel.heaplessEnabled = false;
kernel.localDispatchSupport = false;
kernel.setGroupSize(128, 1, 1);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
kernel.heaplessEnabled = true;
kernel.setGroupSize(64, 2, 1);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
kernel.localDispatchSupport = true;
kernel.setGroupSize(32, 4, 1);
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
kernel.rcsAvailable = true;
kernel.setGroupSize(16, 8, 1);
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
EXPECT_NE(0u, kernel.maxWgCountPerTileRcs);
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
kernel.cooperativeSupport = true;
kernel.setGroupSize(8, 8, 2);
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
EXPECT_NE(0u, kernel.maxWgCountPerTileRcs);
EXPECT_NE(0u, kernel.maxWgCountPerTileCooperative);
}
TEST_F(KernelImpTest, givenCorrectEngineTypeWhenGettingMaxWgCountPerTileThenReturnActualValue) {
Mock<Module> module(device, nullptr);
Mock<::L0::KernelImp> kernel;
kernel.module = &module;
kernel.maxWgCountPerTileCcs = 4;
kernel.maxWgCountPerTileRcs = 2;
kernel.maxWgCountPerTileCooperative = 100;
EXPECT_EQ(4u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::compute));
EXPECT_EQ(2u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::renderCompute));
EXPECT_EQ(100u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::cooperativeCompute));
}
} // namespace ult
} // namespace L0