mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-29 09:03:14 +08:00
performance: get work group count per tile value when setting new group size
- change the function interface to accept an external group size
Related-To: NEO-12639
Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
bb3466d07a
commit
7e00590994
@@ -186,7 +186,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
|
||||
updateStreamProperties(*kernel, launchParams.isCooperative, threadGroupDimensions, launchParams.isIndirect);
|
||||
|
||||
auto maxWgCountPerTile = kernel->suggestMaxCooperativeGroupCount(this->engineGroupType, device->getNEODevice()->isEngineInstanced(), true);
|
||||
auto maxWgCountPerTile = kernel->getMaxWgCountPerTile(this->engineGroupType);
|
||||
|
||||
NEO::EncodeDispatchKernelArgs dispatchKernelArgs{
|
||||
0, // eventAddress
|
||||
|
||||
@@ -339,7 +339,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
|
||||
}
|
||||
}
|
||||
|
||||
auto maxWgCountPerTile = kernel->suggestMaxCooperativeGroupCount(this->engineGroupType, device->getNEODevice()->isEngineInstanced(), true);
|
||||
auto maxWgCountPerTile = kernel->getMaxWgCountPerTile(this->engineGroupType);
|
||||
|
||||
NEO::EncodeDispatchKernelArgs dispatchKernelArgs{
|
||||
eventAddress, // eventAddress
|
||||
|
||||
@@ -187,8 +187,27 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
|
||||
return midThreadPreemptionDisallowedForRayTracingKernels;
|
||||
}
|
||||
|
||||
// Returns the stored max work-group count per tile for the given engine group type.
// renderCompute selects maxWgCountPerTileRcs, cooperativeCompute selects
// maxWgCountPerTileCooperative, and every other engine group type falls back to
// maxWgCountPerTileCcs.
uint32_t getMaxWgCountPerTile(NEO::EngineGroupType engineGroupType) const {
|
||||
// Default to the Ccs value; overridden below for the two specific engine groups.
auto value = maxWgCountPerTileCcs;
|
||||
if (engineGroupType == NEO::EngineGroupType::renderCompute) {
|
||||
value = maxWgCountPerTileRcs;
|
||||
} else if (engineGroupType == NEO::EngineGroupType::cooperativeCompute) {
|
||||
value = maxWgCountPerTileCooperative;
|
||||
}
|
||||
// The members are initialized to 0 and only written by setGroupSize when both
// heaplessEnabled and localDispatchSupport are set, so a zero here means the selected
// value was never populated.
// NOTE(review): callers in appendLaunchKernelWithParams invoke this unconditionally;
// confirm that hitting this check on configurations without heapless/local dispatch is intended.
DEBUG_BREAK_IF(value == 0);
|
||||
return value;
|
||||
}
|
||||
|
||||
protected:
|
||||
uint32_t maxWgCountPerTileCcs = 0;
|
||||
uint32_t maxWgCountPerTileRcs = 0;
|
||||
uint32_t maxWgCountPerTileCooperative = 0;
|
||||
bool midThreadPreemptionDisallowedForRayTracingKernels = false;
|
||||
bool heaplessEnabled = false;
|
||||
bool implicitScalingEnabled = false;
|
||||
bool localDispatchSupport = false;
|
||||
bool rcsAvailable = false;
|
||||
bool cooperativeSupport = false;
|
||||
};
|
||||
|
||||
using KernelAllocatorFn = Kernel *(*)(Module *module);
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
#include "shared/source/helpers/basic_math.h"
|
||||
#include "shared/source/helpers/bindless_heaps_helper.h"
|
||||
#include "shared/source/helpers/blit_commands_helper.h"
|
||||
#include "shared/source/helpers/compiler_product_helper.h"
|
||||
#include "shared/source/helpers/gfx_core_helper.h"
|
||||
#include "shared/source/helpers/hw_info.h"
|
||||
#include "shared/source/helpers/kernel_helpers.h"
|
||||
@@ -370,7 +371,8 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
evaluateIfRequiresGenerationOfLocalIdsByRuntime(kernelDescriptor);
|
||||
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto neoDevice = module->getDevice()->getNEODevice();
|
||||
auto &rootDeviceEnvironment = neoDevice->getRootDeviceEnvironment();
|
||||
auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
this->numThreadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(
|
||||
simdSize, static_cast<uint32_t>(itemsInGroup), grfCount, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
@@ -414,6 +416,17 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
|
||||
this->perThreadDataSizeForWholeThreadGroup = 0;
|
||||
this->perThreadDataSize = 0;
|
||||
}
|
||||
|
||||
if (this->heaplessEnabled && this->localDispatchSupport) {
|
||||
auto isEngineIstanced = neoDevice->isEngineInstanced();
|
||||
this->maxWgCountPerTileCcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::compute, isEngineIstanced, true);
|
||||
if (this->rcsAvailable) {
|
||||
this->maxWgCountPerTileRcs = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::renderCompute, isEngineIstanced, true);
|
||||
}
|
||||
if (this->cooperativeSupport) {
|
||||
this->maxWgCountPerTileCooperative = suggestMaxCooperativeGroupCount(NEO::EngineGroupType::cooperativeCompute, isEngineIstanced, true);
|
||||
}
|
||||
}
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -477,11 +490,7 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, bool isEngineInstanced, bool forceSingleTileQuery) {
|
||||
UNRECOVERABLE_IF(0 == groupSize[0]);
|
||||
UNRECOVERABLE_IF(0 == groupSize[1]);
|
||||
UNRECOVERABLE_IF(0 == groupSize[2]);
|
||||
|
||||
uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, uint32_t *groupSize, bool isEngineInstanced, bool forceSingleTileQuery) {
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto &descriptor = kernelImmData->getDescriptor();
|
||||
@@ -492,10 +501,8 @@ uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineG
|
||||
|
||||
uint32_t numSubDevicesForExecution = 1;
|
||||
|
||||
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
|
||||
auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield();
|
||||
|
||||
if (!forceSingleTileQuery && NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
|
||||
if (!forceSingleTileQuery && this->implicitScalingEnabled) {
|
||||
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
|
||||
}
|
||||
|
||||
@@ -993,6 +1000,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
|
||||
|
||||
auto neoDevice = module->getDevice()->getNEODevice();
|
||||
const auto &productHelper = neoDevice->getProductHelper();
|
||||
const auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &kernelDescriptor = kernelImmData->getDescriptor();
|
||||
auto ret = NEO::KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(kernelDescriptor.kernelAttributes, neoDevice);
|
||||
if (ret == NEO::KernelHelper::ErrorCode::invalidKernel) {
|
||||
@@ -1006,7 +1014,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
|
||||
if (isaAllocation->getAllocationType() == NEO::AllocationType::kernelIsaInternal && this->kernelImmData->getIsaParentAllocation() == nullptr) {
|
||||
isaAllocation->setTbxWritable(true, std::numeric_limits<uint32_t>::max());
|
||||
isaAllocation->setAubWritable(true, std::numeric_limits<uint32_t>::max());
|
||||
NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(neoDevice->getRootDeviceEnvironment(), *isaAllocation),
|
||||
NEO::MemoryTransferHelper::transferMemoryToAllocation(productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *isaAllocation),
|
||||
*neoDevice,
|
||||
isaAllocation,
|
||||
this->kernelImmData->getIsaOffsetInParentAllocation(),
|
||||
@@ -1153,7 +1161,22 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
|
||||
|
||||
this->internalResidencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
|
||||
}
|
||||
|
||||
const auto &hwInfo = neoDevice->getHardwareInfo();
|
||||
auto deviceBitfield = neoDevice->getDeviceBitfield();
|
||||
const auto &gfxHelper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
|
||||
this->midThreadPreemptionDisallowedForRayTracingKernels = productHelper.isMidThreadPreemptionDisallowedForRayTracingKernels();
|
||||
|
||||
this->heaplessEnabled = rootDeviceEnvironment.getHelper<NEO::CompilerProductHelper>().isHeaplessModeEnabled();
|
||||
this->localDispatchSupport = productHelper.getSupportedLocalDispatchSizes(hwInfo).size() > 0;
|
||||
|
||||
bool platformImplicitScaling = gfxHelper.platformSupportsImplicitScaling(rootDeviceEnvironment);
|
||||
this->implicitScalingEnabled = NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling);
|
||||
|
||||
this->rcsAvailable = gfxHelper.isRcsAvailable(hwInfo);
|
||||
this->cooperativeSupport = productHelper.isCooperativeEngineSupported(hwInfo);
|
||||
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -69,7 +69,15 @@ struct KernelImp : Kernel {
|
||||
|
||||
ze_result_t getKernelName(size_t *pSize, char *pName) override;
|
||||
|
||||
uint32_t suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, bool isEngineInstanced, bool forceSingleTileQuery) override;
|
||||
// Override that uses the kernel's own stored group size (this->groupSize).
// Checks that all three group-size dimensions are non-zero, then forwards to the
// overload that takes an explicit group-size pointer.
uint32_t suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, bool isEngineInstanced, bool forceSingleTileQuery) override {
|
||||
// A zero in any dimension is treated as unrecoverable (UNRECOVERABLE_IF).
UNRECOVERABLE_IF(0 == this->groupSize[0]);
|
||||
UNRECOVERABLE_IF(0 == this->groupSize[1]);
|
||||
UNRECOVERABLE_IF(0 == this->groupSize[2]);
|
||||
|
||||
// Delegate to the overload accepting an external group size, passing the stored one.
return suggestMaxCooperativeGroupCount(engineGroupType, this->groupSize, isEngineInstanced, forceSingleTileQuery);
|
||||
}
|
||||
|
||||
uint32_t suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, uint32_t *groupSize, bool isEngineInstanced, bool forceSingleTileQuery);
|
||||
|
||||
const uint8_t *getCrossThreadData() const override { return crossThreadData.get(); }
|
||||
uint32_t getCrossThreadDataSize() const override { return crossThreadDataSize; }
|
||||
|
||||
@@ -40,7 +40,7 @@ uint32_t KernelImpSuggestMaxCooperativeGroupCountFixture::getMaxWorkGroupCount()
|
||||
kernel.kernelImmData = &kernelInfo;
|
||||
auto module = std::make_unique<ModuleImp>(device, nullptr, ModuleType::user);
|
||||
kernel.module = module.get();
|
||||
|
||||
kernel.implicitScalingEnabled = device->getNEODevice()->getDeviceBitfield().count() > 1;
|
||||
kernel.groupSize[0] = lws[0];
|
||||
kernel.groupSize[1] = lws[1];
|
||||
kernel.groupSize[2] = lws[2];
|
||||
|
||||
@@ -39,18 +39,25 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
|
||||
using BaseClass = ::L0::KernelImp;
|
||||
using BaseClass::BaseClass;
|
||||
using ::L0::KernelImp::argumentsResidencyContainer;
|
||||
using ::L0::KernelImp::cooperativeSupport;
|
||||
using ::L0::KernelImp::createPrintfBuffer;
|
||||
using ::L0::KernelImp::crossThreadData;
|
||||
using ::L0::KernelImp::crossThreadDataSize;
|
||||
using ::L0::KernelImp::dynamicStateHeapData;
|
||||
using ::L0::KernelImp::dynamicStateHeapDataSize;
|
||||
using ::L0::KernelImp::groupSize;
|
||||
using ::L0::KernelImp::heaplessEnabled;
|
||||
using ::L0::KernelImp::implicitArgsResidencyContainerIndices;
|
||||
using ::L0::KernelImp::implicitScalingEnabled;
|
||||
using ::L0::KernelImp::internalResidencyContainer;
|
||||
using ::L0::KernelImp::isBindlessOffsetSet;
|
||||
using ::L0::KernelImp::kernelHasIndirectAccess;
|
||||
using ::L0::KernelImp::kernelImmData;
|
||||
using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime;
|
||||
using ::L0::KernelImp::localDispatchSupport;
|
||||
using ::L0::KernelImp::maxWgCountPerTileCcs;
|
||||
using ::L0::KernelImp::maxWgCountPerTileCooperative;
|
||||
using ::L0::KernelImp::maxWgCountPerTileRcs;
|
||||
using ::L0::KernelImp::midThreadPreemptionDisallowedForRayTracingKernels;
|
||||
using ::L0::KernelImp::module;
|
||||
using ::L0::KernelImp::numThreadsPerThreadGroup;
|
||||
@@ -62,6 +69,7 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
|
||||
using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup;
|
||||
using ::L0::KernelImp::pImplicitArgs;
|
||||
using ::L0::KernelImp::printfBuffer;
|
||||
using ::L0::KernelImp::rcsAvailable;
|
||||
using ::L0::KernelImp::regionGroupBarrierIndex;
|
||||
using ::L0::KernelImp::requiredWorkgroupOrder;
|
||||
using ::L0::KernelImp::setAssertBuffer;
|
||||
|
||||
@@ -1103,7 +1103,7 @@ TEST_F(KernelImmutableDataTests, givenModuleWithPrivateMemoryBiggerThanGlobalMem
|
||||
EXPECT_EQ(nullptr, kernel->getPrivateMemoryGraphicsAllocation());
|
||||
}
|
||||
|
||||
TEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitializedAndPatchedInImplicitArgsBuffer) {
|
||||
HWTEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitializedAndPatchedInImplicitArgsBuffer) {
|
||||
auto &hwInfo = *neoDevice->getRootDeviceEnvironment().getMutableHardwareInfo();
|
||||
hwInfo.gtSystemInfo.IsDynamicallyPopulated = false;
|
||||
hwInfo.gtSystemInfo.SliceCount = 1;
|
||||
|
||||
@@ -894,5 +894,61 @@ TEST_F(KernelImpTest, GivenGroupSizeRequiresSwLocalIdsGenerationWhenKernelSpecif
|
||||
alignedFree(testPerThreadDataBuffer);
|
||||
}
|
||||
|
||||
// Verifies which maxWgCountPerTile* members setGroupSize populates as kernel flags are
// enabled one at a time. Each step flips one more flag, calls setGroupSize again with a
// different group size, and checks the three stored counts.
// NOTE(review): the steps before rcsAvailable/cooperativeSupport are set rely on those
// flags starting out false on the mock kernel — confirm against the Mock definition.
TEST_F(KernelImpTest, givenHeaplessAndLocalDispatchEnabledWheSettingGroupSizeThenGetMaxWgCountPerTileCalculated) {
|
||||
Mock<Module> module(device, nullptr);
|
||||
Mock<::L0::KernelImp> kernel;
|
||||
kernel.module = &module;
|
||||
|
||||
// Step 1: heapless and local dispatch both disabled -> no value is computed.
kernel.heaplessEnabled = false;
|
||||
kernel.localDispatchSupport = false;
|
||||
kernel.setGroupSize(128, 1, 1);
|
||||
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileCcs);
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
|
||||
|
||||
// Step 2: heapless alone is not sufficient -> still no value is computed.
kernel.heaplessEnabled = true;
|
||||
kernel.setGroupSize(64, 2, 1);
|
||||
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileCcs);
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
|
||||
|
||||
// Step 3: heapless + local dispatch -> only the Ccs value becomes non-zero.
kernel.localDispatchSupport = true;
|
||||
kernel.setGroupSize(32, 4, 1);
|
||||
|
||||
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileRcs);
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
|
||||
|
||||
// Step 4: rcsAvailable additionally enabled -> the Rcs value becomes non-zero as well.
kernel.rcsAvailable = true;
|
||||
kernel.setGroupSize(16, 8, 1);
|
||||
|
||||
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
|
||||
EXPECT_NE(0u, kernel.maxWgCountPerTileRcs);
|
||||
EXPECT_EQ(0u, kernel.maxWgCountPerTileCooperative);
|
||||
|
||||
// Step 5: cooperativeSupport additionally enabled -> all three values are non-zero.
kernel.cooperativeSupport = true;
|
||||
kernel.setGroupSize(8, 8, 2);
|
||||
|
||||
EXPECT_NE(0u, kernel.maxWgCountPerTileCcs);
|
||||
EXPECT_NE(0u, kernel.maxWgCountPerTileRcs);
|
||||
EXPECT_NE(0u, kernel.maxWgCountPerTileCooperative);
|
||||
}
|
||||
|
||||
// Verifies that getMaxWgCountPerTile returns the stored value matching the requested
// engine group type: compute -> Ccs, renderCompute -> Rcs, cooperativeCompute -> Cooperative.
TEST_F(KernelImpTest, givenCorrectEngineTypeWhenGettingMaxWgCountPerTileThenReturnActualValue) {
|
||||
Mock<Module> module(device, nullptr);
|
||||
Mock<::L0::KernelImp> kernel;
|
||||
kernel.module = &module;
|
||||
|
||||
// Distinct values per member so a mix-up between engine groups would be detected.
kernel.maxWgCountPerTileCcs = 4;
|
||||
kernel.maxWgCountPerTileRcs = 2;
|
||||
kernel.maxWgCountPerTileCooperative = 100;
|
||||
|
||||
EXPECT_EQ(4u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::compute));
|
||||
EXPECT_EQ(2u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::renderCompute));
|
||||
EXPECT_EQ(100u, kernel.getMaxWgCountPerTile(NEO::EngineGroupType::cooperativeCompute));
|
||||
}
|
||||
|
||||
} // namespace ult
|
||||
} // namespace L0
|
||||
|
||||
Reference in New Issue
Block a user