mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-19 06:24:51 +08:00
refactor: unify further calculation to get max work group count
- move available device calculation into common helper - change interface to have code available where no descriptor is available - expand unit test for implementation of new interface Related-To: NEO-13350 Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
99a353a15a
commit
56b15f17f7
@@ -503,28 +503,24 @@ ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSiz
|
||||
}
|
||||
|
||||
uint32_t KernelImp::suggestMaxCooperativeGroupCount(NEO::EngineGroupType engineGroupType, uint32_t *groupSize, bool forceSingleTileQuery) {
|
||||
auto &rootDeviceEnvironment = module->getDevice()->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto &helper = rootDeviceEnvironment.getHelper<NEO::GfxCoreHelper>();
|
||||
auto &neoDevice = *module->getDevice()->getNEODevice();
|
||||
auto &helper = neoDevice.getGfxCoreHelper();
|
||||
auto &descriptor = kernelImmData->getDescriptor();
|
||||
|
||||
auto usedSlmSize = helper.alignSlmSize(slmArgsTotalSize + descriptor.kernelAttributes.slmInlineSize);
|
||||
const uint32_t workDim = 3;
|
||||
const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
|
||||
|
||||
uint32_t numSubDevicesForExecution = 1;
|
||||
|
||||
auto deviceBitfield = module->getDevice()->getNEODevice()->getDeviceBitfield();
|
||||
if (!forceSingleTileQuery && this->implicitScalingEnabled) {
|
||||
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
|
||||
}
|
||||
|
||||
return NEO::KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
|
||||
descriptor,
|
||||
numSubDevicesForExecution,
|
||||
return NEO::KernelHelper::getMaxWorkGroupCount(neoDevice,
|
||||
descriptor.kernelAttributes.numGrfRequired,
|
||||
descriptor.kernelAttributes.simdSize,
|
||||
descriptor.kernelAttributes.barrierCount,
|
||||
usedSlmSize,
|
||||
workDim,
|
||||
localWorkSize,
|
||||
engineGroupType);
|
||||
engineGroupType,
|
||||
this->implicitScalingEnabled,
|
||||
forceSingleTileQuery);
|
||||
}
|
||||
|
||||
ze_result_t KernelImp::setIndirectAccess(ze_kernel_indirect_access_flags_t flags) {
|
||||
|
||||
@@ -1177,30 +1177,27 @@ void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *glob
|
||||
|
||||
uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue, bool forceSingleTileQuery) const {
|
||||
auto &hardwareInfo = getHardwareInfo();
|
||||
auto &rootDeviceEnvironment = this->getDevice().getRootDeviceEnvironment();
|
||||
auto &helper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
|
||||
auto &device = this->getDevice();
|
||||
auto &helper = device.getGfxCoreHelper();
|
||||
|
||||
auto engineGroupType = helper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(),
|
||||
commandQueue->getGpgpuEngine().getEngineUsage(), hardwareInfo);
|
||||
|
||||
auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
|
||||
|
||||
uint32_t numSubDevicesForExecution = 1;
|
||||
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(device.getRootDeviceEnvironment());
|
||||
bool isImplicitScalingEnabled = ImplicitScalingHelper::isImplicitScalingEnabled(device.getDeviceBitfield(), platformImplicitScaling);
|
||||
|
||||
bool platformImplicitScaling = helper.platformSupportsImplicitScaling(rootDeviceEnvironment);
|
||||
auto deviceBitfield = commandQueue->getClDevice().getDeviceBitfield();
|
||||
|
||||
if (!forceSingleTileQuery && NEO::ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, platformImplicitScaling)) {
|
||||
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
|
||||
}
|
||||
|
||||
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment,
|
||||
kernelInfo.kernelDescriptor,
|
||||
numSubDevicesForExecution,
|
||||
auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(device.getDevice(),
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.numGrfRequired,
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.simdSize,
|
||||
kernelInfo.kernelDescriptor.kernelAttributes.barrierCount,
|
||||
usedSlmSize,
|
||||
workDim,
|
||||
localWorkSize,
|
||||
engineGroupType);
|
||||
engineGroupType,
|
||||
isImplicitScalingEnabled,
|
||||
forceSingleTileQuery);
|
||||
|
||||
return maxWorkGroupCount;
|
||||
}
|
||||
|
||||
@@ -18,10 +18,16 @@
|
||||
|
||||
namespace NEO {
|
||||
|
||||
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor, uint32_t numSubDevices,
|
||||
uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType) {
|
||||
return KernelHelper::getMaxWorkGroupCount(rootDeviceEnvironment, kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, kernelDescriptor.kernelAttributes.barrierCount,
|
||||
numSubDevices, usedSlmSize, workDim, localWorkSize, engineGroupType);
|
||||
uint32_t KernelHelper::getMaxWorkGroupCount(Device &device, uint16_t numGrfRequired, uint8_t simdSize, uint8_t barrierCount, uint32_t alignedSlmSize, uint32_t workDim, const size_t *localWorkSize,
|
||||
EngineGroupType engineGroupType, bool implicitScalingEnabled, bool forceSingleTileQuery) {
|
||||
uint32_t numSubDevicesForExecution = 1;
|
||||
|
||||
auto deviceBitfield = device.getDeviceBitfield();
|
||||
if (!forceSingleTileQuery && implicitScalingEnabled) {
|
||||
numSubDevicesForExecution = static_cast<uint32_t>(deviceBitfield.count());
|
||||
}
|
||||
|
||||
return KernelHelper::getMaxWorkGroupCount(device.getRootDeviceEnvironment(), numGrfRequired, simdSize, barrierCount, numSubDevicesForExecution, alignedSlmSize, workDim, localWorkSize, engineGroupType);
|
||||
}
|
||||
|
||||
uint32_t KernelHelper::getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, uint16_t numGrfRequired, uint8_t simdSize, uint8_t barrierCount,
|
||||
|
||||
@@ -27,8 +27,8 @@ struct KernelHelper {
|
||||
outOfDeviceMemory = 1,
|
||||
invalidKernel = 2
|
||||
};
|
||||
static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, const KernelDescriptor &kernelDescriptor,
|
||||
uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType);
|
||||
static uint32_t getMaxWorkGroupCount(Device &device, uint16_t numGrfRequired, uint8_t simdSize, uint8_t barrierCount, uint32_t alignedSlmSize, uint32_t workDim, const size_t *localWorkSize,
|
||||
EngineGroupType engineGroupType, bool implicitScalingEnabled, bool forceSingleTileQuery);
|
||||
static uint32_t getMaxWorkGroupCount(const RootDeviceEnvironment &rootDeviceEnvironment, uint16_t numGrfRequired, uint8_t simdSize, uint8_t barrierCount,
|
||||
uint32_t numSubDevices, uint32_t usedSlmSize, uint32_t workDim, const size_t *localWorkSize, EngineGroupType engineGroupType);
|
||||
static inline uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
|
||||
|
||||
@@ -24,40 +24,50 @@
|
||||
|
||||
using namespace NEO;
|
||||
|
||||
struct KernelHelperMaxWorkGroupsTests : ::testing::Test {
|
||||
struct KernelHelperMaxWorkGroupsFixture : public DeviceFixture {
|
||||
size_t lws[3] = {10, 10, 10};
|
||||
|
||||
EngineGroupType engineType = EngineGroupType::compute;
|
||||
uint32_t simd = 8;
|
||||
uint32_t dssCount = 16;
|
||||
uint32_t availableSlm = 64 * MemoryConstants::kiloByte;
|
||||
uint32_t usedSlm = 0;
|
||||
uint32_t numberOfBarriers = 0;
|
||||
uint32_t workDim = 3;
|
||||
uint32_t grf = 128;
|
||||
uint32_t numSubdevices = 1;
|
||||
size_t lws[3] = {10, 10, 10};
|
||||
|
||||
void SetUp() override {
|
||||
executionEnvironment = std::make_unique<MockExecutionEnvironment>(defaultHwInfo.get(), false, 1u);
|
||||
rootDeviceEnvironment = executionEnvironment->rootDeviceEnvironments[0].get();
|
||||
uint16_t grf = 128;
|
||||
|
||||
uint8_t simd = 8;
|
||||
uint8_t numberOfBarriers = 0;
|
||||
|
||||
bool implicitScalingEnabled = false;
|
||||
bool forceSingleTileQuery = true;
|
||||
|
||||
void setUp() {
|
||||
DeviceFixture::setUp();
|
||||
rootDeviceEnvironment = &pDevice->getRootDeviceEnvironmentRef();
|
||||
}
|
||||
|
||||
uint32_t getMaxWorkGroupCount() {
|
||||
KernelDescriptor descriptor = {};
|
||||
descriptor.kernelAttributes.simdSize = simd;
|
||||
descriptor.kernelAttributes.barrierCount = numberOfBarriers;
|
||||
descriptor.kernelAttributes.numGrfRequired = grf;
|
||||
|
||||
auto hwInfo = rootDeviceEnvironment->getMutableHardwareInfo();
|
||||
hwInfo->gtSystemInfo.DualSubSliceCount = dssCount;
|
||||
hwInfo->capabilityTable.slmSize = (availableSlm / MemoryConstants::kiloByte) / dssCount;
|
||||
|
||||
return KernelHelper::getMaxWorkGroupCount(*rootDeviceEnvironment, descriptor, numSubdevices, usedSlm, workDim, lws, engineType);
|
||||
if (numSubdevices > 1) {
|
||||
forceSingleTileQuery = false;
|
||||
implicitScalingEnabled = true;
|
||||
for (uint32_t pos = 0; pos < numSubdevices; pos++) {
|
||||
pDevice->deviceBitfield.set(pos);
|
||||
}
|
||||
}
|
||||
|
||||
return KernelHelper::getMaxWorkGroupCount(*pDevice, grf, simd, numberOfBarriers, usedSlm, workDim, lws, engineType, implicitScalingEnabled, forceSingleTileQuery);
|
||||
}
|
||||
|
||||
std::unique_ptr<MockExecutionEnvironment> executionEnvironment;
|
||||
RootDeviceEnvironment *rootDeviceEnvironment = nullptr;
|
||||
};
|
||||
|
||||
using KernelHelperMaxWorkGroupsTests = Test<KernelHelperMaxWorkGroupsFixture>;
|
||||
|
||||
TEST_F(KernelHelperMaxWorkGroupsTests, GivenNoBarriersOrSlmUsedWhenCalculatingMaxWorkGroupsCountThenResultIsCalculatedWithSimd) {
|
||||
auto &helper = rootDeviceEnvironment->getHelper<NEO::GfxCoreHelper>();
|
||||
|
||||
@@ -72,6 +82,8 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenDebugFlagSetWhenGetMaxWorkGroupCount
|
||||
DebugManagerStateRestore restore;
|
||||
debugManager.flags.OverrideMaxWorkGroupCount.set(123);
|
||||
|
||||
forceSingleTileQuery = false;
|
||||
|
||||
EXPECT_EQ(123u, getMaxWorkGroupCount());
|
||||
}
|
||||
|
||||
@@ -160,7 +172,7 @@ TEST_F(KernelHelperMaxWorkGroupsTests, GivenVariousValuesWhenCalculatingMaxWorkG
|
||||
hwInfo->gtSystemInfo.ThreadCount = 1024;
|
||||
EXPECT_NE(1u, getMaxWorkGroupCount());
|
||||
|
||||
numberOfBarriers = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
|
||||
numberOfBarriers = static_cast<uint8_t>(helper.getMaxBarrierRegisterPerSlice());
|
||||
EXPECT_EQ(1u, getMaxWorkGroupCount());
|
||||
|
||||
numberOfBarriers = 1;
|
||||
|
||||
Reference in New Issue
Block a user