fix: use required walk order when local ids are software generated

Signed-off-by: Zbigniew Zdanowicz <zbigniew.zdanowicz@intel.com>
This commit is contained in:
Zbigniew Zdanowicz 2024-04-18 18:45:19 +00:00 committed by Compute-Runtime-Automation
parent aadce445bf
commit c60b19a8ba
2 changed files with 55 additions and 1 deletions

View File

@ -398,13 +398,22 @@ ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
if (numChannels > 0) {
UNRECOVERABLE_IF(3 != numChannels);
std::array<uint8_t, 3> walkOrder{0, 1, 2};
if (kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder) {
walkOrder = {
kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]};
}
NEO::generateLocalIDs(
perThreadDataForWholeThreadGroup,
static_cast<uint16_t>(simdSize),
std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
static_cast<uint16_t>(groupSizeY),
static_cast<uint16_t>(groupSizeZ)}},
std::array<uint8_t, 3>{{0, 1, 2}},
walkOrder,
false, grfSize, grfCount, rootDeviceEnvironment);
}

View File

@ -8,6 +8,7 @@
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/test/common/helpers/raii_gfx_core_helper.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
@ -831,5 +832,49 @@ TEST_F(KernelImpTest, GivenGroupSizeRequiresSwLocalIdsGenerationWhenNextGroupSiz
EXPECT_EQ(0u, kernel.getPerThreadDataSize());
}
// Checks that the per-thread local id buffer produced by KernelImp::setGroupSize
// for a kernel that requires a workgroup walk order is byte-identical to a buffer
// generated directly by NEO::generateLocalIDs with that same order.
TEST_F(KernelImpTest, GivenGroupSizeRequiresSwLocalIdsGenerationWhenKernelSpecifiesRequiredWalkOrderThenUseCorrectOrderToGenerateLocalIds) {
    // Single source of truth for the values shared by the kernel setup and the reference generation.
    const std::array<uint8_t, 3> requiredWalkOrder{2, 1, 0};
    const std::array<uint16_t, 3> groupSize{{12, 12, 1}};

    Mock<Module> module(device, nullptr);
    Mock<::L0::KernelImp> kernel;
    kernel.module = &module;

    WhiteBox<::L0::KernelImmutableData> kernelInfo = {};
    NEO::KernelDescriptor descriptor;

    // Describe a SIMD32 kernel with three local id channels and a required, non-default walk order.
    auto &attributes = descriptor.kernelAttributes;
    attributes.numLocalIdChannels = 3;
    attributes.numGrfRequired = GrfConfig::defaultGrfNumber;
    attributes.simdSize = 32;
    attributes.flags.requiresWorkgroupWalkOrder = true;
    for (auto dim = 0u; dim < requiredWalkOrder.size(); ++dim) {
        attributes.workgroupWalkOrder[dim] = requiredWalkOrder[dim];
    }

    kernelInfo.kernelDescriptor = &descriptor;
    kernel.kernelImmData = &kernelInfo;

    // Override the local id generation decision so that hardware generation is not used.
    kernel.enableForcingOfGenerateLocalIdByHw = true;
    kernel.forceGenerateLocalIdByHw = false;

    kernel.KernelImp::setGroupSize(groupSize[0], groupSize[1], groupSize[2]);

    // Build the reference buffer with the same parameters and the required walk order.
    const auto grfSize = device->getHwInfo().capabilityTable.grfSize;
    const uint32_t perThreadSizeNeeded = kernel.getPerThreadDataSizeForWholeThreadGroup();
    auto expectedLocalIds = static_cast<uint8_t *>(alignedMalloc(perThreadSizeNeeded, 32));

    NEO::generateLocalIDs(
        expectedLocalIds,
        static_cast<uint16_t>(attributes.simdSize),
        groupSize,
        requiredWalkOrder,
        false, grfSize, GrfConfig::defaultGrfNumber, device->getNEODevice()->getRootDeviceEnvironment());

    EXPECT_EQ(0, memcmp(expectedLocalIds, kernel.KernelImp::getPerThreadData(), perThreadSizeNeeded));

    alignedFree(expectedLocalIds);
}
} // namespace ult
} // namespace L0