Add work_dim patching to l0 kernel

Related-To: NEO-5931

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2021-06-25 13:35:30 +00:00
committed by Compute-Runtime-Automation
parent f6443a2304
commit 62f89b174a
11 changed files with 220 additions and 15 deletions

View File

@@ -58,6 +58,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
pThreadGroupDimensions->groupCountY,
pThreadGroupDimensions->groupCountZ);
kernel->patchWorkDim(pThreadGroupDimensions->groupCountX,
pThreadGroupDimensions->groupCountY,
pThreadGroupDimensions->groupCountZ);
}
if (isIndirect && pThreadGroupDimensions) {

View File

@@ -104,7 +104,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
commandListPreemptionMode = std::min(commandListPreemptionMode, functionPreemptionMode);
kernel->patchGlobalOffset();
if (isIndirect && pThreadGroupDimensions) {
prepareIndirectParams(pThreadGroupDimensions);
}
@@ -112,6 +111,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
pThreadGroupDimensions->groupCountY,
pThreadGroupDimensions->groupCountZ);
kernel->patchWorkDim(pThreadGroupDimensions->groupCountX,
pThreadGroupDimensions->groupCountY,
pThreadGroupDimensions->groupCountZ);
}
NEO::GraphicsAllocation *eventAlloc = nullptr;
uint64_t eventAddress = 0;

View File

@@ -116,6 +116,8 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
virtual ze_result_t setGlobalOffsetExp(uint32_t offsetX, uint32_t offsetY, uint32_t offsetZ) = 0;
virtual uint32_t patchGlobalOffset() = 0;
// Patches the work dimension for a dispatch using the given per-axis group counts.
// NOTE(review): the KernelImp implementation writes 1, 2 or 3 into cross-thread data
// based on group count and group size; other implementations are not visible here.
virtual void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) = 0;
virtual ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) = 0;
virtual ze_result_t setCacheConfig(ze_cache_config_flags_t flags) = 0;

View File

@@ -14,6 +14,7 @@
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/helpers/string.h"
#include "shared/source/helpers/surface_format_info.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/memory_operations_handler.h"
@@ -888,6 +889,21 @@ uint32_t KernelImp::patchGlobalOffset() {
return NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets);
}
// Writes the dispatch dimensionality (1, 2 or 3) into this kernel's cross-thread data.
//
// Nothing is written when the kernel descriptor does not carry a valid workDim offset.
// The value is 3 when the Z extent (groupCountZ * groupSize[2]) exceeds one, otherwise 2
// when the Y extent (groupCountY * groupSize[1]) exceeds one, otherwise 1.
// groupCountX is part of the interface but does not influence the result.
void KernelImp::patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) {
    const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
    auto dataOffset = kernelDescriptor.payloadMappings.dispatchTraits.workDim;
    if (!NEO::isValidOffset(dataOffset)) {
        return;
    }

    // Widen before multiplying: a 32-bit product can wrap around to 0 or 1 for very large
    // dispatches, which would make a multi-dimensional launch look one-dimensional.
    const uint64_t extentY = static_cast<uint64_t>(groupCountY) * groupSize[1];
    const uint64_t extentZ = static_cast<uint64_t>(groupCountZ) * groupSize[2];

    uint32_t workDim = 1;
    if (extentZ > 1) {
        workDim = 3;
    } else if (extentY > 1) {
        workDim = 2;
    }

    auto destinationBuffer = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    NEO::patchNonPointer(destinationBuffer, dataOffset, workDim);
}
Kernel *Kernel::create(uint32_t productFamily, Module *module,
const ze_kernel_desc_t *desc, ze_result_t *res) {
UNRECOVERABLE_IF(productFamily >= IGFX_MAX_PRODUCT);

View File

@@ -126,6 +126,8 @@ struct KernelImp : Kernel {
ze_result_t setGlobalOffsetExp(uint32_t offsetX, uint32_t offsetY, uint32_t offsetZ) override;
uint32_t patchGlobalOffset() override;
// Stores the work dimension (1, 2 or 3) in cross-thread data when the kernel descriptor
// has a valid workDim offset; derived from the Y/Z group counts and the current group size.
void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) override;
ze_result_t setCacheConfig(ze_cache_config_flags_t flags) override;
bool usesRayTracing() {
return kernelImmData->getDescriptor().hasRTCalls();

View File

@@ -591,44 +591,95 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenCommandListWhenAppendLaunchKernelS
EXPECT_EQ(1u, event->getPacketsInUse());
}
// Appends two indirect kernel launches to a command list (the second after setting
// groupSize[2] = 2), parses the resulting command stream and checks that the expected
// MI_STORE_REGISTER_MEM / MI_LOAD_REGISTER_REG / MI_LOAD_REGISTER_IMM commands can be
// found in order.
// NOTE(review): this span holds two consecutive HWTEST_F headers and several duplicated
// ASSERT/EXPECT lines; it looks like removed and added diff lines shown together without
// +/- markers - confirm against the actual file before relying on it.
HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWorkGroupCountAndGlobalWorkSizeIsSetInCrossThreadData) {
HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWorkGroupCountAndGlobalWorkSizeAndWorkDimIsSetInCrossThreadData) {
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
Mock<::L0::Kernel> kernel;
kernel.groupSize[0] = 2;
// Cross-thread data offsets for the dispatch traits patched during the indirect launch.
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2;
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2;
kernel.descriptor.payloadMappings.dispatchTraits.workDim = 2;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
// Device allocation passed as the ze_group_count_t source of the indirect launch.
void *alloc = nullptr;
ze_device_mem_alloc_desc_t deviceDesc = {};
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
static_cast<ze_group_count_t *>(alloc),
nullptr, 0, nullptr);
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
// Second launch with a Z group size of 2.
kernel.groupSize[2] = 2;
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
static_cast<ze_group_count_t *>(alloc),
nullptr, 0, nullptr);
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));
// Walk the parsed commands, each search continuing from the previous match.
auto itor = find<MI_STORE_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_REG *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_LOAD_REGISTER_IMM *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
EXPECT_NE(cmdList.end(), itor);
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor++; //MI_MATH_ALU_INST_INLINE doesn't have tagMI_COMMAND_OPCODE, can't find it in cmdList
EXPECT_NE(itor, cmdList.end());
itor++;
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor++;
EXPECT_NE(itor, cmdList.end());
itor++;
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor++;
EXPECT_NE(itor, cmdList.end());
itor++;
EXPECT_NE(itor, cmdList.end());
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end()); //kernel with groupSize[2] = 2
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
EXPECT_NE(itor, cmdList.end());
// Release the device allocation made for the indirect arguments.
context->freeMem(alloc);
}

View File

@@ -1824,6 +1824,66 @@ HWTEST_F(KernelGlobalWorkOffsetTests, whenSettingGlobalOffsetThenCrossThreadData
EXPECT_EQ(*(dst.begin() + desc.payloadMappings.dispatchTraits.globalWorkOffset[2]), globalOffsetz);
}
using KernelWorkDimTests = Test<ModuleImmutableDataFixture>;
// Verifies that patchWorkDim() stores the expected dimensionality in cross-thread data
// for a table of group size / group count combinations.
HWTEST_F(KernelWorkDimTests, givenGroupCountsWhenPatchingWorkDimThenCrossThreadDataIsPatched) {
    // Kernel mock that lets the test install a zero-filled cross-thread data buffer.
    struct MockKernelWithMockCrossThreadData : public MockKernel {
      public:
        MockKernelWithMockCrossThreadData(MockModule *mockModule) : MockKernel(mockModule) {}
        void setCrossThreadData(uint32_t sizeInBytes) {
            crossThreadDataSize = sizeInBytes;
            crossThreadData.reset(new uint8_t[sizeInBytes]);
            memset(crossThreadData.get(), 0x00, crossThreadDataSize);
        }
    };

    // One scenario: group size, group count and the work dimension expected to be patched.
    struct WorkDimCase {
        uint32_t groupSize[3];
        uint32_t groupCount[3];
        uint32_t expectedWorkDim;
    };

    uint32_t perHwThreadPrivateMemorySizeRequested = 32u;
    auto mockKernelImmData = std::make_unique<MockImmutableData>(perHwThreadPrivateMemorySizeRequested);
    createModuleFromBinary(perHwThreadPrivateMemorySizeRequested, false, mockKernelImmData.get());

    auto kernel = std::make_unique<MockKernelWithMockCrossThreadData>(module.get());
    createKernel(kernel.get());
    kernel->setCrossThreadData(sizeof(uint32_t));

    // First call happens before the workDim offset is assigned below; the zeroed buffer is
    // expected to stay untouched (presumably the offset is still invalid here - verify).
    kernel->patchWorkDim(1, 1, 1);

    mockKernelImmData->mockKernelDescriptor->payloadMappings.dispatchTraits.workDim = 0x0u;
    auto &kernelDescriptor = mockKernelImmData->getDescriptor();
    auto crossThreadView = ArrayRef<const uint8_t>(kernel->getCrossThreadData(), kernel->getCrossThreadDataSize());
    auto patchedWorkDim = crossThreadView.begin() + kernelDescriptor.payloadMappings.dispatchTraits.workDim;
    EXPECT_EQ(*patchedWorkDim, 0u);

    const WorkDimCase cases[] = {
        {{2u, 1u, 1u}, {1u, 1u, 1u}, 1u},
        {{1u, 1u, 1u}, {1u, 1u, 1u}, 1u},
        {{1u, 2u, 1u}, {2u, 1u, 1u}, 2u},
        {{1u, 2u, 1u}, {1u, 1u, 1u}, 2u},
        {{1u, 1u, 1u}, {1u, 2u, 1u}, 2u},
        {{1u, 1u, 1u}, {2u, 2u, 2u}, 3u},
        {{1u, 1u, 2u}, {1u, 1u, 1u}, 3u},
        {{1u, 1u, 1u}, {1u, 1u, 2u}, 3u}};

    for (const auto &testCase : cases) {
        ze_result_t res = kernel->setGroupSize(testCase.groupSize[0], testCase.groupSize[1], testCase.groupSize[2]);
        EXPECT_EQ(res, ZE_RESULT_SUCCESS);

        kernel->patchWorkDim(testCase.groupCount[0], testCase.groupCount[1], testCase.groupCount[2]);
        EXPECT_EQ(*patchedWorkDim, testCase.expectedWorkDim);
    }
}
using KernelPrintHandlerTest = Test<ModuleFixture>;
struct MyPrintfHandler : public PrintfHandler {
static uint32_t getPrintfSurfaceInitialDataSize() {