mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-05 09:09:04 +08:00
Add work_dim patching to l0 kernel
Related-To: NEO-5931 Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
f6443a2304
commit
62f89b174a
@@ -58,6 +58,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
|
||||
pThreadGroupDimensions->groupCountY,
|
||||
pThreadGroupDimensions->groupCountZ);
|
||||
kernel->patchWorkDim(pThreadGroupDimensions->groupCountX,
|
||||
pThreadGroupDimensions->groupCountY,
|
||||
pThreadGroupDimensions->groupCountZ);
|
||||
}
|
||||
|
||||
if (isIndirect && pThreadGroupDimensions) {
|
||||
|
||||
@@ -104,7 +104,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
commandListPreemptionMode = std::min(commandListPreemptionMode, functionPreemptionMode);
|
||||
|
||||
kernel->patchGlobalOffset();
|
||||
|
||||
if (isIndirect && pThreadGroupDimensions) {
|
||||
prepareIndirectParams(pThreadGroupDimensions);
|
||||
}
|
||||
@@ -112,6 +111,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
|
||||
kernel->setGroupCount(pThreadGroupDimensions->groupCountX,
|
||||
pThreadGroupDimensions->groupCountY,
|
||||
pThreadGroupDimensions->groupCountZ);
|
||||
kernel->patchWorkDim(pThreadGroupDimensions->groupCountX,
|
||||
pThreadGroupDimensions->groupCountY,
|
||||
pThreadGroupDimensions->groupCountZ);
|
||||
}
|
||||
NEO::GraphicsAllocation *eventAlloc = nullptr;
|
||||
uint64_t eventAddress = 0;
|
||||
|
||||
@@ -116,6 +116,8 @@ struct Kernel : _ze_kernel_handle_t, virtual NEO::DispatchKernelEncoderI {
|
||||
virtual ze_result_t setGlobalOffsetExp(uint32_t offsetX, uint32_t offsetY, uint32_t offsetZ) = 0;
|
||||
virtual uint32_t patchGlobalOffset() = 0;
|
||||
|
||||
virtual void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) = 0;
|
||||
|
||||
virtual ze_result_t suggestMaxCooperativeGroupCount(uint32_t *totalGroupCount) = 0;
|
||||
virtual ze_result_t setCacheConfig(ze_cache_config_flags_t flags) = 0;
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "shared/source/helpers/register_offsets.h"
|
||||
#include "shared/source/helpers/string.h"
|
||||
#include "shared/source/helpers/surface_format_info.h"
|
||||
#include "shared/source/kernel/kernel_arg_descriptor.h"
|
||||
#include "shared/source/kernel/kernel_descriptor.h"
|
||||
#include "shared/source/memory_manager/memory_manager.h"
|
||||
#include "shared/source/memory_manager/memory_operations_handler.h"
|
||||
@@ -888,6 +889,21 @@ uint32_t KernelImp::patchGlobalOffset() {
|
||||
return NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets);
|
||||
}
|
||||
|
||||
void KernelImp::patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) {
|
||||
const NEO::KernelDescriptor &kernelDescriptor = kernelImmData->getDescriptor();
|
||||
auto dataOffset = kernelDescriptor.payloadMappings.dispatchTraits.workDim;
|
||||
if (NEO::isValidOffset(dataOffset)) {
|
||||
auto destinationBuffer = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
|
||||
uint32_t workDim = 1;
|
||||
if (groupCountZ * groupSize[2] > 1) {
|
||||
workDim = 3;
|
||||
} else if (groupCountY * groupSize[1] > 1) {
|
||||
workDim = 2;
|
||||
}
|
||||
NEO::patchNonPointer(destinationBuffer, kernelDescriptor.payloadMappings.dispatchTraits.workDim, workDim);
|
||||
}
|
||||
}
|
||||
|
||||
Kernel *Kernel::create(uint32_t productFamily, Module *module,
|
||||
const ze_kernel_desc_t *desc, ze_result_t *res) {
|
||||
UNRECOVERABLE_IF(productFamily >= IGFX_MAX_PRODUCT);
|
||||
|
||||
@@ -126,6 +126,8 @@ struct KernelImp : Kernel {
|
||||
ze_result_t setGlobalOffsetExp(uint32_t offsetX, uint32_t offsetY, uint32_t offsetZ) override;
|
||||
uint32_t patchGlobalOffset() override;
|
||||
|
||||
void patchWorkDim(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) override;
|
||||
|
||||
ze_result_t setCacheConfig(ze_cache_config_flags_t flags) override;
|
||||
bool usesRayTracing() {
|
||||
return kernelImmData->getDescriptor().hasRTCalls();
|
||||
|
||||
@@ -591,44 +591,95 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenCommandListWhenAppendLaunchKernelS
|
||||
EXPECT_EQ(1u, event->getPacketsInUse());
|
||||
}
|
||||
|
||||
HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWorkGroupCountAndGlobalWorkSizeIsSetInCrossThreadData) {
|
||||
HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWorkGroupCountAndGlobalWorkSizeAndWorkDimIsSetInCrossThreadData) {
|
||||
using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
|
||||
using MI_LOAD_REGISTER_REG = typename FamilyType::MI_LOAD_REGISTER_REG;
|
||||
using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
|
||||
|
||||
Mock<::L0::Kernel> kernel;
|
||||
kernel.groupSize[0] = 2;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 2;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.globalWorkSize[0] = 2;
|
||||
kernel.descriptor.payloadMappings.dispatchTraits.workDim = 2;
|
||||
ze_result_t returnValue;
|
||||
std::unique_ptr<L0::CommandList> commandList(L0::CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue));
|
||||
|
||||
void *alloc = nullptr;
|
||||
ze_device_mem_alloc_desc_t deviceDesc = {};
|
||||
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
|
||||
ASSERT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
|
||||
|
||||
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
|
||||
static_cast<ze_group_count_t *>(alloc),
|
||||
nullptr, 0, nullptr);
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
|
||||
|
||||
kernel.groupSize[2] = 2;
|
||||
result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
|
||||
static_cast<ze_group_count_t *>(alloc),
|
||||
nullptr, 0, nullptr);
|
||||
EXPECT_EQ(result, ZE_RESULT_SUCCESS);
|
||||
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
|
||||
cmdList, ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), commandList->commandContainer.getCommandStream()->getUsed()));
|
||||
|
||||
auto itor = find<MI_STORE_REGISTER_MEM *>(cmdList.begin(), cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(itor, cmdList.end());
|
||||
EXPECT_NE(cmdList.end(), itor);
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor++; //MI_MATH_ALU_INST_INLINE doesn't have tagMI_COMMAND_OPCODE, can't find it in cmdList
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor++;
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor++;
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor++;
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor++;
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor++;
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end()); //kernel with groupSize[2] = 2
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_REG *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
itor = find<MI_LOAD_REGISTER_IMM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
itor = find<MI_STORE_REGISTER_MEM *>(++itor, cmdList.end());
|
||||
EXPECT_NE(itor, cmdList.end());
|
||||
|
||||
context->freeMem(alloc);
|
||||
}
|
||||
|
||||
@@ -1824,6 +1824,66 @@ HWTEST_F(KernelGlobalWorkOffsetTests, whenSettingGlobalOffsetThenCrossThreadData
|
||||
EXPECT_EQ(*(dst.begin() + desc.payloadMappings.dispatchTraits.globalWorkOffset[2]), globalOffsetz);
|
||||
}
|
||||
|
||||
using KernelWorkDimTests = Test<ModuleImmutableDataFixture>;
|
||||
|
||||
// Checks the work dimension that patchWorkDim stores in cross-thread data for
// a set of group size / group count combinations.
HWTEST_F(KernelWorkDimTests, givenGroupCountsWhenPatchingWorkDimThenCrossThreadDataIsPatched) {
    // Mock kernel that lets the test install a zero-filled cross-thread data
    // buffer of a chosen size.
    struct MockKernelWithMockCrossThreadData : public MockKernel {
      public:
        MockKernelWithMockCrossThreadData(MockModule *mockModule) : MockKernel(mockModule) {}
        void setCrossThreadData(uint32_t _crossThreadDataSize) {
            crossThreadData.reset(new uint8_t[_crossThreadDataSize]);
            crossThreadDataSize = _crossThreadDataSize;
            memset(crossThreadData.get(), 0x00, crossThreadDataSize);
        }
    };

    // One row of the expectation table.
    struct WorkDimCase {
        uint32_t groupSize[3];
        uint32_t groupCount[3];
        uint32_t expectedWorkDim;
    };

    uint32_t perHwThreadPrivateMemorySizeRequested = 32u;

    std::unique_ptr<MockImmutableData> mockKernelImmData =
        std::make_unique<MockImmutableData>(perHwThreadPrivateMemorySizeRequested);

    createModuleFromBinary(perHwThreadPrivateMemorySizeRequested, false, mockKernelImmData.get());
    auto kernel = std::make_unique<MockKernelWithMockCrossThreadData>(module.get());
    createKernel(kernel.get());
    kernel->setCrossThreadData(sizeof(uint32_t));

    // Patch before the workDim offset is assigned below; the buffer is then
    // expected to still hold zero.
    // NOTE(review): presumably the mock descriptor's workDim offset is
    // undefined at this point - confirm against MockImmutableData.
    kernel->patchWorkDim(1, 1, 1);

    mockKernelImmData->mockKernelDescriptor->payloadMappings.dispatchTraits.workDim = 0x0u;

    auto crossThreadView = ArrayRef<const uint8_t>(kernel->getCrossThreadData(), kernel->getCrossThreadDataSize());
    auto &descriptor = mockKernelImmData->getDescriptor();
    auto patchedWorkDim = crossThreadView.begin() + descriptor.payloadMappings.dispatchTraits.workDim;
    EXPECT_EQ(*patchedWorkDim, 0u);

    // {group size X, Y, Z}, {group count X, Y, Z}, expected work dimension
    const WorkDimCase cases[] = {
        {{2, 1, 1}, {1, 1, 1}, 1},
        {{1, 1, 1}, {1, 1, 1}, 1},
        {{1, 2, 1}, {2, 1, 1}, 2},
        {{1, 2, 1}, {1, 1, 1}, 2},
        {{1, 1, 1}, {1, 2, 1}, 2},
        {{1, 1, 1}, {2, 2, 2}, 3},
        {{1, 1, 2}, {1, 1, 1}, 3},
        {{1, 1, 1}, {1, 1, 2}, 3}};

    for (const auto &testCase : cases) {
        ze_result_t res = kernel->setGroupSize(testCase.groupSize[0], testCase.groupSize[1], testCase.groupSize[2]);
        EXPECT_EQ(res, ZE_RESULT_SUCCESS);

        kernel->patchWorkDim(testCase.groupCount[0], testCase.groupCount[1], testCase.groupCount[2]);
        EXPECT_EQ(*patchedWorkDim, testCase.expectedWorkDim);
    }
}
|
||||
|
||||
using KernelPrintHandlerTest = Test<ModuleFixture>;
|
||||
struct MyPrintfHandler : public PrintfHandler {
|
||||
static uint32_t getPrintfSurfaceInitialDataSize() {
|
||||
|
||||
Reference in New Issue
Block a user