feature: add support for ImplicitArgs versioning

- define ImplicitArgs version 1

Related-To: NEO-14115

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe
2025-02-25 09:01:14 +00:00
committed by Compute-Runtime-Automation
parent beb859a53a
commit 291745cdf7
23 changed files with 706 additions and 155 deletions

View File

@@ -250,7 +250,11 @@ ze_result_t KernelImp::getBaseAddress(uint64_t *baseAddress) {
return ZE_RESULT_SUCCESS;
}
KernelImp::KernelImp(Module *module) : module(module) {}
// Constructs a kernel bound to the given module. When a module is supplied,
// the ImplicitArgs version is read from the GfxCoreHelper of the module's
// device and stored in implicitArgsVersion; with a null module the member is
// not assigned here.
KernelImp::KernelImp(Module *module) : module(module) {
if (module) {
this->implicitArgsVersion = module->getDevice()->getGfxCoreHelper().getImplicitArgsVersion();
}
}
KernelImp::~KernelImp() {
if (nullptr != privateMemoryGraphicsAllocation) {
@@ -321,15 +325,9 @@ void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32
}
if (pImplicitArgs) {
pImplicitArgs->numWorkDim = workDim;
pImplicitArgs->globalSizeX = globalWorkSize[0];
pImplicitArgs->globalSizeY = globalWorkSize[1];
pImplicitArgs->globalSizeZ = globalWorkSize[2];
pImplicitArgs->groupCountX = groupCount[0];
pImplicitArgs->groupCountY = groupCount[1];
pImplicitArgs->groupCountZ = groupCount[2];
pImplicitArgs->setNumWorkDim(workDim);
pImplicitArgs->setGlobalSize(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
pImplicitArgs->setGroupCount(groupCount[0], groupCount[1], groupCount[2]);
}
}
@@ -1128,9 +1126,8 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
pImplicitArgs = std::make_unique<NEO::ImplicitArgs>();
*pImplicitArgs = {};
pImplicitArgs->structSize = NEO::ImplicitArgs::getSize();
pImplicitArgs->structVersion = 0;
pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize;
pImplicitArgs->initializeHeader(this->implicitArgsVersion);
pImplicitArgs->setSimdWidth(kernelDescriptor.kernelAttributes.simdSize);
}
if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] > 0) {
@@ -1209,7 +1206,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
static_cast<uintptr_t>(address));
}
if (this->pImplicitArgs) {
pImplicitArgs->rtGlobalBufferPtr = address;
pImplicitArgs->setRtGlobalBufferPtr(address);
}
this->internalResidencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
@@ -1228,7 +1225,7 @@ void KernelImp::createPrintfBuffer() {
static_cast<uintptr_t>(this->printfBuffer->getGpuAddressToPatch()));
}
if (pImplicitArgs) {
pImplicitArgs->printfBufferPtr = printfBuffer->getGpuAddress();
pImplicitArgs->setPrintfBuffer(printfBuffer->getGpuAddress());
}
this->devicePrintfKernelMutex = &(static_cast<DeviceImp *>(this->module->getDevice())->printfKernelMutex);
}
@@ -1298,9 +1295,7 @@ void KernelImp::patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize2, workgroupSize);
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.enqueuedLocalWorkSize, workgroupSize);
if (pImplicitArgs) {
pImplicitArgs->localSizeX = x;
pImplicitArgs->localSizeY = y;
pImplicitArgs->localSizeZ = z;
pImplicitArgs->setLocalSize(x, y, z);
}
}
@@ -1319,9 +1314,7 @@ void KernelImp::patchGlobalOffset() {
auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets);
if (pImplicitArgs) {
pImplicitArgs->globalOffsetX = globalOffsets[0];
pImplicitArgs->globalOffsetY = globalOffsets[1];
pImplicitArgs->globalOffsetZ = globalOffsets[2];
pImplicitArgs->setGlobalOffset(globalOffsets[0], globalOffsets[1], globalOffsets[2]);
}
}
@@ -1392,7 +1385,7 @@ void KernelImp::setAssertBuffer() {
this->internalResidencyContainer.push_back(assertHandler->getAssertBuffer());
if (pImplicitArgs) {
pImplicitArgs->assertBufferPtr = static_cast<uintptr_t>(assertHandler->getAssertBuffer()->getGpuAddressToPatch());
pImplicitArgs->setAssertBufferPtr(static_cast<uintptr_t>(assertHandler->getAssertBuffer()->getGpuAddressToPatch()));
}
}

View File

@@ -252,6 +252,7 @@ struct KernelImp : Kernel {
const KernelImmutableData *kernelImmData = nullptr;
Module *module = nullptr;
uint32_t implicitArgsVersion = 0;
typedef ze_result_t (KernelImp::*KernelArgHandler)(uint32_t argIndex, size_t argSize, const void *argVal);
std::vector<KernelArgInfo> kernelArgInfos;

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -98,6 +98,7 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
using KernelImp::unifiedMemoryControls;
// Forwards the mock module to the WhiteBox<L0::KernelImp> base and then sets
// implicitArgsVersion to 0.
// NOTE(review): presumably this pins the fixture to the version-0 ImplicitArgs
// layout regardless of what the device reports — confirm against the tests.
MockKernel(MockModule *mockModule) : WhiteBox<L0::KernelImp>(mockModule) {
implicitArgsVersion = 0;
}
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2024 Intel Corporation
* Copyright (C) 2020-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -137,7 +137,11 @@ TEST(KernelAssert, GivenKernelWithAssertAndImplicitArgsWhenInitializingKernelThe
auto assertBufferAddress = assertHandler->getAssertBuffer()->getGpuAddressToPatch();
auto implicitArgs = kernel.getImplicitArgs();
ASSERT_NE(nullptr, implicitArgs);
EXPECT_EQ(assertBufferAddress, implicitArgs->assertBufferPtr);
if (implicitArgs->v0.header.structVersion == 0) {
EXPECT_EQ(assertBufferAddress, implicitArgs->v0.assertBufferPtr);
} else if (implicitArgs->v1.header.structVersion == 1) {
EXPECT_EQ(assertBufferAddress, implicitArgs->v1.assertBufferPtr);
}
}
TEST(KernelAssert, GivenNoAssertHandlerWhenKernelWithAssertSetsAssertBufferThenAssertHandlerIsCreated) {

View File

@@ -821,8 +821,10 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
void SetUp() override {
CmdlistAppendLaunchKernelTests::SetUp();
memset(&expectedImplicitArgs, 0, sizeof(ImplicitArgs));
expectedImplicitArgs.structSize = ImplicitArgs::getSize();
memset(&expectedImplicitArgs, 0, sizeof(expectedImplicitArgs));
expectedImplicitArgs.header.structSize = ImplicitArgsV0::getSize();
expectedImplicitArgs.header.structVersion = 0;
expectedImplicitArgs.numWorkDim = 3;
expectedImplicitArgs.simdWidth = 32;
@@ -887,7 +889,7 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(reinterpret_cast<const ImplicitArgs *>(&expectedImplicitArgs), *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, NEO::EncodeDispatchKernel<FamilyType>::getDefaultIOHAlignment()));
@@ -899,7 +901,7 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
}
std::unique_ptr<L0::CommandList> commandList;
GraphicsAllocation *indirectHeapAllocation = nullptr;
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
ImplicitArgsV0 expectedImplicitArgs = {};
std::array<uint8_t, 3> workgroupDimOrder{0, 1, 2};
uint32_t implicitArgsProgrammingSize = 0u;
@@ -919,27 +921,27 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
HWCMDTEST_F(IGFX_GEN12LP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, givenPreXeHpPlatformWhenAppendLaunchKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithoutLocalIds) {
dispatchKernelWithImplicitArgs<FamilyType>();
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgs::getSize(), 64));
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), 64));
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
@@ -966,18 +968,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, givenXeHpAndLaterPlatformWhenAppendLaunchKernelWithImplicitArgsAndSimd1ThenLocalIdsAreGeneratedCorrectly) {
@@ -999,12 +1001,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) {

View File

@@ -938,7 +938,7 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
return implicitArgsProgrammingSize - ImplicitArgs::getSize();
return implicitArgsProgrammingSize - kernel.pImplicitArgs->v0.header.structSize;
} else {
return 0u;
}
@@ -956,6 +956,8 @@ HWTEST_F(CommandListAppendLaunchKernelWithImplicitArgs, givenIndirectDispatchWit
kernel.module = pMockModule.get();
kernel.immutableData.crossThreadDataSize = sizeof(uint64_t);
kernel.pImplicitArgs.reset(new ImplicitArgs());
kernel.pImplicitArgs->v0.header.structVersion = 0;
kernel.pImplicitArgs->v0.header.structSize = ImplicitArgsV0::getSize();
UnitTestHelper<FamilyType>::adjustKernelDescriptorForImplicitArgs(*kernel.immutableData.kernelDescriptor);
kernel.setGroupSize(1, 1, 1);
@@ -981,27 +983,27 @@ HWTEST_F(CommandListAppendLaunchKernelWithImplicitArgs, givenIndirectDispatchWit
auto groupCountXStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
groupCountXStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::gpgpuDispatchDimX);
groupCountXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, groupCountX));
groupCountXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, groupCountX));
auto groupCountYStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
groupCountYStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::gpgpuDispatchDimY);
groupCountYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, groupCountY));
groupCountYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, groupCountY));
auto groupCountZStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
groupCountZStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::gpgpuDispatchDimZ);
groupCountZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, groupCountZ));
groupCountZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, groupCountZ));
auto globalSizeXStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
globalSizeXStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::csGprR1);
globalSizeXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, globalSizeX));
globalSizeXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, globalSizeX));
auto globalSizeYStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
globalSizeYStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::csGprR1);
globalSizeYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, globalSizeY));
globalSizeYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, globalSizeY));
auto globalSizeZStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
globalSizeZStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::csGprR1);
globalSizeZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, globalSizeZ));
globalSizeZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, globalSizeZ));
GenCmdList cmdList;
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(

View File

@@ -1238,7 +1238,7 @@ HWTEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitializ
EXPECT_NE(nullptr, rtDispatchGlobals);
auto implicitArgs = kernel->getImplicitArgs();
ASSERT_NE(nullptr, implicitArgs);
EXPECT_EQ_VAL(implicitArgs->rtGlobalBufferPtr, rtDispatchGlobals->rtDispatchGlobalsArray->getGpuAddressToPatch());
EXPECT_EQ_VAL(implicitArgs->v0.rtGlobalBufferPtr, rtDispatchGlobals->rtDispatchGlobalsArray->getGpuAddressToPatch());
auto &residencyContainer = kernel->getInternalResidencyContainer();
@@ -4060,8 +4060,8 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenInitializeThenPrin
auto printfSurface = kernel->getPrintfBufferAllocation();
ASSERT_NE(nullptr, printfSurface);
EXPECT_NE(0u, pImplicitArgs->printfBufferPtr);
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->printfBufferPtr);
EXPECT_NE(0u, pImplicitArgs->v0.printfBufferPtr);
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->v0.printfBufferPtr);
}
TEST_F(KernelImplicitArgTests, givenImplicitArgsRequiredWhenCreatingKernelThenImplicitArgsAreCreated) {
@@ -4080,8 +4080,8 @@ TEST_F(KernelImplicitArgTests, givenImplicitArgsRequiredWhenCreatingKernelThenIm
auto pImplicitArgs = kernel->getImplicitArgs();
ASSERT_NE(nullptr, pImplicitArgs);
EXPECT_EQ(ImplicitArgs::getSize(), pImplicitArgs->structSize);
EXPECT_EQ(0u, pImplicitArgs->structVersion);
EXPECT_EQ(ImplicitArgsV0::getSize(), pImplicitArgs->v0.header.structSize);
EXPECT_EQ(0u, pImplicitArgs->v0.header.structVersion);
}
TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParamsThenImplicitArgsAreUpdated) {
@@ -4100,7 +4100,7 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParam
auto pImplicitArgs = kernel->getImplicitArgs();
ASSERT_NE(nullptr, pImplicitArgs);
ImplicitArgs expectedImplicitArgs{ImplicitArgs::getSize()};
ImplicitArgsV0 expectedImplicitArgs{{ImplicitArgsV0::getSize(), 0}};
expectedImplicitArgs.numWorkDim = 3;
expectedImplicitArgs.simdWidth = simd;
@@ -4122,7 +4122,7 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParam
kernel->setGroupCount(3, 2, 1);
kernel->setGlobalOffsetExp(1, 2, 3);
kernel->patchGlobalOffset();
EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
using BindlessKernelTest = Test<DeviceFixture>;

View File

@@ -1928,7 +1928,7 @@ TEST_F(ModuleDynamicLinkTests, givenModuleWithInternalRelocationAndUnresolvedExt
uint32_t internalRelocationOffset = 0x10;
linkerInput->textRelocations.push_back({{implicitArgsRelocationSymbolName, internalRelocationOffset, LinkerInput::RelocationInfo::Type::address, SegmentType::instructions}});
uint32_t expectedInternalRelocationValue = ImplicitArgs::getSize();
uint32_t expectedInternalRelocationValue = ImplicitArgsV0::getSize();
uint32_t externalRelocationOffset = 0x20;
constexpr auto externalSymbolName = "unresolved";
@@ -4795,7 +4795,7 @@ TEST_F(ModuleTests, givenImplicitArgsRelocationAndStackCallsWhenLinkingModuleThe
auto status = pModule->linkBinary();
EXPECT_TRUE(status);
EXPECT_EQ(ImplicitArgs::getSize(), *reinterpret_cast<uint32_t *>(ptrOffset(isaCpuPtr, 0x8)));
EXPECT_EQ(ImplicitArgsV0::getSize(), *reinterpret_cast<uint32_t *>(ptrOffset(isaCpuPtr, 0x8)));
EXPECT_TRUE(kernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
}

View File

@@ -69,13 +69,25 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
auto pImplicitArgs = kernel.getImplicitArgs();
if (pImplicitArgs) {
pImplicitArgs->localIdTablePtr = indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData;
size_t localWorkSize[3] = {0u, 0u, 0u};
pImplicitArgs->setLocalIdTablePtr(indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData);
if (pImplicitArgs->v0.header.structVersion == 0) {
localWorkSize[0] = pImplicitArgs->v0.localSizeX;
localWorkSize[1] = pImplicitArgs->v0.localSizeY;
localWorkSize[2] = pImplicitArgs->v0.localSizeZ;
} else if (pImplicitArgs->v1.header.structVersion == 1) {
localWorkSize[0] = pImplicitArgs->v1.localSizeX;
localWorkSize[1] = pImplicitArgs->v1.localSizeY;
localWorkSize[2] = pImplicitArgs->v1.localSizeZ;
} else {
UNRECOVERABLE_IF(true);
}
const auto &kernelDescriptor = kernel.getDescriptor();
const auto &kernelAttributes = kernelDescriptor.kernelAttributes;
uint32_t requiredWalkOrder = 0u;
size_t localWorkSize[3] = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
auto generationOfLocalIdsByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
3,
localWorkSize,

View File

@@ -205,9 +205,8 @@ cl_int Kernel::initialize() {
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
pImplicitArgs = std::make_unique<ImplicitArgs>();
*pImplicitArgs = {};
pImplicitArgs->structSize = ImplicitArgs::getSize();
pImplicitArgs->structVersion = 0;
pImplicitArgs->simdWidth = maxSimdSize;
pImplicitArgs->initializeHeader(gfxCoreHelper.getImplicitArgsVersion());
pImplicitArgs->setSimdWidth(maxSimdSize);
}
auto ret = KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(kernelDescriptor.kernelAttributes, &pClDevice->getDevice());
if (ret == NEO::KernelHelper::ErrorCode::invalidKernel) {
@@ -457,7 +456,7 @@ cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
}
if (pImplicitArgs) {
memcpy_s(pImplicitArgs.get(), ImplicitArgs::getSize(), pSourceKernel->getImplicitArgs(), ImplicitArgs::getSize());
memcpy_s(pImplicitArgs.get(), pImplicitArgs->getSize(), pSourceKernel->getImplicitArgs(), pImplicitArgs->getSize());
}
this->isBuiltIn = pSourceKernel->isBuiltIn;
@@ -2275,7 +2274,7 @@ const HardwareInfo &Kernel::getHardwareInfo() const {
void Kernel::setWorkDim(uint32_t workDim) {
patchNonPointer<uint32_t, uint32_t>(getCrossThreadDataRef(), getDescriptor().payloadMappings.dispatchTraits.workDim, workDim);
if (pImplicitArgs) {
pImplicitArgs->numWorkDim = workDim;
pImplicitArgs->setNumWorkDim(workDim);
}
}
@@ -2284,9 +2283,7 @@ void Kernel::setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t glob
getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset,
{globalWorkOffsetX, globalWorkOffsetY, globalWorkOffsetZ});
if (pImplicitArgs) {
pImplicitArgs->globalOffsetX = globalWorkOffsetX;
pImplicitArgs->globalOffsetY = globalWorkOffsetY;
pImplicitArgs->globalOffsetZ = globalWorkOffsetZ;
pImplicitArgs->setGlobalOffset(globalWorkOffsetX, globalWorkOffsetY, globalWorkOffsetZ);
}
}
@@ -2295,9 +2292,7 @@ void Kernel::setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWo
getDescriptor().payloadMappings.dispatchTraits.globalWorkSize,
{globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ});
if (pImplicitArgs) {
pImplicitArgs->globalSizeX = globalWorkSizeX;
pImplicitArgs->globalSizeY = globalWorkSizeY;
pImplicitArgs->globalSizeZ = globalWorkSizeZ;
pImplicitArgs->setGlobalSize(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ);
}
}
@@ -2306,9 +2301,7 @@ void Kernel::setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkS
getDescriptor().payloadMappings.dispatchTraits.localWorkSize,
{localWorkSizeX, localWorkSizeY, localWorkSizeZ});
if (pImplicitArgs) {
pImplicitArgs->localSizeX = localWorkSizeX;
pImplicitArgs->localSizeY = localWorkSizeY;
pImplicitArgs->localSizeZ = localWorkSizeZ;
pImplicitArgs->setLocalSize(localWorkSizeX, localWorkSizeY, localWorkSizeZ);
}
}
@@ -2329,9 +2322,7 @@ void Kernel::setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGro
getDescriptor().payloadMappings.dispatchTraits.numWorkGroups,
{numWorkGroupsX, numWorkGroupsY, numWorkGroupsZ});
if (pImplicitArgs) {
pImplicitArgs->groupCountX = numWorkGroupsX;
pImplicitArgs->groupCountY = numWorkGroupsY;
pImplicitArgs->groupCountZ = numWorkGroupsZ;
pImplicitArgs->setGroupCount(numWorkGroupsX, numWorkGroupsY, numWorkGroupsZ);
}
}

View File

@@ -70,7 +70,7 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo)
}
auto pImplicitArgs = kernel->getImplicitArgs();
if (pImplicitArgs) {
pImplicitArgs->printfBufferPtr = printfSurface->getGpuAddress();
pImplicitArgs->setPrintfBuffer(printfSurface->getGpuAddress());
}
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -1169,8 +1169,8 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
ClDeviceFixture::setUp();
indirectHeapAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
memset(&expectedImplicitArgs, 0, sizeof(ImplicitArgs));
expectedImplicitArgs.structSize = ImplicitArgs::getSize();
memset(&expectedImplicitArgs, 0, sizeof(expectedImplicitArgs));
expectedImplicitArgs.header.structSize = ImplicitArgsV0::getSize();
expectedImplicitArgs.numWorkDim = 3;
expectedImplicitArgs.simdWidth = 32;
@@ -1242,7 +1242,7 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
}
}
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}};
GraphicsAllocation *indirectHeapAllocation = nullptr;
std::array<uint8_t, 3> workgroupDimOrder{0, 1, 2};
uint32_t implicitArgsProgrammingSize = 0u;
@@ -1251,18 +1251,18 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenXeHpAndLaterPlatformWhenSendingIndirectStateForKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithLocalIds) {
dispatchKernelWithImplicitArgs<FamilyType>();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
HWCMDTEST_F(IGFX_GEN12LP_CORE, HardwareCommandsImplicitArgsTests, givenPreXeHpPlatformWhenSendingIndirectStateForKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithoutLocalIds) {
dispatchKernelWithImplicitArgs<FamilyType>();
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgs::getSize(), MemoryConstants::cacheLineSize));
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize));
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
@@ -1287,18 +1287,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsAndHwLocalIdsGenerationWhenSendingIndirectStateThenLocalIdsAreGeneratedAndCorrectlyProgrammedInCrossThreadData) {
@@ -1322,18 +1322,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
alignedFree(expectedLocalIds);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingIndirectStateWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
@@ -1355,12 +1355,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest;

View File

@@ -22,6 +22,7 @@
#include "shared/test/common/fixtures/memory_management_fixture.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/gtest_helpers.h"
#include "shared/test/common/helpers/raii_gfx_core_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
@@ -3914,8 +3915,8 @@ TEST_F(KernelImplicitArgsTest, WhenKernelRequiresImplicitArgsThenImplicitArgsStr
ASSERT_NE(nullptr, pImplicitArgs);
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize(), 0, 0, 32};
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgs::getSize()));
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}, 0, 32};
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgsV0::getSize()));
}
}
@@ -3933,7 +3934,7 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenSettingKernelParam
ASSERT_NE(nullptr, pImplicitArgs);
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}};
expectedImplicitArgs.numWorkDim = 3;
expectedImplicitArgs.simdWidth = 32;
expectedImplicitArgs.localSizeX = 4;
@@ -3955,7 +3956,53 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenSettingKernelParam
kernel.setGlobalWorkOffsetValues(1, 2, 3);
kernel.setNumWorkGroupsValues(3, 2, 1);
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgsV0::getSize()));
}
// Verifies that a kernel created while the GfxCoreHelper reports implicit args
// version 1 keeps its implicit args in the ImplicitArgsV1 layout and that the
// kernel setters land in the matching V1 fields.
HWTEST_F(KernelImplicitArgsTest, givenGfxCoreRequiringImplicitArgsV1WhenSettingKernelParamsThenImplicitArgsAreProperlySet) {
    // Core helper override that forces implicit args version 1.
    struct MockGfxCoreHelper : NEO::GfxCoreHelperHw<FamilyType> {
        uint32_t getImplicitArgsVersion() const override {
            return 1;
        }
    };

    auto kernelInfo = std::make_unique<MockKernelInfo>();
    kernelInfo->kernelDescriptor.kernelAttributes.simdSize = 32;
    kernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;

    // The mock helper has to be installed before the kernel is created.
    auto &rootDeviceEnvironment = *pClDevice->getDevice().getExecutionEnvironment()->rootDeviceEnvironments[0];
    RAIIGfxCoreHelperFactory<MockGfxCoreHelper> gfxCoreHelperBackup(rootDeviceEnvironment);

    MockContext context(pClDevice);
    MockProgram program(&context, false, toClDeviceVector(*pClDevice));
    MockKernel kernel(&program, *kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    auto implicitArgs = kernel.getImplicitArgs();
    ASSERT_NE(nullptr, implicitArgs);

    // Reference V1 structure; only the header and the fields set below are non-zero.
    ImplicitArgsV1 expected = {{ImplicitArgsV1::getSize(), 1}};
    expected.numWorkDim = 3;

    expected.localSizeX = 4;
    expected.localSizeY = 5;
    expected.localSizeZ = 6;

    expected.globalSizeX = 7;
    expected.globalSizeY = 8;
    expected.globalSizeZ = 9;

    expected.globalOffsetX = 1;
    expected.globalOffsetY = 2;
    expected.globalOffsetZ = 3;

    expected.groupCountX = 3;
    expected.groupCountY = 2;
    expected.groupCountZ = 1;

    kernel.setWorkDim(3);
    kernel.setLocalWorkSizeValues(4, 5, 6);
    kernel.setGlobalWorkSizeValues(7, 8, 9);
    kernel.setGlobalWorkOffsetValues(1, 2, 3);
    kernel.setNumWorkGroupsValues(3, 2, 1);

    EXPECT_EQ(0, memcmp(&expected, implicitArgs, ImplicitArgsV1::getSize()));
}
TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenCloneKernelThenImplicitArgsAreCopied) {
@@ -3971,7 +4018,7 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenCloneKernelThenImp
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
ASSERT_EQ(CL_SUCCESS, kernel2.initialize());
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}};
expectedImplicitArgs.numWorkDim = 3;
expectedImplicitArgs.simdWidth = 32;
expectedImplicitArgs.localSizeX = 4;
@@ -3999,7 +4046,7 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenCloneKernelThenImp
ASSERT_NE(nullptr, pImplicitArgs);
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgs::getSize()));
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgsV0::getSize()));
}
TEST_F(KernelImplicitArgsTest, givenKernelWithoutImplicitArgsWhenSettingKernelParamsThenImplicitArgsAreNotSet) {

View File

@@ -112,7 +112,7 @@ TEST_F(PrintfHandlerTests, givenKernelWithImplicitArgsWhenPreparingPrintfHandler
auto pImplicitArgs = kernel.getImplicitArgs();
ASSERT_NE(nullptr, pImplicitArgs);
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->printfBufferPtr);
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->v0.printfBufferPtr);
}
HWTEST_F(PrintfHandlerTests, givenEnabledStatelessCompressionWhenPrintEnqueueOutputIsCalledThenBCSEngineIsUsedToDecompressPrintfOutput) {

View File

@@ -577,11 +577,22 @@ void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t
UNRECOVERABLE_IF(NEO::isValidOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && (kernelDescriptor.payloadMappings.dispatchTraits.workDim & 0b11) != 0u);
setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
if (implicitArgsGpuPtr) {
CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgs, groupCountX), offsetof(ImplicitArgs, groupCountY), offsetof(ImplicitArgs, groupCountZ)};
CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgs, globalSizeX), offsetof(ImplicitArgs, globalSizeY), offsetof(ImplicitArgs, globalSizeZ)};
setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr);
setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
setWorkDimIndirect(container, offsetof(ImplicitArgs, numWorkDim), implicitArgsGpuPtr, dispatchInterface->getGroupSize());
const auto version = container.getDevice()->getGfxCoreHelper().getImplicitArgsVersion();
if (version == 0) {
constexpr CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgsV0, groupCountX), offsetof(ImplicitArgsV0, groupCountY), offsetof(ImplicitArgsV0, groupCountZ)};
constexpr CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgsV0, globalSizeX), offsetof(ImplicitArgsV0, globalSizeY), offsetof(ImplicitArgsV0, globalSizeZ)};
constexpr auto numWorkDimOffset = offsetof(ImplicitArgsV0, numWorkDim);
setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr);
setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
setWorkDimIndirect(container, numWorkDimOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
} else if (version == 1) {
constexpr CrossThreadDataOffset groupCountOffsetV1[] = {offsetof(ImplicitArgsV1, groupCountX), offsetof(ImplicitArgsV1, groupCountY), offsetof(ImplicitArgsV1, groupCountZ)};
constexpr CrossThreadDataOffset globalSizeOffsetV1[] = {offsetof(ImplicitArgsV1, globalSizeX), offsetof(ImplicitArgsV1, globalSizeY), offsetof(ImplicitArgsV1, globalSizeZ)};
constexpr auto numWorkDimOffsetV1 = offsetof(ImplicitArgsV1, numWorkDim);
setGroupCountIndirect(container, groupCountOffsetV1, implicitArgsGpuPtr);
setGlobalWorkSizeIndirect(container, globalSizeOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
}
}
}

View File

@@ -264,7 +264,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
if (pImplicitArgs) {
offsetThreadData -= sizeForImplicitArgsStruct;
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
pImplicitArgs->setLocalIdTablePtr(heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize);
EncodeDispatchKernel<Family>::patchScratchAddressInImplicitArgs<heaplessModeEnabled>(*pImplicitArgs, scratchAddressForImmediatePatching, args.immediateScratchAddressPatching);
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment, &args.outImplicitArgsPtr);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -656,7 +656,15 @@ void Linker::resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, De
UNRECOVERABLE_IF(!pDevice);
kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = kernelDescriptor.kernelAttributes.flags.useStackCalls || pDevice->getDebugger() != nullptr;
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
*pImplicitArgsReloc = ImplicitArgs::getSize();
auto implicitArgsSize = 0;
if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 0) {
implicitArgsSize = ImplicitArgsV0::getSize();
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
implicitArgsSize = ImplicitArgsV1::getSize();
} else {
UNRECOVERABLE_IF(true);
}
*pImplicitArgsReloc = implicitArgsSize;
}
}
}

View File

@@ -198,6 +198,7 @@ class GfxCoreHelper {
virtual uint32_t getDeviceTimestampWidth() const = 0;
virtual void alignThreadGroupCountToDssSize(uint32_t &threadCount, uint32_t dssCount, uint32_t threadsPerDss, uint32_t threadGroupSize) const = 0;
virtual uint32_t getImplicitArgsVersion() const = 0;
virtual ~GfxCoreHelper() = default;
@@ -436,6 +437,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
bool usmCompressionSupported(const NEO::HardwareInfo &hwInfo) const override;
uint32_t getDeviceTimestampWidth() const override;
uint32_t getImplicitArgsVersion() const override;
~GfxCoreHelperHw() override = default;

View File

@@ -833,4 +833,9 @@ uint32_t GfxCoreHelperHw<Family>::getInternalCopyEngineIndex(const HardwareInfo
return std::min(defaultInternalCopyEngineIndex, highestAvailableIndex);
}
// Generic implementation: reports implicit args version 0.
template <typename Family>
uint32_t GfxCoreHelperHw<Family>::getImplicitArgsVersion() const {
    constexpr uint32_t defaultImplicitArgsVersion = 0u;
    return defaultImplicitArgsVersion;
}
} // namespace NEO

View File

@@ -11,13 +11,20 @@
#include <cstddef>
#include <cstdint>
#include <optional>
#include <type_traits>
namespace NEO {
struct alignas(32) ImplicitArgs {
// Two-byte prefix placed at the start of every versioned implicit args structure.
struct alignas(1) ImplicitArgsHeader {
    // Size in bytes of the versioned structure, as reported by its getSize().
    uint8_t structSize;
    // Layout version of the structure that follows this header.
    uint8_t structVersion;
};

static_assert(sizeof(ImplicitArgsHeader) == 2u, "ImplicitArgsHeader must stay exactly two bytes");
struct alignas(32) ImplicitArgsV0 {
ImplicitArgsHeader header;
uint8_t numWorkDim;
uint8_t simdWidth;
uint32_t localSizeX;
@@ -39,12 +46,195 @@ struct alignas(32) ImplicitArgs {
uint64_t assertBufferPtr;
uint8_t reserved[16];
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgs, reserved))); }
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgsV0, reserved))); }
};
static_assert(std::alignment_of_v<ImplicitArgsV0> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgsV0) == (32 * sizeof(uint32_t)));
static_assert(ImplicitArgsV0::getSize() == (28 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV0>);
// Version 1 layout of the implicit args structure.
// The field order and widths are a fixed binary layout: the static asserts that
// follow this struct pin sizeof() to 40 dwords and getSize() to 28 dwords, so
// fields must not be reordered or resized.
struct alignas(32) ImplicitArgsV1 {
// Common header; holds the structure size and version (1 for this layout).
ImplicitArgsHeader header;
uint8_t numWorkDim;
// Occupies the byte that carries simdWidth in ImplicitArgsV0.
uint8_t padding0;
uint32_t localSizeX;
uint32_t localSizeY;
uint32_t localSizeZ;
uint64_t globalSizeX;
uint64_t globalSizeY;
uint64_t globalSizeZ;
uint64_t printfBufferPtr;
uint64_t globalOffsetX;
uint64_t globalOffsetY;
uint64_t globalOffsetZ;
uint64_t localIdTablePtr;
uint32_t groupCountX;
uint32_t groupCountY;
uint32_t groupCountZ;
// Keeps the following 64-bit pointer fields 8-byte aligned.
uint32_t padding1;
uint64_t rtGlobalBufferPtr;
uint64_t assertBufferPtr;
// Tail bytes that are not counted by getSize().
uint8_t reserved[44];
// Returns the size of the structure up to, but excluding, the reserved tail.
static constexpr uint8_t getSize() { return static_cast<uint8_t>(offsetof(ImplicitArgsV1, reserved)); }
};
static_assert(std::alignment_of_v<ImplicitArgsV1> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgsV1) == (40 * sizeof(uint32_t)));
static_assert(ImplicitArgsV1::getSize() == (28 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV1>);
// Version-agnostic wrapper over the versioned implicit args layouts.
// Both layouts start with the same ImplicitArgsHeader, so the version stored in
// the header selects which union member every accessor reads or writes.
// Accessors do nothing (or return an empty/zero result) for unknown versions.
struct alignas(32) ImplicitArgs {
    union {
        ImplicitArgsV0 v0;
        ImplicitArgsV1 v1;
    };

    // Writes the size and version into the header for the requested layout.
    // Versions other than 0 and 1 leave the structure untouched.
    void initializeHeader(uint32_t version) {
        switch (version) {
        case 0:
            v0.header.structSize = ImplicitArgsV0::getSize();
            v0.header.structVersion = 0;
            break;
        case 1:
            v1.header.structSize = NEO::ImplicitArgsV1::getSize();
            v1.header.structVersion = 1;
            break;
        default:
            break;
        }
    }

    // Returns the size recorded in the header; 0 when the version is not recognised.
    uint8_t getSize() const {
        switch (v0.header.structVersion) {
        case 0:
            return v0.header.structSize;
        case 1:
            return v1.header.structSize;
        default:
            break;
        }
        DEBUG_BREAK_IF(true);
        return 0;
    }

    void setNumWorkDim(uint32_t numWorkDim) {
        switch (v0.header.structVersion) {
        case 0:
            v0.numWorkDim = static_cast<uint8_t>(numWorkDim);
            break;
        case 1:
            v1.numWorkDim = static_cast<uint8_t>(numWorkDim);
            break;
        default:
            break;
        }
    }

    // SIMD width exists only in the version 0 layout; ignored otherwise.
    void setSimdWidth(uint32_t simd) {
        if (v0.header.structVersion != 0) {
            return;
        }
        v0.simdWidth = static_cast<uint8_t>(simd);
    }

    // Returns the stored SIMD width for version 0, std::nullopt for any other version.
    std::optional<uint32_t> getSimdWidth() const {
        if (v0.header.structVersion != 0) {
            return std::nullopt;
        }
        return v0.simdWidth;
    }

    void setLocalSize(uint32_t x, uint32_t y, uint32_t z) {
        switch (v0.header.structVersion) {
        case 0:
            v0.localSizeX = x;
            v0.localSizeY = y;
            v0.localSizeZ = z;
            break;
        case 1:
            v1.localSizeX = x;
            v1.localSizeY = y;
            v1.localSizeZ = z;
            break;
        default:
            break;
        }
    }

    // Output parameters are left unmodified when the version is not recognised.
    void getLocalSize(uint32_t &x, uint32_t &y, uint32_t &z) const {
        switch (v0.header.structVersion) {
        case 0:
            x = v0.localSizeX;
            y = v0.localSizeY;
            z = v0.localSizeZ;
            break;
        case 1:
            x = v1.localSizeX;
            y = v1.localSizeY;
            z = v1.localSizeZ;
            break;
        default:
            break;
        }
    }

    void setGlobalSize(uint32_t x, uint32_t y, uint32_t z) {
        switch (v0.header.structVersion) {
        case 0:
            v0.globalSizeX = x;
            v0.globalSizeY = y;
            v0.globalSizeZ = z;
            break;
        case 1:
            v1.globalSizeX = x;
            v1.globalSizeY = y;
            v1.globalSizeZ = z;
            break;
        default:
            break;
        }
    }

    void setGlobalOffset(uint32_t x, uint32_t y, uint32_t z) {
        switch (v0.header.structVersion) {
        case 0:
            v0.globalOffsetX = x;
            v0.globalOffsetY = y;
            v0.globalOffsetZ = z;
            break;
        case 1:
            v1.globalOffsetX = x;
            v1.globalOffsetY = y;
            v1.globalOffsetZ = z;
            break;
        default:
            break;
        }
    }

    void setGroupCount(uint32_t x, uint32_t y, uint32_t z) {
        switch (v0.header.structVersion) {
        case 0:
            v0.groupCountX = x;
            v0.groupCountY = y;
            v0.groupCountZ = z;
            break;
        case 1:
            v1.groupCountX = x;
            v1.groupCountY = y;
            v1.groupCountZ = z;
            break;
        default:
            break;
        }
    }

    void setLocalIdTablePtr(uint64_t address) {
        switch (v0.header.structVersion) {
        case 0:
            v0.localIdTablePtr = address;
            break;
        case 1:
            v1.localIdTablePtr = address;
            break;
        default:
            break;
        }
    }

    void setPrintfBuffer(uint64_t address) {
        switch (v0.header.structVersion) {
        case 0:
            v0.printfBufferPtr = address;
            break;
        case 1:
            v1.printfBufferPtr = address;
            break;
        default:
            break;
        }
    }

    void setRtGlobalBufferPtr(uint64_t address) {
        switch (v0.header.structVersion) {
        case 0:
            v0.rtGlobalBufferPtr = address;
            break;
        case 1:
            v1.rtGlobalBufferPtr = address;
            break;
        default:
            break;
        }
    }

    void setAssertBufferPtr(uint64_t address) {
        switch (v0.header.structVersion) {
        case 0:
            v0.assertBufferPtr = address;
            break;
        case 1:
            v1.assertBufferPtr = address;
            break;
        default:
            break;
        }
    }
};
static_assert(std::alignment_of_v<ImplicitArgs> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgs) == (32 * sizeof(uint32_t)));
static_assert(ImplicitArgs::getSize() == (28 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgs>);
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2024 Intel Corporation
* Copyright (C) 2022-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -48,7 +48,8 @@ uint32_t getSizeForImplicitArgsStruct(const ImplicitArgs *pImplicitArgs, const K
if (!pImplicitArgs) {
return 0;
}
auto implicitArgsSize = static_cast<uint32_t>(ImplicitArgs::getSize());
auto implicitArgsSize = pImplicitArgs->getSize();
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
if (patchImplicitArgsBufferInCrossThread) {
return alignUp(implicitArgsSize, MemoryConstants::cacheLineSize);
@@ -65,10 +66,19 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
uint32_t localIdsSize = 0;
if (false == patchImplicitArgsBufferInCrossThread) {
auto simdSize = pImplicitArgs->simdWidth;
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
auto simdSize = 32u;
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize);
Vec3<size_t> localWorkSize = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
uint32_t lws[3] = {0, 0, 0};
pImplicitArgs->getLocalSize(lws[0], lws[1], lws[2]);
Vec3<size_t> localWorkSize = {lws[0], lws[1], lws[2]};
if (pImplicitArgs->v0.header.structVersion == 0) {
simdSize = pImplicitArgs->v0.simdWidth;
grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize);
}
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
localIdsSize = static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment));
localIdsSize = alignUp(localIdsSize, MemoryConstants::cacheLineSize);
@@ -81,9 +91,14 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, rootDeviceEnvironment);
auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);
auto size = implicitArgs.v0.header.structSize;
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
if (!patchImplicitArgsBufferInCrossThread) {
auto simdSize = implicitArgs.simdWidth;
uint32_t lws[3] = {0, 0, 0};
implicitArgs.getLocalSize(lws[0], lws[1], lws[2]);
auto simdSize = implicitArgs.getSimdWidth().value_or(32);
auto grfSize = getGrfSize(simdSize);
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
auto dimensionOrder = getDimensionOrderForLocalIds(kernelDescriptor.kernelAttributes.workgroupDimensionsOrder, hwGenerationOfLocalIdsParams);
@@ -91,12 +106,13 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
NEO::generateLocalIDs(
ptrToPatch,
simdSize,
std::array<uint16_t, 3>{{static_cast<uint16_t>(implicitArgs.localSizeX),
static_cast<uint16_t>(implicitArgs.localSizeY),
static_cast<uint16_t>(implicitArgs.localSizeZ)}},
std::array<uint16_t, 3>{{static_cast<uint16_t>(lws[0]),
static_cast<uint16_t>(lws[1]),
static_cast<uint16_t>(lws[2])}},
dimensionOrder,
false, grfSize, grfCount, rootDeviceEnvironment);
auto sizeForLocalIdsProgramming = totalSizeToProgram - ImplicitArgs::getSize();
auto sizeForLocalIdsProgramming = totalSizeToProgram - implicitArgs.getSize();
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
}
@@ -104,7 +120,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
*outImplicitArgsAddress = ptrToPatch;
}
memcpy_s(ptrToPatch, ImplicitArgs::getSize(), &implicitArgs, ImplicitArgs::getSize());
memcpy_s(ptrToPatch, size, &implicitArgs, size);
return retVal;
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -18,8 +18,10 @@
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/default_hw_info.h"
#include "shared/test/common/helpers/gtest_helpers.h"
#include "shared/test/common/helpers/raii_gfx_core_helper.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/mocks/mock_elf.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
#include "shared/test/common/mocks/mock_modules_zebin.h"
#include "shared/test/common/mocks/ult_device_factory.h"
@@ -2080,12 +2082,138 @@ TEST_F(LinkerTests, givenImplicitArgRelocationAndStackCallsThenPatchRelocationWi
EXPECT_EQ(0U, relocatedSymbols.size());
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
EXPECT_EQ(ImplicitArgs::getSize(), *addressToPatch);
EXPECT_EQ(ImplicitArgsV0::getSize(), *addressToPatch);
EXPECT_EQ(initData, *(addressToPatch - 1));
EXPECT_EQ(initData, *(addressToPatch + 1));
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
}
// Links one implicit-args relocation on a device whose GfxCoreHelper reports
// implicit args version 1 and expects ImplicitArgsV1::getSize() to be written at
// the relocation offset, with the neighbouring dwords left untouched.
HWTEST_F(LinkerTests, givenImplicitArgRelocationAndImplicitArgsV1WhenLinkingThenPatchRelocationWithSizeOfImplicitArgsV1) {
    DebugManagerStateRestore restore;

    // Core helper override that forces implicit args version 1.
    struct MockGfxCoreHelper : NEO::GfxCoreHelperHw<FamilyType> {
        uint32_t getImplicitArgsVersion() const override {
            return 1;
        }
    };

    NEO::LinkerInput linkerInput;

    // Single 32-bit relocation against the implicit args symbol.
    vISA::GenRelocEntry reloc = {};
    std::string symbolName = implicitArgsRelocationSymbolName;
    memcpy_s(reloc.r_symbol, 1024, symbolName.c_str(), symbolName.size());
    reloc.r_offset = 8;
    reloc.r_type = vISA::GenRelocType::R_SYM_ADDR_32;

    vISA::GenRelocEntry relocs[] = {reloc};
    constexpr uint32_t numRelocations = 1;
    EXPECT_TRUE(linkerInput.decodeRelocationTable(&relocs, numRelocations, 0));

    NEO::Linker linker(linkerInput);

    NEO::Linker::SegmentInfo globalVarSegment;
    NEO::Linker::SegmentInfo globalConstSegment;
    NEO::Linker::SegmentInfo exportedFuncSegment;
    globalVarSegment.gpuAddress = 8;
    globalVarSegment.segmentSize = 64;
    globalConstSegment.gpuAddress = 128;
    globalConstSegment.segmentSize = 256;
    exportedFuncSegment.gpuAddress = 4096;
    exportedFuncSegment.segmentSize = 1024;

    NEO::Linker::UnresolvedExternals unresolvedExternals;
    NEO::Linker::KernelDescriptorsT kernelDescriptors;
    NEO::Linker::ExternalFunctionsT externalFunctions;

    KernelDescriptor kernelDescriptor;
    kernelDescriptors.push_back(&kernelDescriptor);
    kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
    kernelDescriptor.kernelAttributes.flags.useStackCalls = true;

    HardwareInfo hwInfo = *defaultHwInfo;
    MockExecutionEnvironment executionEnvironment(&hwInfo, false, 1);
    executionEnvironment.incRefInternal();
    UltDeviceFactory deviceFactory{1, 0, executionEnvironment};

    // Install the mock helper in the environment of the device used for linking.
    auto device = deviceFactory.rootDevices[0];
    auto rootDeviceIndex = device->getRootDeviceIndex();
    auto &rootDeviceEnvironment = *device->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex];
    RAIIGfxCoreHelperFactory<MockGfxCoreHelper> raii(rootDeviceEnvironment);

    // Instruction bytes pre-filled with a pattern so untouched dwords can be detected.
    uint32_t initData = 0x77777777;
    std::vector<char> instructionSegment;
    instructionSegment.resize(32, static_cast<char>(initData));

    NEO::Linker::PatchableSegment seg0;
    seg0.hostPointer = instructionSegment.data();
    seg0.segmentSize = instructionSegment.size();
    NEO::Linker::PatchableSegments patchableInstructionSegments{seg0};

    auto linkResult = linker.link(globalVarSegment, globalConstSegment, exportedFuncSegment, {},
                                  nullptr, nullptr, patchableInstructionSegments, unresolvedExternals,
                                  device, nullptr, 0, nullptr, 0, kernelDescriptors, externalFunctions);
    EXPECT_EQ(NEO::LinkingStatus::linkedFully, linkResult);

    auto patchedDword = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
    EXPECT_EQ(ImplicitArgsV1::getSize(), *patchedDword);
    EXPECT_EQ(initData, *(patchedDword - 1));
    EXPECT_EQ(initData, *(patchedDword + 1));
    EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
}
// Links one implicit-args relocation on a device whose GfxCoreHelper reports an
// implicit args version the linker does not know and expects linking to throw.
HWTEST_F(LinkerTests, givenImplicitArgRelocationAndImplicitArgsWithUnknownVersionWhenLinkingThenUnrecoverableIfCalled) {
    DebugManagerStateRestore restore;

    // Core helper override that reports a version with no matching layout.
    struct MockGfxCoreHelper : NEO::GfxCoreHelperHw<FamilyType> {
        uint32_t getImplicitArgsVersion() const override {
            return 2; // unknown version
        }
    };

    NEO::LinkerInput linkerInput;

    // Single 32-bit relocation against the implicit args symbol.
    vISA::GenRelocEntry reloc = {};
    std::string symbolName = implicitArgsRelocationSymbolName;
    memcpy_s(reloc.r_symbol, 1024, symbolName.c_str(), symbolName.size());
    reloc.r_offset = 8;
    reloc.r_type = vISA::GenRelocType::R_SYM_ADDR_32;

    vISA::GenRelocEntry relocs[] = {reloc};
    constexpr uint32_t numRelocations = 1;
    EXPECT_TRUE(linkerInput.decodeRelocationTable(&relocs, numRelocations, 0));

    NEO::Linker linker(linkerInput);

    NEO::Linker::SegmentInfo globalVarSegment;
    NEO::Linker::SegmentInfo globalConstSegment;
    NEO::Linker::SegmentInfo exportedFuncSegment;
    globalVarSegment.gpuAddress = 8;
    globalVarSegment.segmentSize = 64;
    globalConstSegment.gpuAddress = 128;
    globalConstSegment.segmentSize = 256;
    exportedFuncSegment.gpuAddress = 4096;
    exportedFuncSegment.segmentSize = 1024;

    NEO::Linker::UnresolvedExternals unresolvedExternals;
    NEO::Linker::KernelDescriptorsT kernelDescriptors;
    NEO::Linker::ExternalFunctionsT externalFunctions;

    KernelDescriptor kernelDescriptor;
    kernelDescriptors.push_back(&kernelDescriptor);
    kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
    kernelDescriptor.kernelAttributes.flags.useStackCalls = true;

    HardwareInfo hwInfo = *defaultHwInfo;
    MockExecutionEnvironment executionEnvironment(&hwInfo, false, 1);
    executionEnvironment.incRefInternal();
    UltDeviceFactory deviceFactory{1, 0, executionEnvironment};

    // Install the mock helper in the environment of the device used for linking.
    auto device = deviceFactory.rootDevices[0];
    auto rootDeviceIndex = device->getRootDeviceIndex();
    auto &rootDeviceEnvironment = *device->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex];
    RAIIGfxCoreHelperFactory<MockGfxCoreHelper> raii(rootDeviceEnvironment);

    uint32_t initData = 0x77777777;
    std::vector<char> instructionSegment;
    instructionSegment.resize(32, static_cast<char>(initData));

    NEO::Linker::PatchableSegment seg0;
    seg0.hostPointer = instructionSegment.data();
    seg0.segmentSize = instructionSegment.size();
    NEO::Linker::PatchableSegments patchableInstructionSegments{seg0};

    EXPECT_THROW(linker.link(globalVarSegment, globalConstSegment, exportedFuncSegment, {},
                             nullptr, nullptr, patchableInstructionSegments, unresolvedExternals,
                             device, nullptr, 0, nullptr, 0, kernelDescriptors, externalFunctions),
                 std::exception);
}
using LinkerDebuggingSupportedTests = ::testing::Test;
TEST_F(LinkerDebuggingSupportedTests, givenImplicitArgRelocationAndEnabledDebuggerThenPatchRelocationWithSizeOfImplicitArgStructAndUpdateKernelDescriptor) {
@@ -2142,7 +2270,7 @@ TEST_F(LinkerDebuggingSupportedTests, givenImplicitArgRelocationAndEnabledDebugg
EXPECT_EQ(0U, relocatedSymbols.size());
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
EXPECT_EQ(ImplicitArgs::getSize(), *addressToPatch);
EXPECT_EQ(ImplicitArgsV0::getSize(), *addressToPatch);
EXPECT_EQ(initData, *(addressToPatch - 1));
EXPECT_EQ(initData, *(addressToPatch + 1));
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
@@ -2349,7 +2477,7 @@ TEST_F(LinkerTests, givenMultipleImplicitArgsRelocationsWithinSingleKernelWhenLi
for (const auto &reloc : relocs) {
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
EXPECT_EQ(ImplicitArgs::getSize(), *addressToPatch);
EXPECT_EQ(ImplicitArgsV0::getSize(), *addressToPatch);
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
}
}

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2022-2024 Intel Corporation
* Copyright (C) 2022-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -64,43 +64,49 @@ TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsPr
}
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
ImplicitArgs implicitArgs{ImplicitArgs::getSize()};
ImplicitArgs implicitArgs{};
implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
implicitArgs.v0.header.structVersion = 0;
KernelDescriptor kernelDescriptor{};
EXPECT_TRUE(isUndefinedOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));
implicitArgs.simdWidth = 32;
implicitArgs.localSizeX = 2;
implicitArgs.localSizeY = 3;
implicitArgs.localSizeZ = 4;
implicitArgs.v0.simdWidth = 32;
implicitArgs.v0.localSizeX = 2;
implicitArgs.v0.localSizeY = 3;
implicitArgs.v0.localSizeZ = 4;
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
auto totalWorkgroupSize = implicitArgs.v0.localSizeX * implicitArgs.v0.localSizeY * implicitArgs.v0.localSizeZ;
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
EXPECT_EQ(localIdsSize + ImplicitArgs::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
EXPECT_EQ(localIdsSize + ImplicitArgsV0::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
}
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
ImplicitArgs implicitArgs{ImplicitArgs::getSize()};
ImplicitArgs implicitArgs{};
implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
implicitArgs.v0.header.structVersion = 0;
KernelDescriptor kernelDescriptor{};
kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer = 0x10;
EXPECT_TRUE(isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));
implicitArgs.simdWidth = 32;
implicitArgs.localSizeX = 2;
implicitArgs.localSizeY = 3;
implicitArgs.localSizeZ = 4;
implicitArgs.v0.simdWidth = 32;
implicitArgs.v0.localSizeX = 2;
implicitArgs.v0.localSizeY = 3;
implicitArgs.v0.localSizeZ = 4;
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
EXPECT_EQ(alignUp(implicitArgs.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
}
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
ImplicitArgs implicitArgs{ImplicitArgs::getSize()};
ImplicitArgs implicitArgs{};
implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
implicitArgs.v0.header.structVersion = 0;
void *outImplicitArgs = nullptr;
KernelDescriptor kernelDescriptor{};
@@ -110,15 +116,15 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
EXPECT_TRUE(isUndefinedOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));
implicitArgs.simdWidth = 1;
implicitArgs.localSizeX = 2;
implicitArgs.localSizeY = 3;
implicitArgs.localSizeZ = 4;
implicitArgs.v0.simdWidth = 1;
implicitArgs.v0.localSizeX = 2;
implicitArgs.v0.localSizeY = 3;
implicitArgs.v0.localSizeZ = 4;
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
auto totalWorkgroupSize = implicitArgs.v0.localSizeX * implicitArgs.v0.localSizeY * implicitArgs.v0.localSizeZ;
auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t);
auto localIdsOffset = alignUp(localIdsPatchingSize, MemoryConstants::cacheLineSize);
@@ -141,7 +147,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
EXPECT_NE(pattern, memoryToPatch.get()[offset]) << offset;
}
for (; offset < totalSizeForPatching - ImplicitArgs::getSize(); offset++) {
for (; offset < totalSizeForPatching - ImplicitArgsV0::getSize(); offset++) {
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
}
@@ -151,21 +157,24 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
}
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
ImplicitArgs implicitArgs{ImplicitArgs::getSize()};
ImplicitArgs implicitArgs{};
implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
implicitArgs.v0.header.structVersion = 0;
void *outImplicitArgs = nullptr;
KernelDescriptor kernelDescriptor{};
kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer = 0x10;
EXPECT_TRUE(isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));
implicitArgs.simdWidth = 32;
implicitArgs.localSizeX = 2;
implicitArgs.localSizeY = 3;
implicitArgs.localSizeZ = 4;
implicitArgs.v0.simdWidth = 32;
implicitArgs.v0.localSizeX = 2;
implicitArgs.v0.localSizeY = 3;
implicitArgs.v0.localSizeZ = 4;
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
EXPECT_EQ(alignUp(ImplicitArgs::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
EXPECT_EQ(alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
auto memoryToPatch = std::make_unique<uint8_t[]>(totalSizeForPatching);
@@ -182,7 +191,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
uint32_t offset = 0;
for (; offset < ImplicitArgs::getSize(); offset++) {
for (; offset < ImplicitArgsV0::getSize(); offset++) {
EXPECT_NE(pattern, memoryToPatch.get()[offset]);
}
@@ -190,3 +199,132 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
}
}
// Verifies that, with a version-0 header, the generic ImplicitArgs setters
// write into the corresponding fields of the v0 layout and that getSize()
// reports the v0 size.
TEST(ImplicitArgsV0Test, givenImplicitArgsV0WhenSettingFieldsThenCorrectFieldsAreSet) {
    ImplicitArgs implicitArgs{};
    implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
    implicitArgs.v0.header.structVersion = 0;

    EXPECT_EQ(ImplicitArgsV0::getSize(), implicitArgs.getSize());

    // Populate every field through the version-agnostic setters.
    implicitArgs.setAssertBufferPtr(0x4567000);
    implicitArgs.setGlobalOffset(5, 6, 7);
    implicitArgs.setGlobalSize(1, 2, 3);
    implicitArgs.setGroupCount(10, 20, 30);
    implicitArgs.setLocalSize(8, 9, 11);
    implicitArgs.setLocalIdTablePtr(0x5699000);
    implicitArgs.setPrintfBuffer(0xff000);
    implicitArgs.setNumWorkDim(16);
    implicitArgs.setRtGlobalBufferPtr(0x1000123400);
    implicitArgs.setSimdWidth(32);

    // Read the values back directly from the v0 view.
    EXPECT_EQ(0x4567000u, implicitArgs.v0.assertBufferPtr);

    EXPECT_EQ(5u, implicitArgs.v0.globalOffsetX);
    EXPECT_EQ(6u, implicitArgs.v0.globalOffsetY);
    EXPECT_EQ(7u, implicitArgs.v0.globalOffsetZ);

    EXPECT_EQ(1u, implicitArgs.v0.globalSizeX);
    EXPECT_EQ(2u, implicitArgs.v0.globalSizeY);
    EXPECT_EQ(3u, implicitArgs.v0.globalSizeZ);

    EXPECT_EQ(10u, implicitArgs.v0.groupCountX);
    EXPECT_EQ(20u, implicitArgs.v0.groupCountY);
    EXPECT_EQ(30u, implicitArgs.v0.groupCountZ);

    EXPECT_EQ(8u, implicitArgs.v0.localSizeX);
    EXPECT_EQ(9u, implicitArgs.v0.localSizeY);
    EXPECT_EQ(11u, implicitArgs.v0.localSizeZ);

    EXPECT_EQ(0x5699000u, implicitArgs.v0.localIdTablePtr);
    EXPECT_EQ(0xff000u, implicitArgs.v0.printfBufferPtr);
    EXPECT_EQ(16u, implicitArgs.v0.numWorkDim);
    EXPECT_EQ(0x1000123400u, implicitArgs.v0.rtGlobalBufferPtr);

    // setSimdWidth() was called above; make sure its effect on v0 is covered too.
    EXPECT_EQ(32u, implicitArgs.v0.simdWidth);
}
// Version-1 counterpart of the setter round-trip check: the header is marked
// as version 1 and the values written through the generic setters are read
// back from the v1 view. setSimdWidth() is invoked as well, but this test
// places no expectation on its result.
TEST(ImplicitArgsV1Test, givenImplicitArgsV1WhenSettingFieldsThenCorrectFieldsAreSet) {
    ImplicitArgs args{};
    args.v1.header.structSize = ImplicitArgsV1::getSize();
    args.v1.header.structVersion = 1;

    EXPECT_EQ(ImplicitArgsV1::getSize(), args.getSize());

    // Write phase: all setters run before anything is read back.
    args.setAssertBufferPtr(0x4567000);
    args.setGlobalOffset(5, 6, 7);
    args.setGlobalSize(1, 2, 3);
    args.setGroupCount(10, 20, 30);
    args.setLocalSize(8, 9, 11);
    args.setLocalIdTablePtr(0x5699000);
    args.setPrintfBuffer(0xff000);
    args.setNumWorkDim(16);
    args.setRtGlobalBufferPtr(0x1000123400);
    args.setSimdWidth(32);

    // Read phase: scalar fields.
    EXPECT_EQ(16u, args.v1.numWorkDim);

    // Read phase: three-component fields.
    EXPECT_EQ(8u, args.v1.localSizeX);
    EXPECT_EQ(9u, args.v1.localSizeY);
    EXPECT_EQ(11u, args.v1.localSizeZ);

    EXPECT_EQ(1u, args.v1.globalSizeX);
    EXPECT_EQ(2u, args.v1.globalSizeY);
    EXPECT_EQ(3u, args.v1.globalSizeZ);

    EXPECT_EQ(5u, args.v1.globalOffsetX);
    EXPECT_EQ(6u, args.v1.globalOffsetY);
    EXPECT_EQ(7u, args.v1.globalOffsetZ);

    EXPECT_EQ(10u, args.v1.groupCountX);
    EXPECT_EQ(20u, args.v1.groupCountY);
    EXPECT_EQ(30u, args.v1.groupCountZ);

    // Read phase: pointer-valued fields.
    EXPECT_EQ(0xff000u, args.v1.printfBufferPtr);
    EXPECT_EQ(0x5699000u, args.v1.localIdTablePtr);
    EXPECT_EQ(0x1000123400u, args.v1.rtGlobalBufferPtr);
    EXPECT_EQ(0x4567000u, args.v1.assertBufferPtr);
}
// With a structVersion the test treats as unknown (2), getSize() is expected
// to return 0 and none of the setters is expected to modify the storage: every
// field inspected through the v1 view must still be zero afterwards.
TEST(ImplicitArgsV1Test, givenImplicitArgsWithUnknownVersionWhenSettingFieldsThenFieldsAreNotPopulated) {
    ImplicitArgs args{};
    // Explicitly clear the whole object so the "still zero" checks below do not
    // depend on what `{}` initialization covers.
    memset(&args, 0, sizeof(args));
    args.v1.header.structSize = ImplicitArgsV1::getSize();
    args.v1.header.structVersion = 2; // unknown version

    EXPECT_EQ(0u, args.getSize());

    // Attempt to populate every field; all of these calls should be no-ops.
    args.setAssertBufferPtr(0x4567000);
    args.setGlobalOffset(5, 6, 7);
    args.setGlobalSize(1, 2, 3);
    args.setGroupCount(10, 20, 30);
    args.setLocalSize(8, 9, 11);
    args.setLocalIdTablePtr(0x5699000);
    args.setPrintfBuffer(0xff000);
    args.setNumWorkDim(16);
    args.setRtGlobalBufferPtr(0x1000123400);
    args.setSimdWidth(32);

    // Scalar field.
    EXPECT_EQ(0u, args.v1.numWorkDim);

    // Three-component fields.
    EXPECT_EQ(0u, args.v1.localSizeX);
    EXPECT_EQ(0u, args.v1.localSizeY);
    EXPECT_EQ(0u, args.v1.localSizeZ);

    EXPECT_EQ(0u, args.v1.globalSizeX);
    EXPECT_EQ(0u, args.v1.globalSizeY);
    EXPECT_EQ(0u, args.v1.globalSizeZ);

    EXPECT_EQ(0u, args.v1.globalOffsetX);
    EXPECT_EQ(0u, args.v1.globalOffsetY);
    EXPECT_EQ(0u, args.v1.globalOffsetZ);

    EXPECT_EQ(0u, args.v1.groupCountX);
    EXPECT_EQ(0u, args.v1.groupCountY);
    EXPECT_EQ(0u, args.v1.groupCountZ);

    // Pointer-valued fields.
    EXPECT_EQ(0u, args.v1.printfBufferPtr);
    EXPECT_EQ(0u, args.v1.localIdTablePtr);
    EXPECT_EQ(0u, args.v1.rtGlobalBufferPtr);
    EXPECT_EQ(0u, args.v1.assertBufferPtr);
}