mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-19 15:20:36 +08:00
feature: add support for ImplicitArgs versioning
- define ImplicitArgs version 1 Related-To: NEO-14115 Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
beb859a53a
commit
291745cdf7
@@ -250,7 +250,11 @@ ze_result_t KernelImp::getBaseAddress(uint64_t *baseAddress) {
|
||||
return ZE_RESULT_SUCCESS;
|
||||
}
|
||||
|
||||
KernelImp::KernelImp(Module *module) : module(module) {}
|
||||
KernelImp::KernelImp(Module *module) : module(module) {
|
||||
if (module) {
|
||||
this->implicitArgsVersion = module->getDevice()->getGfxCoreHelper().getImplicitArgsVersion();
|
||||
}
|
||||
}
|
||||
|
||||
KernelImp::~KernelImp() {
|
||||
if (nullptr != privateMemoryGraphicsAllocation) {
|
||||
@@ -321,15 +325,9 @@ void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32
|
||||
}
|
||||
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->numWorkDim = workDim;
|
||||
|
||||
pImplicitArgs->globalSizeX = globalWorkSize[0];
|
||||
pImplicitArgs->globalSizeY = globalWorkSize[1];
|
||||
pImplicitArgs->globalSizeZ = globalWorkSize[2];
|
||||
|
||||
pImplicitArgs->groupCountX = groupCount[0];
|
||||
pImplicitArgs->groupCountY = groupCount[1];
|
||||
pImplicitArgs->groupCountZ = groupCount[2];
|
||||
pImplicitArgs->setNumWorkDim(workDim);
|
||||
pImplicitArgs->setGlobalSize(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);
|
||||
pImplicitArgs->setGroupCount(groupCount[0], groupCount[1], groupCount[2]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1128,9 +1126,8 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
|
||||
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
|
||||
pImplicitArgs = std::make_unique<NEO::ImplicitArgs>();
|
||||
*pImplicitArgs = {};
|
||||
pImplicitArgs->structSize = NEO::ImplicitArgs::getSize();
|
||||
pImplicitArgs->structVersion = 0;
|
||||
pImplicitArgs->simdWidth = kernelDescriptor.kernelAttributes.simdSize;
|
||||
pImplicitArgs->initializeHeader(this->implicitArgsVersion);
|
||||
pImplicitArgs->setSimdWidth(kernelDescriptor.kernelAttributes.simdSize);
|
||||
}
|
||||
|
||||
if (kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] > 0) {
|
||||
@@ -1209,7 +1206,7 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
|
||||
static_cast<uintptr_t>(address));
|
||||
}
|
||||
if (this->pImplicitArgs) {
|
||||
pImplicitArgs->rtGlobalBufferPtr = address;
|
||||
pImplicitArgs->setRtGlobalBufferPtr(address);
|
||||
}
|
||||
|
||||
this->internalResidencyContainer.push_back(rtDispatchGlobalsInfo->rtDispatchGlobalsArray);
|
||||
@@ -1228,7 +1225,7 @@ void KernelImp::createPrintfBuffer() {
|
||||
static_cast<uintptr_t>(this->printfBuffer->getGpuAddressToPatch()));
|
||||
}
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->printfBufferPtr = printfBuffer->getGpuAddress();
|
||||
pImplicitArgs->setPrintfBuffer(printfBuffer->getGpuAddress());
|
||||
}
|
||||
this->devicePrintfKernelMutex = &(static_cast<DeviceImp *>(this->module->getDevice())->printfKernelMutex);
|
||||
}
|
||||
@@ -1298,9 +1295,7 @@ void KernelImp::patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint
|
||||
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize2, workgroupSize);
|
||||
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.enqueuedLocalWorkSize, workgroupSize);
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->localSizeX = x;
|
||||
pImplicitArgs->localSizeY = y;
|
||||
pImplicitArgs->localSizeZ = z;
|
||||
pImplicitArgs->setLocalSize(x, y, z);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1319,9 +1314,7 @@ void KernelImp::patchGlobalOffset() {
|
||||
auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
|
||||
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkOffset, this->globalOffsets);
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->globalOffsetX = globalOffsets[0];
|
||||
pImplicitArgs->globalOffsetY = globalOffsets[1];
|
||||
pImplicitArgs->globalOffsetZ = globalOffsets[2];
|
||||
pImplicitArgs->setGlobalOffset(globalOffsets[0], globalOffsets[1], globalOffsets[2]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1392,7 +1385,7 @@ void KernelImp::setAssertBuffer() {
|
||||
this->internalResidencyContainer.push_back(assertHandler->getAssertBuffer());
|
||||
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->assertBufferPtr = static_cast<uintptr_t>(assertHandler->getAssertBuffer()->getGpuAddressToPatch());
|
||||
pImplicitArgs->setAssertBufferPtr(static_cast<uintptr_t>(assertHandler->getAssertBuffer()->getGpuAddressToPatch()));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -252,6 +252,7 @@ struct KernelImp : Kernel {
|
||||
|
||||
const KernelImmutableData *kernelImmData = nullptr;
|
||||
Module *module = nullptr;
|
||||
uint32_t implicitArgsVersion = 0;
|
||||
|
||||
typedef ze_result_t (KernelImp::*KernelArgHandler)(uint32_t argIndex, size_t argSize, const void *argVal);
|
||||
std::vector<KernelArgInfo> kernelArgInfos;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2024 Intel Corporation
|
||||
* Copyright (C) 2020-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -98,6 +98,7 @@ struct ModuleImmutableDataFixture : public DeviceFixture {
|
||||
using KernelImp::unifiedMemoryControls;
|
||||
|
||||
MockKernel(MockModule *mockModule) : WhiteBox<L0::KernelImp>(mockModule) {
|
||||
implicitArgsVersion = 0;
|
||||
}
|
||||
void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2020-2024 Intel Corporation
|
||||
* Copyright (C) 2020-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -137,7 +137,11 @@ TEST(KernelAssert, GivenKernelWithAssertAndImplicitArgsWhenInitializingKernelThe
|
||||
auto assertBufferAddress = assertHandler->getAssertBuffer()->getGpuAddressToPatch();
|
||||
auto implicitArgs = kernel.getImplicitArgs();
|
||||
ASSERT_NE(nullptr, implicitArgs);
|
||||
EXPECT_EQ(assertBufferAddress, implicitArgs->assertBufferPtr);
|
||||
if (implicitArgs->v0.header.structVersion == 0) {
|
||||
EXPECT_EQ(assertBufferAddress, implicitArgs->v0.assertBufferPtr);
|
||||
} else if (implicitArgs->v1.header.structVersion == 1) {
|
||||
EXPECT_EQ(assertBufferAddress, implicitArgs->v1.assertBufferPtr);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(KernelAssert, GivenNoAssertHandlerWhenKernelWithAssertSetsAssertBufferThenAssertHandlerIsCreated) {
|
||||
|
||||
@@ -821,8 +821,10 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
|
||||
|
||||
void SetUp() override {
|
||||
CmdlistAppendLaunchKernelTests::SetUp();
|
||||
memset(&expectedImplicitArgs, 0, sizeof(ImplicitArgs));
|
||||
expectedImplicitArgs.structSize = ImplicitArgs::getSize();
|
||||
memset(&expectedImplicitArgs, 0, sizeof(expectedImplicitArgs));
|
||||
|
||||
expectedImplicitArgs.header.structSize = ImplicitArgsV0::getSize();
|
||||
expectedImplicitArgs.header.structVersion = 0;
|
||||
|
||||
expectedImplicitArgs.numWorkDim = 3;
|
||||
expectedImplicitArgs.simdWidth = 32;
|
||||
@@ -887,7 +889,7 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
|
||||
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&expectedImplicitArgs, *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(reinterpret_cast<const ImplicitArgs *>(&expectedImplicitArgs), *kernelDescriptor, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
|
||||
auto sizePerThreadDataForWholeGroup = kernel->getPerThreadDataSizeForWholeThreadGroup();
|
||||
EXPECT_EQ(indirectHeap->getUsed(), alignUp(sizeCrossThreadData + sizePerThreadDataForWholeGroup + implicitArgsProgrammingSize, NEO::EncodeDispatchKernel<FamilyType>::getDefaultIOHAlignment()));
|
||||
@@ -899,7 +901,7 @@ struct CmdlistAppendLaunchKernelWithImplicitArgsTests : CmdlistAppendLaunchKerne
|
||||
}
|
||||
std::unique_ptr<L0::CommandList> commandList;
|
||||
GraphicsAllocation *indirectHeapAllocation = nullptr;
|
||||
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
|
||||
ImplicitArgsV0 expectedImplicitArgs = {};
|
||||
std::array<uint8_t, 3> workgroupDimOrder{0, 1, 2};
|
||||
uint32_t implicitArgsProgrammingSize = 0u;
|
||||
|
||||
@@ -919,27 +921,27 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN12LP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, givenPreXeHpPlatformWhenAppendLaunchKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithoutLocalIds) {
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgs::getSize(), 64));
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), 64));
|
||||
|
||||
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
|
||||
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
|
||||
@@ -966,18 +968,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, givenXeHpAndLaterPlatformWhenAppendLaunchKernelWithImplicitArgsAndSimd1ThenLocalIdsAreGeneratedCorrectly) {
|
||||
@@ -999,12 +1001,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
|
||||
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
|
||||
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppendLaunchKernelThenImplicitArgsAreNotSentToIndirectHeap) {
|
||||
|
||||
@@ -938,7 +938,7 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
|
||||
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
return implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
return implicitArgsProgrammingSize - kernel.pImplicitArgs->v0.header.structSize;
|
||||
} else {
|
||||
return 0u;
|
||||
}
|
||||
@@ -956,6 +956,8 @@ HWTEST_F(CommandListAppendLaunchKernelWithImplicitArgs, givenIndirectDispatchWit
|
||||
kernel.module = pMockModule.get();
|
||||
kernel.immutableData.crossThreadDataSize = sizeof(uint64_t);
|
||||
kernel.pImplicitArgs.reset(new ImplicitArgs());
|
||||
kernel.pImplicitArgs->v0.header.structVersion = 0;
|
||||
kernel.pImplicitArgs->v0.header.structSize = ImplicitArgsV0::getSize();
|
||||
UnitTestHelper<FamilyType>::adjustKernelDescriptorForImplicitArgs(*kernel.immutableData.kernelDescriptor);
|
||||
|
||||
kernel.setGroupSize(1, 1, 1);
|
||||
@@ -981,27 +983,27 @@ HWTEST_F(CommandListAppendLaunchKernelWithImplicitArgs, givenIndirectDispatchWit
|
||||
|
||||
auto groupCountXStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
groupCountXStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::gpgpuDispatchDimX);
|
||||
groupCountXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, groupCountX));
|
||||
groupCountXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, groupCountX));
|
||||
|
||||
auto groupCountYStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
groupCountYStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::gpgpuDispatchDimY);
|
||||
groupCountYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, groupCountY));
|
||||
groupCountYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, groupCountY));
|
||||
|
||||
auto groupCountZStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
groupCountZStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::gpgpuDispatchDimZ);
|
||||
groupCountZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, groupCountZ));
|
||||
groupCountZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, groupCountZ));
|
||||
|
||||
auto globalSizeXStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
globalSizeXStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::csGprR1);
|
||||
globalSizeXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, globalSizeX));
|
||||
globalSizeXStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, globalSizeX));
|
||||
|
||||
auto globalSizeYStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
globalSizeYStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::csGprR1);
|
||||
globalSizeYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, globalSizeY));
|
||||
globalSizeYStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, globalSizeY));
|
||||
|
||||
auto globalSizeZStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
|
||||
globalSizeZStoreRegisterMemCmd.setRegisterAddress(RegisterOffsets::csGprR1);
|
||||
globalSizeZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgs, globalSizeZ));
|
||||
globalSizeZStoreRegisterMemCmd.setMemoryAddress(pImplicitArgsGPUVA + offsetof(ImplicitArgsV0, globalSizeZ));
|
||||
|
||||
GenCmdList cmdList;
|
||||
ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer(
|
||||
|
||||
@@ -1238,7 +1238,7 @@ HWTEST_F(KernelImmutableDataTests, whenHasRTCallsIsTrueThenRayTracingIsInitializ
|
||||
EXPECT_NE(nullptr, rtDispatchGlobals);
|
||||
auto implicitArgs = kernel->getImplicitArgs();
|
||||
ASSERT_NE(nullptr, implicitArgs);
|
||||
EXPECT_EQ_VAL(implicitArgs->rtGlobalBufferPtr, rtDispatchGlobals->rtDispatchGlobalsArray->getGpuAddressToPatch());
|
||||
EXPECT_EQ_VAL(implicitArgs->v0.rtGlobalBufferPtr, rtDispatchGlobals->rtDispatchGlobalsArray->getGpuAddressToPatch());
|
||||
|
||||
auto &residencyContainer = kernel->getInternalResidencyContainer();
|
||||
|
||||
@@ -4060,8 +4060,8 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenInitializeThenPrin
|
||||
auto printfSurface = kernel->getPrintfBufferAllocation();
|
||||
ASSERT_NE(nullptr, printfSurface);
|
||||
|
||||
EXPECT_NE(0u, pImplicitArgs->printfBufferPtr);
|
||||
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->printfBufferPtr);
|
||||
EXPECT_NE(0u, pImplicitArgs->v0.printfBufferPtr);
|
||||
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->v0.printfBufferPtr);
|
||||
}
|
||||
|
||||
TEST_F(KernelImplicitArgTests, givenImplicitArgsRequiredWhenCreatingKernelThenImplicitArgsAreCreated) {
|
||||
@@ -4080,8 +4080,8 @@ TEST_F(KernelImplicitArgTests, givenImplicitArgsRequiredWhenCreatingKernelThenIm
|
||||
auto pImplicitArgs = kernel->getImplicitArgs();
|
||||
ASSERT_NE(nullptr, pImplicitArgs);
|
||||
|
||||
EXPECT_EQ(ImplicitArgs::getSize(), pImplicitArgs->structSize);
|
||||
EXPECT_EQ(0u, pImplicitArgs->structVersion);
|
||||
EXPECT_EQ(ImplicitArgsV0::getSize(), pImplicitArgs->v0.header.structSize);
|
||||
EXPECT_EQ(0u, pImplicitArgs->v0.header.structVersion);
|
||||
}
|
||||
|
||||
TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParamsThenImplicitArgsAreUpdated) {
|
||||
@@ -4100,7 +4100,7 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParam
|
||||
auto pImplicitArgs = kernel->getImplicitArgs();
|
||||
ASSERT_NE(nullptr, pImplicitArgs);
|
||||
|
||||
ImplicitArgs expectedImplicitArgs{ImplicitArgs::getSize()};
|
||||
ImplicitArgsV0 expectedImplicitArgs{{ImplicitArgsV0::getSize(), 0}};
|
||||
|
||||
expectedImplicitArgs.numWorkDim = 3;
|
||||
expectedImplicitArgs.simdWidth = simd;
|
||||
@@ -4122,7 +4122,7 @@ TEST_F(KernelImplicitArgTests, givenKernelWithImplicitArgsWhenSettingKernelParam
|
||||
kernel->setGroupCount(3, 2, 1);
|
||||
kernel->setGlobalOffsetExp(1, 2, 3);
|
||||
kernel->patchGlobalOffset();
|
||||
EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(pImplicitArgs, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
using BindlessKernelTest = Test<DeviceFixture>;
|
||||
|
||||
@@ -1928,7 +1928,7 @@ TEST_F(ModuleDynamicLinkTests, givenModuleWithInternalRelocationAndUnresolvedExt
|
||||
|
||||
uint32_t internalRelocationOffset = 0x10;
|
||||
linkerInput->textRelocations.push_back({{implicitArgsRelocationSymbolName, internalRelocationOffset, LinkerInput::RelocationInfo::Type::address, SegmentType::instructions}});
|
||||
uint32_t expectedInternalRelocationValue = ImplicitArgs::getSize();
|
||||
uint32_t expectedInternalRelocationValue = ImplicitArgsV0::getSize();
|
||||
|
||||
uint32_t externalRelocationOffset = 0x20;
|
||||
constexpr auto externalSymbolName = "unresolved";
|
||||
@@ -4795,7 +4795,7 @@ TEST_F(ModuleTests, givenImplicitArgsRelocationAndStackCallsWhenLinkingModuleThe
|
||||
auto status = pModule->linkBinary();
|
||||
EXPECT_TRUE(status);
|
||||
|
||||
EXPECT_EQ(ImplicitArgs::getSize(), *reinterpret_cast<uint32_t *>(ptrOffset(isaCpuPtr, 0x8)));
|
||||
EXPECT_EQ(ImplicitArgsV0::getSize(), *reinterpret_cast<uint32_t *>(ptrOffset(isaCpuPtr, 0x8)));
|
||||
|
||||
EXPECT_TRUE(kernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
|
||||
}
|
||||
|
||||
@@ -69,13 +69,25 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||
|
||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->localIdTablePtr = indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData;
|
||||
size_t localWorkSize[3] = {0u, 0u, 0u};
|
||||
|
||||
pImplicitArgs->setLocalIdTablePtr(indirectHeap.getGraphicsAllocation()->getGpuAddress() + offsetCrossThreadData);
|
||||
if (pImplicitArgs->v0.header.structVersion == 0) {
|
||||
localWorkSize[0] = pImplicitArgs->v0.localSizeX;
|
||||
localWorkSize[1] = pImplicitArgs->v0.localSizeY;
|
||||
localWorkSize[2] = pImplicitArgs->v0.localSizeZ;
|
||||
} else if (pImplicitArgs->v1.header.structVersion == 1) {
|
||||
localWorkSize[0] = pImplicitArgs->v1.localSizeX;
|
||||
localWorkSize[1] = pImplicitArgs->v1.localSizeY;
|
||||
localWorkSize[2] = pImplicitArgs->v1.localSizeZ;
|
||||
} else {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
const auto &kernelDescriptor = kernel.getDescriptor();
|
||||
|
||||
const auto &kernelAttributes = kernelDescriptor.kernelAttributes;
|
||||
uint32_t requiredWalkOrder = 0u;
|
||||
size_t localWorkSize[3] = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
|
||||
|
||||
auto generationOfLocalIdsByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
|
||||
3,
|
||||
localWorkSize,
|
||||
|
||||
@@ -205,9 +205,8 @@ cl_int Kernel::initialize() {
|
||||
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
|
||||
pImplicitArgs = std::make_unique<ImplicitArgs>();
|
||||
*pImplicitArgs = {};
|
||||
pImplicitArgs->structSize = ImplicitArgs::getSize();
|
||||
pImplicitArgs->structVersion = 0;
|
||||
pImplicitArgs->simdWidth = maxSimdSize;
|
||||
pImplicitArgs->initializeHeader(gfxCoreHelper.getImplicitArgsVersion());
|
||||
pImplicitArgs->setSimdWidth(maxSimdSize);
|
||||
}
|
||||
auto ret = KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(kernelDescriptor.kernelAttributes, &pClDevice->getDevice());
|
||||
if (ret == NEO::KernelHelper::ErrorCode::invalidKernel) {
|
||||
@@ -457,7 +456,7 @@ cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
|
||||
}
|
||||
|
||||
if (pImplicitArgs) {
|
||||
memcpy_s(pImplicitArgs.get(), ImplicitArgs::getSize(), pSourceKernel->getImplicitArgs(), ImplicitArgs::getSize());
|
||||
memcpy_s(pImplicitArgs.get(), pImplicitArgs->getSize(), pSourceKernel->getImplicitArgs(), pImplicitArgs->getSize());
|
||||
}
|
||||
this->isBuiltIn = pSourceKernel->isBuiltIn;
|
||||
|
||||
@@ -2275,7 +2274,7 @@ const HardwareInfo &Kernel::getHardwareInfo() const {
|
||||
void Kernel::setWorkDim(uint32_t workDim) {
|
||||
patchNonPointer<uint32_t, uint32_t>(getCrossThreadDataRef(), getDescriptor().payloadMappings.dispatchTraits.workDim, workDim);
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->numWorkDim = workDim;
|
||||
pImplicitArgs->setNumWorkDim(workDim);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2284,9 +2283,7 @@ void Kernel::setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t glob
|
||||
getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset,
|
||||
{globalWorkOffsetX, globalWorkOffsetY, globalWorkOffsetZ});
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->globalOffsetX = globalWorkOffsetX;
|
||||
pImplicitArgs->globalOffsetY = globalWorkOffsetY;
|
||||
pImplicitArgs->globalOffsetZ = globalWorkOffsetZ;
|
||||
pImplicitArgs->setGlobalOffset(globalWorkOffsetX, globalWorkOffsetY, globalWorkOffsetZ);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2295,9 +2292,7 @@ void Kernel::setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWo
|
||||
getDescriptor().payloadMappings.dispatchTraits.globalWorkSize,
|
||||
{globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ});
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->globalSizeX = globalWorkSizeX;
|
||||
pImplicitArgs->globalSizeY = globalWorkSizeY;
|
||||
pImplicitArgs->globalSizeZ = globalWorkSizeZ;
|
||||
pImplicitArgs->setGlobalSize(globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2306,9 +2301,7 @@ void Kernel::setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkS
|
||||
getDescriptor().payloadMappings.dispatchTraits.localWorkSize,
|
||||
{localWorkSizeX, localWorkSizeY, localWorkSizeZ});
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->localSizeX = localWorkSizeX;
|
||||
pImplicitArgs->localSizeY = localWorkSizeY;
|
||||
pImplicitArgs->localSizeZ = localWorkSizeZ;
|
||||
pImplicitArgs->setLocalSize(localWorkSizeX, localWorkSizeY, localWorkSizeZ);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2329,9 +2322,7 @@ void Kernel::setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGro
|
||||
getDescriptor().payloadMappings.dispatchTraits.numWorkGroups,
|
||||
{numWorkGroupsX, numWorkGroupsY, numWorkGroupsZ});
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->groupCountX = numWorkGroupsX;
|
||||
pImplicitArgs->groupCountY = numWorkGroupsY;
|
||||
pImplicitArgs->groupCountZ = numWorkGroupsZ;
|
||||
pImplicitArgs->setGroupCount(numWorkGroupsX, numWorkGroupsY, numWorkGroupsZ);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ void PrintfHandler::prepareDispatch(const MultiDispatchInfo &multiDispatchInfo)
|
||||
}
|
||||
auto pImplicitArgs = kernel->getImplicitArgs();
|
||||
if (pImplicitArgs) {
|
||||
pImplicitArgs->printfBufferPtr = printfSurface->getGpuAddress();
|
||||
pImplicitArgs->setPrintfBuffer(printfSurface->getGpuAddress());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2024 Intel Corporation
|
||||
* Copyright (C) 2019-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -1169,8 +1169,8 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
||||
ClDeviceFixture::setUp();
|
||||
indirectHeapAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});
|
||||
|
||||
memset(&expectedImplicitArgs, 0, sizeof(ImplicitArgs));
|
||||
expectedImplicitArgs.structSize = ImplicitArgs::getSize();
|
||||
memset(&expectedImplicitArgs, 0, sizeof(expectedImplicitArgs));
|
||||
expectedImplicitArgs.header.structSize = ImplicitArgsV0::getSize();
|
||||
|
||||
expectedImplicitArgs.numWorkDim = 3;
|
||||
expectedImplicitArgs.simdWidth = 32;
|
||||
@@ -1242,7 +1242,7 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
||||
}
|
||||
}
|
||||
|
||||
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
|
||||
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}};
|
||||
GraphicsAllocation *indirectHeapAllocation = nullptr;
|
||||
std::array<uint8_t, 3> workgroupDimOrder{0, 1, 2};
|
||||
uint32_t implicitArgsProgrammingSize = 0u;
|
||||
@@ -1251,18 +1251,18 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenXeHpAndLaterPlatformWhenSendingIndirectStateForKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithLocalIds) {
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_GEN12LP_CORE, HardwareCommandsImplicitArgsTests, givenPreXeHpPlatformWhenSendingIndirectStateForKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithoutLocalIds) {
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgs::getSize(), MemoryConstants::cacheLineSize));
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize));
|
||||
|
||||
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
|
||||
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
|
||||
@@ -1287,18 +1287,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsAndHwLocalIdsGenerationWhenSendingIndirectStateThenLocalIdsAreGeneratedAndCorrectlyProgrammedInCrossThreadData) {
|
||||
@@ -1322,18 +1322,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgs::getSize(), MemoryConstants::cacheLineSize);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
alignedFree(expectedLocalIds);
|
||||
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithImplicitArgsWhenSendingIndirectStateWithSimd1ThenLocalIdsAreGeneratedCorrectly) {
|
||||
@@ -1355,12 +1355,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgs::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
|
||||
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
|
||||
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
using HardwareCommandsTestXeHpAndLater = HardwareCommandsTest;
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#include "shared/test/common/fixtures/memory_management_fixture.h"
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/helpers/gtest_helpers.h"
|
||||
#include "shared/test/common/helpers/raii_gfx_core_helper.h"
|
||||
#include "shared/test/common/libult/ult_command_stream_receiver.h"
|
||||
#include "shared/test/common/mocks/mock_allocation_properties.h"
|
||||
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
|
||||
@@ -3914,8 +3915,8 @@ TEST_F(KernelImplicitArgsTest, WhenKernelRequiresImplicitArgsThenImplicitArgsStr
|
||||
|
||||
ASSERT_NE(nullptr, pImplicitArgs);
|
||||
|
||||
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize(), 0, 0, 32};
|
||||
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgs::getSize()));
|
||||
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}, 0, 32};
|
||||
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3933,7 +3934,7 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenSettingKernelParam
|
||||
|
||||
ASSERT_NE(nullptr, pImplicitArgs);
|
||||
|
||||
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
|
||||
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}};
|
||||
expectedImplicitArgs.numWorkDim = 3;
|
||||
expectedImplicitArgs.simdWidth = 32;
|
||||
expectedImplicitArgs.localSizeX = 4;
|
||||
@@ -3955,7 +3956,53 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenSettingKernelParam
|
||||
kernel.setGlobalWorkOffsetValues(1, 2, 3);
|
||||
kernel.setNumWorkGroupsValues(3, 2, 1);
|
||||
|
||||
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
// Verifies that, when the GfxCoreHelper reports ImplicitArgs version 1, the
// kernel's work-dimension / size / offset / group-count setters fill the
// ImplicitArgsV1 layout (compared byte-wise up to ImplicitArgsV1::getSize()).
HWTEST_F(KernelImplicitArgsTest, givenGfxCoreRequiringImplicitArgsV1WhenSettingKernelParamsThenImplicitArgsAreProperlySet) {
    auto pKernelInfo = std::make_unique<MockKernelInfo>();
    pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 32;
    pKernelInfo->kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;

    // Core helper stub that forces the V1 layout regardless of the tested family.
    struct MockGfxCoreHelper : NEO::GfxCoreHelperHw<FamilyType> {
        uint32_t getImplicitArgsVersion() const override {
            return 1;
        }
    };

    // Installed before the kernel is created so that the kernel is built with the stub in place.
    RAIIGfxCoreHelperFactory<MockGfxCoreHelper> raii(*pClDevice->getDevice().getExecutionEnvironment()->rootDeviceEnvironments[0]);

    MockContext context(pClDevice);
    MockProgram program(&context, false, toClDeviceVector(*pClDevice));

    MockKernel kernel(&program, *pKernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());
    auto pImplicitArgs = kernel.getImplicitArgs();

    ASSERT_NE(nullptr, pImplicitArgs);

    // Header is {structSize, structVersion}; every field not assigned below stays zero.
    ImplicitArgsV1 expectedImplicitArgs = {{ImplicitArgsV1::getSize(), 1}};
    expectedImplicitArgs.numWorkDim = 3;
    expectedImplicitArgs.localSizeX = 4;
    expectedImplicitArgs.localSizeY = 5;
    expectedImplicitArgs.localSizeZ = 6;
    expectedImplicitArgs.globalSizeX = 7;
    expectedImplicitArgs.globalSizeY = 8;
    expectedImplicitArgs.globalSizeZ = 9;
    expectedImplicitArgs.globalOffsetX = 1;
    expectedImplicitArgs.globalOffsetY = 2;
    expectedImplicitArgs.globalOffsetZ = 3;
    expectedImplicitArgs.groupCountX = 3;
    expectedImplicitArgs.groupCountY = 2;
    expectedImplicitArgs.groupCountZ = 1;

    kernel.setWorkDim(3);
    kernel.setLocalWorkSizeValues(4, 5, 6);
    kernel.setGlobalWorkSizeValues(7, 8, 9);
    kernel.setGlobalWorkOffsetValues(1, 2, 3);
    kernel.setNumWorkGroupsValues(3, 2, 1);

    EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgsV1::getSize()));
}
|
||||
|
||||
TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenCloneKernelThenImplicitArgsAreCopied) {
|
||||
@@ -3971,7 +4018,7 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenCloneKernelThenImp
|
||||
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
|
||||
ASSERT_EQ(CL_SUCCESS, kernel2.initialize());
|
||||
|
||||
ImplicitArgs expectedImplicitArgs = {ImplicitArgs::getSize()};
|
||||
ImplicitArgsV0 expectedImplicitArgs = {{ImplicitArgsV0::getSize(), 0}};
|
||||
expectedImplicitArgs.numWorkDim = 3;
|
||||
expectedImplicitArgs.simdWidth = 32;
|
||||
expectedImplicitArgs.localSizeX = 4;
|
||||
@@ -3999,7 +4046,7 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenCloneKernelThenImp
|
||||
|
||||
ASSERT_NE(nullptr, pImplicitArgs);
|
||||
|
||||
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgs::getSize()));
|
||||
EXPECT_EQ(0, memcmp(&expectedImplicitArgs, pImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
||||
TEST_F(KernelImplicitArgsTest, givenKernelWithoutImplicitArgsWhenSettingKernelParamsThenImplicitArgsAreNotSet) {
|
||||
|
||||
@@ -112,7 +112,7 @@ TEST_F(PrintfHandlerTests, givenKernelWithImplicitArgsWhenPreparingPrintfHandler
|
||||
auto pImplicitArgs = kernel.getImplicitArgs();
|
||||
ASSERT_NE(nullptr, pImplicitArgs);
|
||||
|
||||
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->printfBufferPtr);
|
||||
EXPECT_EQ(printfSurface->getGpuAddress(), pImplicitArgs->v0.printfBufferPtr);
|
||||
}
|
||||
|
||||
HWTEST_F(PrintfHandlerTests, givenEnabledStatelessCompressionWhenPrintEnqueueOutputIsCalledThenBCSEngineIsUsedToDecompressPrintfOutput) {
|
||||
|
||||
@@ -577,11 +577,22 @@ void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t
|
||||
UNRECOVERABLE_IF(NEO::isValidOffset(kernelDescriptor.payloadMappings.dispatchTraits.workDim) && (kernelDescriptor.payloadMappings.dispatchTraits.workDim & 0b11) != 0u);
|
||||
setWorkDimIndirect(container, kernelDescriptor.payloadMappings.dispatchTraits.workDim, crossThreadDataGpuVa, dispatchInterface->getGroupSize());
|
||||
if (implicitArgsGpuPtr) {
|
||||
CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgs, groupCountX), offsetof(ImplicitArgs, groupCountY), offsetof(ImplicitArgs, groupCountZ)};
|
||||
CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgs, globalSizeX), offsetof(ImplicitArgs, globalSizeY), offsetof(ImplicitArgs, globalSizeZ)};
|
||||
setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr);
|
||||
setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
||||
setWorkDimIndirect(container, offsetof(ImplicitArgs, numWorkDim), implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
||||
const auto version = container.getDevice()->getGfxCoreHelper().getImplicitArgsVersion();
|
||||
if (version == 0) {
|
||||
constexpr CrossThreadDataOffset groupCountOffset[] = {offsetof(ImplicitArgsV0, groupCountX), offsetof(ImplicitArgsV0, groupCountY), offsetof(ImplicitArgsV0, groupCountZ)};
|
||||
constexpr CrossThreadDataOffset globalSizeOffset[] = {offsetof(ImplicitArgsV0, globalSizeX), offsetof(ImplicitArgsV0, globalSizeY), offsetof(ImplicitArgsV0, globalSizeZ)};
|
||||
constexpr auto numWorkDimOffset = offsetof(ImplicitArgsV0, numWorkDim);
|
||||
setGroupCountIndirect(container, groupCountOffset, implicitArgsGpuPtr);
|
||||
setGlobalWorkSizeIndirect(container, globalSizeOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
||||
setWorkDimIndirect(container, numWorkDimOffset, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
||||
} else if (version == 1) {
|
||||
constexpr CrossThreadDataOffset groupCountOffsetV1[] = {offsetof(ImplicitArgsV1, groupCountX), offsetof(ImplicitArgsV1, groupCountY), offsetof(ImplicitArgsV1, groupCountZ)};
|
||||
constexpr CrossThreadDataOffset globalSizeOffsetV1[] = {offsetof(ImplicitArgsV1, globalSizeX), offsetof(ImplicitArgsV1, globalSizeY), offsetof(ImplicitArgsV1, globalSizeZ)};
|
||||
constexpr auto numWorkDimOffsetV1 = offsetof(ImplicitArgsV1, numWorkDim);
|
||||
setGroupCountIndirect(container, groupCountOffsetV1, implicitArgsGpuPtr);
|
||||
setGlobalWorkSizeIndirect(container, globalSizeOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
||||
setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -264,7 +264,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
|
||||
offsetThreadData = (is64bit ? heap->getHeapGpuStartOffset() : heap->getHeapGpuBase()) + static_cast<uint64_t>(heap->getUsed() - sizeThreadData - args.reserveExtraPayloadSpace);
|
||||
if (pImplicitArgs) {
|
||||
offsetThreadData -= sizeForImplicitArgsStruct;
|
||||
pImplicitArgs->localIdTablePtr = heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize;
|
||||
pImplicitArgs->setLocalIdTablePtr(heap->getGraphicsAllocation()->getGpuAddress() + heap->getUsed() - iohRequiredSize);
|
||||
EncodeDispatchKernel<Family>::patchScratchAddressInImplicitArgs<heaplessModeEnabled>(*pImplicitArgs, scratchAddressForImmediatePatching, args.immediateScratchAddressPatching);
|
||||
|
||||
ptr = NEO::ImplicitArgsHelper::patchImplicitArgs(ptr, *pImplicitArgs, kernelDescriptor, std::make_pair(localIdsGenerationByRuntime, requiredWorkgroupOrder), rootDeviceEnvironment, &args.outImplicitArgsPtr);
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2024 Intel Corporation
|
||||
* Copyright (C) 2019-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -656,7 +656,15 @@ void Linker::resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, De
|
||||
UNRECOVERABLE_IF(!pDevice);
|
||||
kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = kernelDescriptor.kernelAttributes.flags.useStackCalls || pDevice->getDebugger() != nullptr;
|
||||
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
|
||||
*pImplicitArgsReloc = ImplicitArgs::getSize();
|
||||
auto implicitArgsSize = 0;
|
||||
if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 0) {
|
||||
implicitArgsSize = ImplicitArgsV0::getSize();
|
||||
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
|
||||
implicitArgsSize = ImplicitArgsV1::getSize();
|
||||
} else {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
*pImplicitArgsReloc = implicitArgsSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,6 +198,7 @@ class GfxCoreHelper {
|
||||
|
||||
virtual uint32_t getDeviceTimestampWidth() const = 0;
|
||||
virtual void alignThreadGroupCountToDssSize(uint32_t &threadCount, uint32_t dssCount, uint32_t threadsPerDss, uint32_t threadGroupSize) const = 0;
|
||||
virtual uint32_t getImplicitArgsVersion() const = 0;
|
||||
|
||||
virtual ~GfxCoreHelper() = default;
|
||||
|
||||
@@ -436,6 +437,7 @@ class GfxCoreHelperHw : public GfxCoreHelper {
|
||||
bool usmCompressionSupported(const NEO::HardwareInfo &hwInfo) const override;
|
||||
|
||||
uint32_t getDeviceTimestampWidth() const override;
|
||||
uint32_t getImplicitArgsVersion() const override;
|
||||
|
||||
~GfxCoreHelperHw() override = default;
|
||||
|
||||
|
||||
@@ -833,4 +833,9 @@ uint32_t GfxCoreHelperHw<Family>::getInternalCopyEngineIndex(const HardwareInfo
|
||||
return std::min(defaultInternalCopyEngineIndex, highestAvailableIndex);
|
||||
}
|
||||
|
||||
template <typename Family>
|
||||
uint32_t GfxCoreHelperHw<Family>::getImplicitArgsVersion() const {
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -11,13 +11,20 @@
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <type_traits>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
struct alignas(32) ImplicitArgs {
|
||||
// Two-byte header that starts every versioned ImplicitArgs layout.
struct alignas(1) ImplicitArgsHeader {
    uint8_t structSize;    // size in bytes of the versioned struct, as returned by its getSize()
    uint8_t structVersion; // layout version; selects which ImplicitArgsV* member is active
};
|
||||
|
||||
static_assert(sizeof(ImplicitArgsHeader) == (2 * sizeof(uint8_t)));
|
||||
|
||||
struct alignas(32) ImplicitArgsV0 {
|
||||
ImplicitArgsHeader header;
|
||||
uint8_t numWorkDim;
|
||||
uint8_t simdWidth;
|
||||
uint32_t localSizeX;
|
||||
@@ -39,12 +46,195 @@ struct alignas(32) ImplicitArgs {
|
||||
uint64_t assertBufferPtr;
|
||||
uint8_t reserved[16];
|
||||
|
||||
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgs, reserved))); }
|
||||
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgsV0, reserved))); }
|
||||
};
|
||||
|
||||
static_assert(std::alignment_of_v<ImplicitArgsV0> == 32, "Implicit args size need to be aligned to 32");
|
||||
static_assert(sizeof(ImplicitArgsV0) == (32 * sizeof(uint32_t)));
|
||||
static_assert(ImplicitArgsV0::getSize() == (28 * sizeof(uint32_t)));
|
||||
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV0>);
|
||||
|
||||
struct alignas(32) ImplicitArgsV1 {
|
||||
ImplicitArgsHeader header;
|
||||
uint8_t numWorkDim;
|
||||
uint8_t padding0;
|
||||
uint32_t localSizeX;
|
||||
uint32_t localSizeY;
|
||||
uint32_t localSizeZ;
|
||||
uint64_t globalSizeX;
|
||||
uint64_t globalSizeY;
|
||||
uint64_t globalSizeZ;
|
||||
uint64_t printfBufferPtr;
|
||||
uint64_t globalOffsetX;
|
||||
uint64_t globalOffsetY;
|
||||
uint64_t globalOffsetZ;
|
||||
uint64_t localIdTablePtr;
|
||||
uint32_t groupCountX;
|
||||
uint32_t groupCountY;
|
||||
uint32_t groupCountZ;
|
||||
uint32_t padding1;
|
||||
uint64_t rtGlobalBufferPtr;
|
||||
uint64_t assertBufferPtr;
|
||||
uint8_t reserved[44];
|
||||
|
||||
static constexpr uint8_t getSize() { return static_cast<uint8_t>(offsetof(ImplicitArgsV1, reserved)); }
|
||||
};
|
||||
|
||||
static_assert(std::alignment_of_v<ImplicitArgsV1> == 32, "Implicit args size need to be aligned to 32");
|
||||
static_assert(sizeof(ImplicitArgsV1) == (40 * sizeof(uint32_t)));
|
||||
static_assert(ImplicitArgsV1::getSize() == (28 * sizeof(uint32_t)));
|
||||
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV1>);
|
||||
|
||||
struct alignas(32) ImplicitArgs {
|
||||
union {
|
||||
ImplicitArgsV0 v0;
|
||||
ImplicitArgsV1 v1;
|
||||
};
|
||||
|
||||
void initializeHeader(uint32_t version) {
|
||||
if (version == 0) {
|
||||
v0.header.structSize = ImplicitArgsV0::getSize();
|
||||
v0.header.structVersion = 0;
|
||||
} else if (version == 1) {
|
||||
v1.header.structSize = NEO::ImplicitArgsV1::getSize();
|
||||
v1.header.structVersion = 1;
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t getSize() const {
|
||||
if (v0.header.structVersion == 0) {
|
||||
return v0.header.structSize;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
return v1.header.structSize;
|
||||
}
|
||||
|
||||
DEBUG_BREAK_IF(true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void setNumWorkDim(uint32_t numWorkDim) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.numWorkDim = numWorkDim;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.numWorkDim = numWorkDim;
|
||||
}
|
||||
}
|
||||
|
||||
void setSimdWidth(uint32_t simd) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.simdWidth = simd;
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<uint32_t> getSimdWidth() const {
|
||||
if (v0.header.structVersion == 0) {
|
||||
return v0.simdWidth;
|
||||
}
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
void setLocalSize(uint32_t x, uint32_t y, uint32_t z) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.localSizeX = x;
|
||||
v0.localSizeY = y;
|
||||
v0.localSizeZ = z;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.localSizeX = x;
|
||||
v1.localSizeY = y;
|
||||
v1.localSizeZ = z;
|
||||
}
|
||||
}
|
||||
|
||||
void getLocalSize(uint32_t &x, uint32_t &y, uint32_t &z) const {
|
||||
if (v0.header.structVersion == 0) {
|
||||
x = v0.localSizeX;
|
||||
y = v0.localSizeY;
|
||||
z = v0.localSizeZ;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
x = v1.localSizeX;
|
||||
y = v1.localSizeY;
|
||||
z = v1.localSizeZ;
|
||||
}
|
||||
}
|
||||
|
||||
void setGlobalSize(uint32_t x, uint32_t y, uint32_t z) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.globalSizeX = x;
|
||||
v0.globalSizeY = y;
|
||||
v0.globalSizeZ = z;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.globalSizeX = x;
|
||||
v1.globalSizeY = y;
|
||||
v1.globalSizeZ = z;
|
||||
}
|
||||
}
|
||||
|
||||
void setGlobalOffset(uint32_t x, uint32_t y, uint32_t z) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.globalOffsetX = x;
|
||||
v0.globalOffsetY = y;
|
||||
v0.globalOffsetZ = z;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.globalOffsetX = x;
|
||||
v1.globalOffsetY = y;
|
||||
v1.globalOffsetZ = z;
|
||||
}
|
||||
}
|
||||
void setGroupCount(uint32_t x, uint32_t y, uint32_t z) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.groupCountX = x;
|
||||
v0.groupCountY = y;
|
||||
v0.groupCountZ = z;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.groupCountX = x;
|
||||
v1.groupCountY = y;
|
||||
v1.groupCountZ = z;
|
||||
}
|
||||
}
|
||||
|
||||
void setLocalIdTablePtr(uint64_t address) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.localIdTablePtr = address;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.localIdTablePtr = address;
|
||||
}
|
||||
}
|
||||
void setPrintfBuffer(uint64_t address) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.printfBufferPtr = address;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.printfBufferPtr = address;
|
||||
}
|
||||
}
|
||||
|
||||
void setRtGlobalBufferPtr(uint64_t address) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.rtGlobalBufferPtr = address;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.rtGlobalBufferPtr = address;
|
||||
}
|
||||
}
|
||||
|
||||
void setAssertBufferPtr(uint64_t address) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.assertBufferPtr = address;
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
v1.assertBufferPtr = address;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static_assert(std::alignment_of_v<ImplicitArgs> == 32, "Implicit args size need to be aligned to 32");
|
||||
static_assert(sizeof(ImplicitArgs) == (32 * sizeof(uint32_t)));
|
||||
static_assert(ImplicitArgs::getSize() == (28 * sizeof(uint32_t)));
|
||||
static_assert(NEO::TypeTraits::isPodV<ImplicitArgs>);
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022-2024 Intel Corporation
|
||||
* Copyright (C) 2022-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -48,7 +48,8 @@ uint32_t getSizeForImplicitArgsStruct(const ImplicitArgs *pImplicitArgs, const K
|
||||
if (!pImplicitArgs) {
|
||||
return 0;
|
||||
}
|
||||
auto implicitArgsSize = static_cast<uint32_t>(ImplicitArgs::getSize());
|
||||
auto implicitArgsSize = pImplicitArgs->getSize();
|
||||
|
||||
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||
if (patchImplicitArgsBufferInCrossThread) {
|
||||
return alignUp(implicitArgsSize, MemoryConstants::cacheLineSize);
|
||||
@@ -65,10 +66,19 @@ uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const
|
||||
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||
uint32_t localIdsSize = 0;
|
||||
if (false == patchImplicitArgsBufferInCrossThread) {
|
||||
auto simdSize = pImplicitArgs->simdWidth;
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
auto simdSize = 32u;
|
||||
auto grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize);
|
||||
Vec3<size_t> localWorkSize = {pImplicitArgs->localSizeX, pImplicitArgs->localSizeY, pImplicitArgs->localSizeZ};
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
|
||||
uint32_t lws[3] = {0, 0, 0};
|
||||
pImplicitArgs->getLocalSize(lws[0], lws[1], lws[2]);
|
||||
Vec3<size_t> localWorkSize = {lws[0], lws[1], lws[2]};
|
||||
|
||||
if (pImplicitArgs->v0.header.structVersion == 0) {
|
||||
simdSize = pImplicitArgs->v0.simdWidth;
|
||||
grfSize = NEO::ImplicitArgsHelper::getGrfSize(simdSize);
|
||||
}
|
||||
|
||||
auto itemsInGroup = Math::computeTotalElementsCount(localWorkSize);
|
||||
localIdsSize = static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(simdSize, grfSize, grfCount, 3u, itemsInGroup, isHwLocalIdGeneration, rootDeviceEnvironment));
|
||||
localIdsSize = alignUp(localIdsSize, MemoryConstants::cacheLineSize);
|
||||
@@ -81,9 +91,14 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
|
||||
auto totalSizeToProgram = getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, localIdsGeneratedByHw, rootDeviceEnvironment);
|
||||
auto retVal = ptrOffset(ptrToPatch, totalSizeToProgram);
|
||||
|
||||
auto size = implicitArgs.v0.header.structSize;
|
||||
|
||||
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||
if (!patchImplicitArgsBufferInCrossThread) {
|
||||
auto simdSize = implicitArgs.simdWidth;
|
||||
|
||||
uint32_t lws[3] = {0, 0, 0};
|
||||
implicitArgs.getLocalSize(lws[0], lws[1], lws[2]);
|
||||
auto simdSize = implicitArgs.getSimdWidth().value_or(32);
|
||||
auto grfSize = getGrfSize(simdSize);
|
||||
auto grfCount = kernelDescriptor.kernelAttributes.numGrfRequired;
|
||||
auto dimensionOrder = getDimensionOrderForLocalIds(kernelDescriptor.kernelAttributes.workgroupDimensionsOrder, hwGenerationOfLocalIdsParams);
|
||||
@@ -91,12 +106,13 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
|
||||
NEO::generateLocalIDs(
|
||||
ptrToPatch,
|
||||
simdSize,
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(implicitArgs.localSizeX),
|
||||
static_cast<uint16_t>(implicitArgs.localSizeY),
|
||||
static_cast<uint16_t>(implicitArgs.localSizeZ)}},
|
||||
std::array<uint16_t, 3>{{static_cast<uint16_t>(lws[0]),
|
||||
static_cast<uint16_t>(lws[1]),
|
||||
static_cast<uint16_t>(lws[2])}},
|
||||
dimensionOrder,
|
||||
false, grfSize, grfCount, rootDeviceEnvironment);
|
||||
auto sizeForLocalIdsProgramming = totalSizeToProgram - ImplicitArgs::getSize();
|
||||
|
||||
auto sizeForLocalIdsProgramming = totalSizeToProgram - implicitArgs.getSize();
|
||||
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
|
||||
}
|
||||
|
||||
@@ -104,7 +120,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
|
||||
*outImplicitArgsAddress = ptrToPatch;
|
||||
}
|
||||
|
||||
memcpy_s(ptrToPatch, ImplicitArgs::getSize(), &implicitArgs, ImplicitArgs::getSize());
|
||||
memcpy_s(ptrToPatch, size, &implicitArgs, size);
|
||||
|
||||
return retVal;
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2019-2024 Intel Corporation
|
||||
* Copyright (C) 2019-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -18,8 +18,10 @@
|
||||
#include "shared/test/common/helpers/debug_manager_state_restore.h"
|
||||
#include "shared/test/common/helpers/default_hw_info.h"
|
||||
#include "shared/test/common/helpers/gtest_helpers.h"
|
||||
#include "shared/test/common/helpers/raii_gfx_core_helper.h"
|
||||
#include "shared/test/common/mocks/mock_device.h"
|
||||
#include "shared/test/common/mocks/mock_elf.h"
|
||||
#include "shared/test/common/mocks/mock_execution_environment.h"
|
||||
#include "shared/test/common/mocks/mock_graphics_allocation.h"
|
||||
#include "shared/test/common/mocks/mock_modules_zebin.h"
|
||||
#include "shared/test/common/mocks/ult_device_factory.h"
|
||||
@@ -2080,12 +2082,138 @@ TEST_F(LinkerTests, givenImplicitArgRelocationAndStackCallsThenPatchRelocationWi
|
||||
EXPECT_EQ(0U, relocatedSymbols.size());
|
||||
|
||||
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
|
||||
EXPECT_EQ(ImplicitArgs::getSize(), *addressToPatch);
|
||||
EXPECT_EQ(ImplicitArgsV0::getSize(), *addressToPatch);
|
||||
EXPECT_EQ(initData, *(addressToPatch - 1));
|
||||
EXPECT_EQ(initData, *(addressToPatch + 1));
|
||||
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
|
||||
}
|
||||
|
||||
// Verifies that, when the GfxCoreHelper reports ImplicitArgs version 1, linking
// an implicit-args relocation patches ImplicitArgsV1::getSize() into the
// instruction segment and leaves the neighbouring dwords untouched.
HWTEST_F(LinkerTests, givenImplicitArgRelocationAndImplicitArgsV1WhenLinkingThenPatchRelocationWithSizeOfImplicitArgsV1) {
    DebugManagerStateRestore restore;
    // Core helper stub that forces the V1 layout regardless of the tested family.
    struct MockGfxCoreHelper : NEO::GfxCoreHelperHw<FamilyType> {
        uint32_t getImplicitArgsVersion() const override {
            return 1;
        }
    };

    NEO::LinkerInput linkerInput;

    // Single 32-bit relocation against the implicit-args symbol, at byte offset 8.
    vISA::GenRelocEntry reloc = {};
    std::string relocationName = implicitArgsRelocationSymbolName;
    memcpy_s(reloc.r_symbol, 1024, relocationName.c_str(), relocationName.size());
    reloc.r_offset = 8;
    reloc.r_type = vISA::GenRelocType::R_SYM_ADDR_32;

    vISA::GenRelocEntry relocs[] = {reloc};
    constexpr uint32_t numRelocations = 1;
    bool decodeRelocSuccess = linkerInput.decodeRelocationTable(&relocs, numRelocations, 0);
    EXPECT_TRUE(decodeRelocSuccess);

    NEO::Linker linker(linkerInput);
    NEO::Linker::SegmentInfo globalVarSegment, globalConstSegment, exportedFuncSegment;
    globalVarSegment.gpuAddress = 8;
    globalVarSegment.segmentSize = 64;
    globalConstSegment.gpuAddress = 128;
    globalConstSegment.segmentSize = 256;
    exportedFuncSegment.gpuAddress = 4096;
    exportedFuncSegment.segmentSize = 1024;
    NEO::Linker::UnresolvedExternals unresolvedExternals;
    NEO::Linker::KernelDescriptorsT kernelDescriptors;
    NEO::Linker::ExternalFunctionsT externalFunctions;
    KernelDescriptor kernelDescriptor;
    kernelDescriptors.push_back(&kernelDescriptor);
    kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
    kernelDescriptor.kernelAttributes.flags.useStackCalls = true;

    HardwareInfo hwInfo = *defaultHwInfo;
    MockExecutionEnvironment executionEnvironment(&hwInfo, false, 1);
    executionEnvironment.incRefInternal();

    UltDeviceFactory deviceFactory{1, 0, executionEnvironment};
    auto rootDeviceIndex = deviceFactory.rootDevices[0]->getRootDeviceIndex();
    // Swap in the V1-reporting helper for the device that is passed to link().
    RAIIGfxCoreHelperFactory<MockGfxCoreHelper> raii(*deviceFactory.rootDevices[0]->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex]);

    // Segment is pre-filled with 0x77 bytes so that any unintended write is detectable.
    std::vector<char> instructionSegment;
    uint32_t initData = 0x77777777;
    instructionSegment.resize(32, static_cast<char>(initData));
    NEO::Linker::PatchableSegment seg0;
    seg0.hostPointer = instructionSegment.data();
    seg0.segmentSize = instructionSegment.size();
    NEO::Linker::PatchableSegments patchableInstructionSegments{seg0};

    auto linkResult = linker.link(globalVarSegment, globalConstSegment, exportedFuncSegment, {},
                                  nullptr, nullptr, patchableInstructionSegments, unresolvedExternals,
                                  deviceFactory.rootDevices[0], nullptr, 0, nullptr, 0, kernelDescriptors, externalFunctions);
    EXPECT_EQ(NEO::LinkingStatus::linkedFully, linkResult);

    auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);

    // The patched dword holds the V1 size; the dwords on either side keep the fill pattern.
    EXPECT_EQ(ImplicitArgsV1::getSize(), *addressToPatch);
    EXPECT_EQ(initData, *(addressToPatch - 1));
    EXPECT_EQ(initData, *(addressToPatch + 1));

    EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
}
|
||||
|
||||
// Links a kernel whose relocation table references the implicit-args symbol while
// the GfxCoreHelper reports ImplicitArgs version 2, and expects link() to throw.
HWTEST_F(LinkerTests, givenImplicitArgRelocationAndImplicitArgsWithUnknownVersionWhenLinkingThenUnrecoverableIfCalled) {
    DebugManagerStateRestore restore;

    // Core-helper stub whose only override is the reported ImplicitArgs version.
    struct MockGfxCoreHelper : NEO::GfxCoreHelperHw<FamilyType> {
        uint32_t getImplicitArgsVersion() const override {
            return 2; // unknown version
        }
    };

    NEO::LinkerInput linkerInput;

    // One 32-bit symbol-address relocation at offset 8, named after the implicit-args symbol.
    vISA::GenRelocEntry reloc = {};
    reloc.r_type = vISA::GenRelocType::R_SYM_ADDR_32;
    reloc.r_offset = 8;
    std::string relocationName = implicitArgsRelocationSymbolName;
    memcpy_s(reloc.r_symbol, 1024, relocationName.c_str(), relocationName.size());

    vISA::GenRelocEntry relocs[] = {reloc};
    constexpr uint32_t numRelocations = 1;
    EXPECT_TRUE(linkerInput.decodeRelocationTable(&relocs, numRelocations, 0));

    NEO::Linker linker(linkerInput);

    NEO::Linker::SegmentInfo globalVarSegment;
    NEO::Linker::SegmentInfo globalConstSegment;
    NEO::Linker::SegmentInfo exportedFuncSegment;
    exportedFuncSegment.segmentSize = 1024;
    exportedFuncSegment.gpuAddress = 4096;
    globalConstSegment.segmentSize = 256;
    globalConstSegment.gpuAddress = 128;
    globalVarSegment.segmentSize = 64;
    globalVarSegment.gpuAddress = 8;

    NEO::Linker::UnresolvedExternals unresolvedExternals;
    NEO::Linker::KernelDescriptorsT kernelDescriptors;
    NEO::Linker::ExternalFunctionsT externalFunctions;

    // The descriptor is registered by pointer, so the flags may be set before the push.
    KernelDescriptor kernelDescriptor;
    kernelDescriptor.kernelAttributes.flags.useStackCalls = true;
    kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = true;
    kernelDescriptors.push_back(&kernelDescriptor);

    HardwareInfo hwInfo = *defaultHwInfo;
    MockExecutionEnvironment executionEnvironment(&hwInfo, false, 1);
    // NOTE(review): presumably the extra reference keeps this stack-allocated environment
    // from being released through ref-counting when the devices are destroyed - confirm.
    executionEnvironment.incRefInternal();

    UltDeviceFactory deviceFactory{1, 0, executionEnvironment};
    auto *rootDevice = deviceFactory.rootDevices[0];
    auto rootDeviceIndex = rootDevice->getRootDeviceIndex();
    // Swap in the mock core helper for the root device environment for the rest of the test.
    RAIIGfxCoreHelperFactory<MockGfxCoreHelper> raii(*rootDevice->getExecutionEnvironment()->rootDeviceEnvironments[rootDeviceIndex]);

    // 32-byte instruction buffer pre-filled with the low byte of the pattern.
    uint32_t fillPattern = 0x77777777;
    std::vector<char> instructionSegment(32, static_cast<char>(fillPattern));

    NEO::Linker::PatchableSegment instructionPatchTarget;
    instructionPatchTarget.hostPointer = instructionSegment.data();
    instructionPatchTarget.segmentSize = instructionSegment.size();
    NEO::Linker::PatchableSegments patchableInstructionSegments{instructionPatchTarget};

    EXPECT_THROW(linker.link(globalVarSegment, globalConstSegment, exportedFuncSegment, {},
                             nullptr, nullptr, patchableInstructionSegments, unresolvedExternals,
                             rootDevice, nullptr, 0, nullptr, 0, kernelDescriptors, externalFunctions),
                 std::exception);
}
|
||||
|
||||
// Fixture name for the TEST_F cases below; it is the plain ::testing::Test base with nothing added.
using LinkerDebuggingSupportedTests = ::testing::Test;
|
||||
|
||||
TEST_F(LinkerDebuggingSupportedTests, givenImplicitArgRelocationAndEnabledDebuggerThenPatchRelocationWithSizeOfImplicitArgStructAndUpdateKernelDescriptor) {
|
||||
@@ -2142,7 +2270,7 @@ TEST_F(LinkerDebuggingSupportedTests, givenImplicitArgRelocationAndEnabledDebugg
|
||||
EXPECT_EQ(0U, relocatedSymbols.size());
|
||||
|
||||
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
|
||||
EXPECT_EQ(ImplicitArgs::getSize(), *addressToPatch);
|
||||
EXPECT_EQ(ImplicitArgsV0::getSize(), *addressToPatch);
|
||||
EXPECT_EQ(initData, *(addressToPatch - 1));
|
||||
EXPECT_EQ(initData, *(addressToPatch + 1));
|
||||
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
|
||||
@@ -2349,7 +2477,7 @@ TEST_F(LinkerTests, givenMultipleImplicitArgsRelocationsWithinSingleKernelWhenLi
|
||||
|
||||
for (const auto &reloc : relocs) {
|
||||
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
|
||||
EXPECT_EQ(ImplicitArgs::getSize(), *addressToPatch);
|
||||
EXPECT_EQ(ImplicitArgsV0::getSize(), *addressToPatch);
|
||||
EXPECT_TRUE(kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/*
|
||||
* Copyright (C) 2022-2024 Intel Corporation
|
||||
* Copyright (C) 2022-2025 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
@@ -64,43 +64,49 @@ TEST(ImplicitArgsHelperTest, givenNoImplicitArgsWhenGettingSizeForImplicitArgsPr
|
||||
}
|
||||
|
||||
// When the payload mapping has no implicitArgsBuffer offset, the size reported for
// implicit-args patching is expected to be the cache-line-aligned per-thread local-ID
// data plus the V0 struct size.
//
// The un-versioned field accesses and the duplicate declarations of `implicitArgs` and
// `totalWorkgroupSize` are dropped here; only the versioned (`.v0`) form is kept.
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
    ImplicitArgs implicitArgs{};
    // The header selects the V0 layout explicitly.
    implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
    implicitArgs.v0.header.structVersion = 0;

    KernelDescriptor kernelDescriptor{};

    // A default-constructed descriptor leaves the implicit-args buffer offset undefined.
    EXPECT_TRUE(isUndefinedOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));

    implicitArgs.v0.simdWidth = 32;
    implicitArgs.v0.localSizeX = 2;
    implicitArgs.v0.localSizeY = 3;
    implicitArgs.v0.localSizeZ = 4;

    auto totalWorkgroupSize = implicitArgs.v0.localSizeX * implicitArgs.v0.localSizeY * implicitArgs.v0.localSizeZ;

    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
    EXPECT_EQ(localIdsSize + ImplicitArgsV0::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
}
|
||||
|
||||
// When the payload mapping carries a valid implicitArgsBuffer offset, the size reported
// for implicit-args patching is expected to be the header's structSize aligned up to a
// cache line.
//
// The un-versioned field accesses, the duplicate declaration of `implicitArgs` and the
// duplicated assertion are dropped here; only the versioned (`.v0`) form is kept.
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
    ImplicitArgs implicitArgs{};
    // The header selects the V0 layout explicitly.
    implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
    implicitArgs.v0.header.structVersion = 0;

    KernelDescriptor kernelDescriptor{};
    kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer = 0x10;
    EXPECT_TRUE(isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));

    implicitArgs.v0.simdWidth = 32;
    implicitArgs.v0.localSizeX = 2;
    implicitArgs.v0.localSizeY = 3;
    implicitArgs.v0.localSizeZ = 4;

    NEO::MockExecutionEnvironment mockExecutionEnvironment{};
    auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
    EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
}
|
||||
|
||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
||||
ImplicitArgs implicitArgs{ImplicitArgs::getSize()};
|
||||
ImplicitArgs implicitArgs{};
|
||||
implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
|
||||
implicitArgs.v0.header.structVersion = 0;
|
||||
|
||||
void *outImplicitArgs = nullptr;
|
||||
KernelDescriptor kernelDescriptor{};
|
||||
@@ -110,15 +116,15 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
||||
|
||||
EXPECT_TRUE(isUndefinedOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));
|
||||
|
||||
implicitArgs.simdWidth = 1;
|
||||
implicitArgs.localSizeX = 2;
|
||||
implicitArgs.localSizeY = 3;
|
||||
implicitArgs.localSizeZ = 4;
|
||||
implicitArgs.v0.simdWidth = 1;
|
||||
implicitArgs.v0.localSizeX = 2;
|
||||
implicitArgs.v0.localSizeY = 3;
|
||||
implicitArgs.v0.localSizeZ = 4;
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
|
||||
|
||||
auto totalWorkgroupSize = implicitArgs.localSizeX * implicitArgs.localSizeY * implicitArgs.localSizeZ;
|
||||
auto totalWorkgroupSize = implicitArgs.v0.localSizeX * implicitArgs.v0.localSizeY * implicitArgs.v0.localSizeZ;
|
||||
auto localIdsPatchingSize = totalWorkgroupSize * 3 * sizeof(uint16_t);
|
||||
auto localIdsOffset = alignUp(localIdsPatchingSize, MemoryConstants::cacheLineSize);
|
||||
|
||||
@@ -141,7 +147,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
||||
EXPECT_NE(pattern, memoryToPatch.get()[offset]) << offset;
|
||||
}
|
||||
|
||||
for (; offset < totalSizeForPatching - ImplicitArgs::getSize(); offset++) {
|
||||
for (; offset < totalSizeForPatching - ImplicitArgsV0::getSize(); offset++) {
|
||||
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
|
||||
}
|
||||
|
||||
@@ -151,21 +157,24 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
||||
}
|
||||
|
||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
||||
ImplicitArgs implicitArgs{ImplicitArgs::getSize()};
|
||||
ImplicitArgs implicitArgs{};
|
||||
implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
|
||||
implicitArgs.v0.header.structVersion = 0;
|
||||
|
||||
void *outImplicitArgs = nullptr;
|
||||
KernelDescriptor kernelDescriptor{};
|
||||
kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer = 0x10;
|
||||
EXPECT_TRUE(isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer));
|
||||
|
||||
implicitArgs.simdWidth = 32;
|
||||
implicitArgs.localSizeX = 2;
|
||||
implicitArgs.localSizeY = 3;
|
||||
implicitArgs.localSizeZ = 4;
|
||||
implicitArgs.v0.simdWidth = 32;
|
||||
implicitArgs.v0.localSizeX = 2;
|
||||
implicitArgs.v0.localSizeY = 3;
|
||||
implicitArgs.v0.localSizeZ = 4;
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(alignUp(ImplicitArgs::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
|
||||
EXPECT_EQ(alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
|
||||
|
||||
auto memoryToPatch = std::make_unique<uint8_t[]>(totalSizeForPatching);
|
||||
|
||||
@@ -182,7 +191,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
||||
|
||||
uint32_t offset = 0;
|
||||
|
||||
for (; offset < ImplicitArgs::getSize(); offset++) {
|
||||
for (; offset < ImplicitArgsV0::getSize(); offset++) {
|
||||
EXPECT_NE(pattern, memoryToPatch.get()[offset]);
|
||||
}
|
||||
|
||||
@@ -190,3 +199,132 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
||||
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
|
||||
}
|
||||
}
|
||||
|
||||
// With structVersion == 0, every ImplicitArgs setter is expected to write the matching
// field of the v0 view, and getSize() is expected to report the V0 struct size.
TEST(ImplicitArgsV0Test, givenImplicitArgsV0WhenSettingFieldsThenCorrectFieldsAreSet) {
    ImplicitArgs implicitArgs{};
    implicitArgs.v0.header.structSize = ImplicitArgsV0::getSize();
    implicitArgs.v0.header.structVersion = 0;

    EXPECT_EQ(ImplicitArgsV0::getSize(), implicitArgs.getSize());

    implicitArgs.setAssertBufferPtr(0x4567000);
    implicitArgs.setGlobalOffset(5, 6, 7);
    implicitArgs.setGlobalSize(1, 2, 3);
    implicitArgs.setGroupCount(10, 20, 30);
    implicitArgs.setLocalSize(8, 9, 11);
    implicitArgs.setLocalIdTablePtr(0x5699000);
    implicitArgs.setPrintfBuffer(0xff000);
    implicitArgs.setNumWorkDim(16);
    implicitArgs.setRtGlobalBufferPtr(0x1000123400);
    implicitArgs.setSimdWidth(32);

    EXPECT_EQ(0x4567000u, implicitArgs.v0.assertBufferPtr);

    EXPECT_EQ(5u, implicitArgs.v0.globalOffsetX);
    EXPECT_EQ(6u, implicitArgs.v0.globalOffsetY);
    EXPECT_EQ(7u, implicitArgs.v0.globalOffsetZ);

    EXPECT_EQ(1u, implicitArgs.v0.globalSizeX);
    EXPECT_EQ(2u, implicitArgs.v0.globalSizeY);
    EXPECT_EQ(3u, implicitArgs.v0.globalSizeZ);

    EXPECT_EQ(10u, implicitArgs.v0.groupCountX);
    EXPECT_EQ(20u, implicitArgs.v0.groupCountY);
    EXPECT_EQ(30u, implicitArgs.v0.groupCountZ);

    EXPECT_EQ(8u, implicitArgs.v0.localSizeX);
    EXPECT_EQ(9u, implicitArgs.v0.localSizeY);
    EXPECT_EQ(11u, implicitArgs.v0.localSizeZ);

    EXPECT_EQ(0x5699000u, implicitArgs.v0.localIdTablePtr);
    EXPECT_EQ(0xff000u, implicitArgs.v0.printfBufferPtr);
    EXPECT_EQ(16u, implicitArgs.v0.numWorkDim);
    EXPECT_EQ(0x1000123400u, implicitArgs.v0.rtGlobalBufferPtr);
    // setSimdWidth(32) is called above; without this check its effect would go unverified.
    EXPECT_EQ(32u, implicitArgs.v0.simdWidth);
}
|
||||
|
||||
// With structVersion == 1, the ImplicitArgs setters are expected to write the matching
// fields of the v1 view, and getSize() is expected to report the V1 struct size.
TEST(ImplicitArgsV1Test, givenImplicitArgsV1WhenSettingFieldsThenCorrectFieldsAreSet) {
    ImplicitArgs implicitArgs{};
    auto &v1 = implicitArgs.v1; // shorthand for the V1 view of the same object
    v1.header.structSize = ImplicitArgsV1::getSize();
    v1.header.structVersion = 1;

    EXPECT_EQ(ImplicitArgsV1::getSize(), implicitArgs.getSize());

    // Populate through the version-agnostic setters.
    implicitArgs.setAssertBufferPtr(0x4567000);
    implicitArgs.setGlobalOffset(5, 6, 7);
    implicitArgs.setGlobalSize(1, 2, 3);
    implicitArgs.setGroupCount(10, 20, 30);
    implicitArgs.setLocalSize(8, 9, 11);
    implicitArgs.setLocalIdTablePtr(0x5699000);
    implicitArgs.setPrintfBuffer(0xff000);
    implicitArgs.setNumWorkDim(16);
    implicitArgs.setRtGlobalBufferPtr(0x1000123400);
    implicitArgs.setSimdWidth(32);

    // Read the values back through the V1 layout.
    EXPECT_EQ(0x4567000u, v1.assertBufferPtr);

    EXPECT_EQ(5u, v1.globalOffsetX);
    EXPECT_EQ(6u, v1.globalOffsetY);
    EXPECT_EQ(7u, v1.globalOffsetZ);

    EXPECT_EQ(1u, v1.globalSizeX);
    EXPECT_EQ(2u, v1.globalSizeY);
    EXPECT_EQ(3u, v1.globalSizeZ);

    EXPECT_EQ(10u, v1.groupCountX);
    EXPECT_EQ(20u, v1.groupCountY);
    EXPECT_EQ(30u, v1.groupCountZ);

    EXPECT_EQ(8u, v1.localSizeX);
    EXPECT_EQ(9u, v1.localSizeY);
    EXPECT_EQ(11u, v1.localSizeZ);

    EXPECT_EQ(0x5699000u, v1.localIdTablePtr);
    EXPECT_EQ(0xff000u, v1.printfBufferPtr);
    EXPECT_EQ(16u, v1.numWorkDim);
    EXPECT_EQ(0x1000123400u, v1.rtGlobalBufferPtr);
}
|
||||
|
||||
// With a structVersion of 2, getSize() is expected to report 0 and the setters are
// expected to leave the zeroed storage untouched when read through the v1 view.
TEST(ImplicitArgsV1Test, givenImplicitArgsWithUnknownVersionWhenSettingFieldsThenFieldsAreNotPopulated) {
    ImplicitArgs implicitArgs{};

    // Zero the whole object so any write by a setter would show up below.
    memset(&implicitArgs, 0, sizeof(implicitArgs));

    auto &v1 = implicitArgs.v1; // shorthand for the V1 view of the same object
    v1.header.structSize = ImplicitArgsV1::getSize();
    v1.header.structVersion = 2; // unknown version

    EXPECT_EQ(0u, implicitArgs.getSize());

    implicitArgs.setAssertBufferPtr(0x4567000);
    implicitArgs.setGlobalOffset(5, 6, 7);
    implicitArgs.setGlobalSize(1, 2, 3);
    implicitArgs.setGroupCount(10, 20, 30);
    implicitArgs.setLocalSize(8, 9, 11);
    implicitArgs.setLocalIdTablePtr(0x5699000);
    implicitArgs.setPrintfBuffer(0xff000);
    implicitArgs.setNumWorkDim(16);
    implicitArgs.setRtGlobalBufferPtr(0x1000123400);
    implicitArgs.setSimdWidth(32);

    // Every field checked here must still hold the zero written by memset.
    EXPECT_EQ(0u, v1.assertBufferPtr);

    EXPECT_EQ(0u, v1.globalOffsetX);
    EXPECT_EQ(0u, v1.globalOffsetY);
    EXPECT_EQ(0u, v1.globalOffsetZ);

    EXPECT_EQ(0u, v1.globalSizeX);
    EXPECT_EQ(0u, v1.globalSizeY);
    EXPECT_EQ(0u, v1.globalSizeZ);

    EXPECT_EQ(0u, v1.groupCountX);
    EXPECT_EQ(0u, v1.groupCountY);
    EXPECT_EQ(0u, v1.groupCountZ);

    EXPECT_EQ(0u, v1.localSizeX);
    EXPECT_EQ(0u, v1.localSizeY);
    EXPECT_EQ(0u, v1.localSizeZ);

    EXPECT_EQ(0u, v1.localIdTablePtr);
    EXPECT_EQ(0u, v1.printfBufferPtr);
    EXPECT_EQ(0u, v1.numWorkDim);
    EXPECT_EQ(0u, v1.rtGlobalBufferPtr);
}
|
||||
|
||||
Reference in New Issue
Block a user