Revert "fix: correctly patch implicit arg buffer in indirect data"

This reverts commit e345d55fe5.

Signed-off-by: Compute-Runtime-Validation <compute-runtime-validation@intel.com>
This commit is contained in:
Compute-Runtime-Validation 2025-05-06 03:46:46 +02:00 committed by Compute-Runtime-Automation
parent 3fa9229483
commit 26d3c7527c
10 changed files with 33 additions and 44 deletions

View File

@ -924,11 +924,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(), MemoryConstants::cacheLineSize);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
@ -944,7 +944,7 @@ HWCMDTEST_F(IGFX_GEN12LP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, g
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), ImplicitArgsV0::getAlignedSize());
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), 64));
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
@ -975,7 +975,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
@ -1004,7 +1004,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);

View File

@ -942,7 +942,7 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
return implicitArgsProgrammingSize - kernel.pImplicitArgs->getAlignedSize();
return implicitArgsProgrammingSize - kernel.pImplicitArgs->v0.header.structSize;
} else {
return 0u;
}

View File

@ -279,7 +279,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubInlineDataTest, givenCrossThreadSize
auto pImplicitArgs = kernel->getImplicitArgs();
if (pImplicitArgs) {
payloadData = ptrOffset(payloadData, pImplicitArgs->getAlignedSize());
payloadData = ptrOffset(payloadData, alignUp(pImplicitArgs->getSize(), MemoryConstants::cacheLineSize));
}
EXPECT_EQ(0, memcmp(payloadData, crossThreadData, crossThreadDataSize));
},

View File

@ -1254,7 +1254,7 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenXeHpAndLaterPlatformWhenSendingIndirectStateForKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithLocalIds) {
dispatchKernelWithImplicitArgs<FamilyType>();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
}
@ -1265,7 +1265,7 @@ HWCMDTEST_F(IGFX_GEN12LP_CORE, HardwareCommandsImplicitArgsTests, givenPreXeHpPl
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), ImplicitArgsV0::getAlignedSize());
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize));
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
@ -1290,11 +1290,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(), MemoryConstants::cacheLineSize);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
@ -1325,11 +1325,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
auto numGrf = GrfConfig::defaultGrfNumber;
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(), MemoryConstants::cacheLineSize);
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
@ -1358,7 +1358,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);

View File

@ -658,9 +658,9 @@ void Linker::resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, De
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
auto implicitArgsSize = 0;
if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 0) {
implicitArgsSize = ImplicitArgsV0::getAlignedSize();
implicitArgsSize = ImplicitArgsV0::getSize();
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
implicitArgsSize = ImplicitArgsV1::getAlignedSize();
implicitArgsSize = ImplicitArgsV1::getSize();
} else {
UNRECOVERABLE_IF(true);
}

View File

@ -47,7 +47,6 @@ struct alignas(32) ImplicitArgsV0 {
uint8_t reserved[16];
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgsV0, reserved))); }
static constexpr uint8_t getAlignedSize() { return sizeof(ImplicitArgsV0); }
};
static_assert(std::alignment_of_v<ImplicitArgsV0> == 32, "Implicit args size need to be aligned to 32");
@ -79,7 +78,6 @@ struct alignas(32) ImplicitArgsV1 {
uint8_t reserved[44];
static constexpr uint8_t getSize() { return static_cast<uint8_t>(offsetof(ImplicitArgsV1, reserved)); }
static constexpr uint8_t getAlignedSize() { return sizeof(ImplicitArgsV1); }
};
static_assert(std::alignment_of_v<ImplicitArgsV1> == 32, "Implicit args size need to be aligned to 32");
@ -115,18 +113,6 @@ struct alignas(32) ImplicitArgs {
return 0;
}
uint8_t getAlignedSize() const {
if (v0.header.structVersion == 0) {
return ImplicitArgsV0::getAlignedSize();
} else if (v1.header.structVersion == 1) {
return ImplicitArgsV1::getAlignedSize();
}
DEBUG_BREAK_IF(true);
return 0;
}
void setNumWorkDim(uint32_t numWorkDim) {
if (v0.header.structVersion == 0) {
v0.numWorkDim = numWorkDim;

View File

@ -48,7 +48,14 @@ uint32_t getSizeForImplicitArgsStruct(const ImplicitArgs *pImplicitArgs, const K
if (!pImplicitArgs) {
return 0;
}
return pImplicitArgs->getAlignedSize();
auto implicitArgsSize = pImplicitArgs->getSize();
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
if (patchImplicitArgsBufferInCrossThread) {
return alignUp(implicitArgsSize, MemoryConstants::cacheLineSize);
} else {
return implicitArgsSize;
}
}
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) {
@ -105,7 +112,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
dimensionOrder,
false, grfSize, grfCount, rootDeviceEnvironment);
auto sizeForLocalIdsProgramming = totalSizeToProgram - implicitArgs.getAlignedSize();
auto sizeForLocalIdsProgramming = totalSizeToProgram - implicitArgs.getSize();
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
}

View File

@ -11,9 +11,9 @@
namespace ImplicitArgsTestHelper {
constexpr uint32_t getImplicitArgsSize(uint32_t version) {
if (version == 0) {
return NEO::ImplicitArgsV0::getAlignedSize();
return NEO::ImplicitArgsV0::getSize();
} else if (version == 1) {
return NEO::ImplicitArgsV1::getAlignedSize();
return NEO::ImplicitArgsV1::getSize();
}
return 0;
}

View File

@ -2146,7 +2146,7 @@ HWTEST_F(LinkerTests, givenImplicitArgRelocationAndImplicitArgsV1WhenLinkingThen
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
EXPECT_EQ(ImplicitArgsV1::getAlignedSize(), *addressToPatch);
EXPECT_EQ(ImplicitArgsV1::getSize(), *addressToPatch);
EXPECT_EQ(initData, *(addressToPatch - 1));
EXPECT_EQ(initData, *(addressToPatch + 1));

View File

@ -82,7 +82,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
EXPECT_EQ(localIdsSize + ImplicitArgsV0::getAlignedSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
EXPECT_EQ(localIdsSize + ImplicitArgsV0::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
}
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
@ -100,8 +100,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
implicitArgs.v0.localSizeZ = 4;
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, 32), implicitArgs.getAlignedSize());
EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, 32), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
}
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
@ -148,15 +147,12 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
EXPECT_NE(pattern, memoryToPatch.get()[offset]) << offset;
}
for (; offset < totalSizeForPatching - ImplicitArgsV0::getAlignedSize(); offset++) {
for (; offset < totalSizeForPatching - ImplicitArgsV0::getSize(); offset++) {
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
}
for (; offset < totalSizeForPatching - (ImplicitArgsV0::getAlignedSize() - ImplicitArgsV0::getSize()); offset++) {
EXPECT_NE(pattern, memoryToPatch.get()[offset]);
}
for (; offset < totalSizeForPatching; offset++) {
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
EXPECT_NE(pattern, memoryToPatch.get()[offset]);
}
}
@ -178,7 +174,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
EXPECT_EQ(ImplicitArgsV0::getAlignedSize(), totalSizeForPatching);
EXPECT_EQ(alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
auto memoryToPatch = std::make_unique<uint8_t[]>(totalSizeForPatching);