fix: correctly patch implicit arg buffer in indirect data
- use correct size alignment of implicit arg buffer; cross-thread data should start after the buffer without extra padding
Related-To: NEO-14449
Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
parent
22ddaea09f
commit
e345d55fe5
|
@ -924,11 +924,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
|||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
|
@ -944,7 +944,7 @@ HWCMDTEST_F(IGFX_GEN12LP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, g
|
|||
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), 64));
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), ImplicitArgsV0::getAlignedSize());
|
||||
|
||||
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
|
||||
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
|
||||
|
@ -975,7 +975,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
|||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
|
||||
size_t sizeForLocalIds = NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, !kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
|
@ -1004,7 +1004,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CmdlistAppendLaunchKernelWithImplicitArgsTests, giv
|
|||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
|
||||
|
||||
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
|
||||
|
||||
|
|
|
@ -942,7 +942,7 @@ struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKe
|
|||
if (FamilyType::supportsCmdSet(IGFX_XE_HP_CORE)) {
|
||||
const auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironment();
|
||||
auto implicitArgsProgrammingSize = ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernel.pImplicitArgs.get(), kernel.getKernelDescriptor(), !kernel.kernelRequiresGenerationOfLocalIdsByRuntime, rootDeviceEnvironment);
|
||||
return implicitArgsProgrammingSize - kernel.pImplicitArgs->v0.header.structSize;
|
||||
return implicitArgsProgrammingSize - kernel.pImplicitArgs->getAlignedSize();
|
||||
} else {
|
||||
return 0u;
|
||||
}
|
||||
|
|
|
@ -279,7 +279,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubInlineDataTest, givenCrossThreadSize
|
|||
|
||||
auto pImplicitArgs = kernel->getImplicitArgs();
|
||||
if (pImplicitArgs) {
|
||||
payloadData = ptrOffset(payloadData, alignUp(pImplicitArgs->getSize(), MemoryConstants::cacheLineSize));
|
||||
payloadData = ptrOffset(payloadData, pImplicitArgs->getAlignedSize());
|
||||
}
|
||||
EXPECT_EQ(0, memcmp(payloadData, crossThreadData, crossThreadDataSize));
|
||||
},
|
||||
|
|
|
@ -1254,7 +1254,7 @@ struct HardwareCommandsImplicitArgsTests : Test<ClDeviceFixture> {
|
|||
HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenXeHpAndLaterPlatformWhenSendingIndirectStateForKernelWithImplicitArgsThenImplicitArgsAreSentToIndirectHeapWithLocalIds) {
|
||||
dispatchKernelWithImplicitArgs<FamilyType>();
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
|
||||
auto implicitArgsInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), localIdsProgrammingSize);
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
}
|
||||
|
@ -1265,7 +1265,7 @@ HWCMDTEST_F(IGFX_GEN12LP_CORE, HardwareCommandsImplicitArgsTests, givenPreXeHpPl
|
|||
auto implicitArgsInIndirectData = indirectHeapAllocation->getUnderlyingBuffer();
|
||||
EXPECT_EQ(0, memcmp(implicitArgsInIndirectData, &expectedImplicitArgs, ImplicitArgsV0::getSize()));
|
||||
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize));
|
||||
auto crossThreadDataInIndirectData = ptrOffset(indirectHeapAllocation->getUnderlyingBuffer(), ImplicitArgsV0::getAlignedSize());
|
||||
|
||||
auto programmedImplicitArgsGpuVA = reinterpret_cast<uint64_t *>(crossThreadDataInIndirectData)[0];
|
||||
EXPECT_EQ(indirectHeapAllocation->getGpuAddress(), programmedImplicitArgsGpuVA);
|
||||
|
@ -1290,11 +1290,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
|||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, workgroupDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
|
@ -1325,11 +1325,11 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
|||
|
||||
auto grfSize = ImplicitArgsHelper::getGrfSize(expectedImplicitArgs.simdWidth);
|
||||
auto numGrf = GrfConfig::defaultGrfNumber;
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize);
|
||||
auto expectedLocalIds = alignedMalloc(implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize(), MemoryConstants::cacheLineSize);
|
||||
const auto &rootDeviceEnvironment = pDevice->getRootDeviceEnvironment();
|
||||
generateLocalIDs(expectedLocalIds, expectedImplicitArgs.simdWidth, localSize, expectedDimOrder, false, grfSize, numGrf, rootDeviceEnvironment);
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
|
||||
size_t sizeForLocalIds = PerThreadDataHelper::getPerThreadDataSizeTotal(expectedImplicitArgs.simdWidth, grfSize, numGrf, 3u, totalLocalSize, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeForLocalIds));
|
||||
|
@ -1358,7 +1358,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, HardwareCommandsImplicitArgsTests, givenKernelWithI
|
|||
|
||||
EXPECT_EQ(0, memcmp(expectedLocalIds, indirectHeapAllocation->getUnderlyingBuffer(), sizeof(expectedLocalIds)));
|
||||
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getSize();
|
||||
auto localIdsProgrammingSize = implicitArgsProgrammingSize - ImplicitArgsV0::getAlignedSize();
|
||||
|
||||
EXPECT_EQ(alignUp(sizeof(expectedLocalIds), MemoryConstants::cacheLineSize), localIdsProgrammingSize);
|
||||
|
||||
|
|
|
@ -658,9 +658,9 @@ void Linker::resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, De
|
|||
if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
|
||||
auto implicitArgsSize = 0;
|
||||
if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 0) {
|
||||
implicitArgsSize = ImplicitArgsV0::getSize();
|
||||
implicitArgsSize = ImplicitArgsV0::getAlignedSize();
|
||||
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
|
||||
implicitArgsSize = ImplicitArgsV1::getSize();
|
||||
implicitArgsSize = ImplicitArgsV1::getAlignedSize();
|
||||
} else {
|
||||
UNRECOVERABLE_IF(true);
|
||||
}
|
||||
|
|
|
@ -47,6 +47,7 @@ struct alignas(32) ImplicitArgsV0 {
|
|||
uint8_t reserved[16];
|
||||
|
||||
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgsV0, reserved))); }
|
||||
static constexpr uint8_t getAlignedSize() { return sizeof(ImplicitArgsV0); }
|
||||
};
|
||||
|
||||
static_assert(std::alignment_of_v<ImplicitArgsV0> == 32, "Implicit args size need to be aligned to 32");
|
||||
|
@ -78,6 +79,7 @@ struct alignas(32) ImplicitArgsV1 {
|
|||
uint8_t reserved[44];
|
||||
|
||||
static constexpr uint8_t getSize() { return static_cast<uint8_t>(offsetof(ImplicitArgsV1, reserved)); }
|
||||
static constexpr uint8_t getAlignedSize() { return sizeof(ImplicitArgsV1); }
|
||||
};
|
||||
|
||||
static_assert(std::alignment_of_v<ImplicitArgsV1> == 32, "Implicit args size need to be aligned to 32");
|
||||
|
@ -113,6 +115,18 @@ struct alignas(32) ImplicitArgs {
|
|||
return 0;
|
||||
}
|
||||
|
||||
uint8_t getAlignedSize() const {
|
||||
if (v0.header.structVersion == 0) {
|
||||
return ImplicitArgsV0::getAlignedSize();
|
||||
|
||||
} else if (v1.header.structVersion == 1) {
|
||||
return ImplicitArgsV1::getAlignedSize();
|
||||
}
|
||||
|
||||
DEBUG_BREAK_IF(true);
|
||||
return 0;
|
||||
}
|
||||
|
||||
void setNumWorkDim(uint32_t numWorkDim) {
|
||||
if (v0.header.structVersion == 0) {
|
||||
v0.numWorkDim = numWorkDim;
|
||||
|
|
|
@ -48,14 +48,7 @@ uint32_t getSizeForImplicitArgsStruct(const ImplicitArgs *pImplicitArgs, const K
|
|||
if (!pImplicitArgs) {
|
||||
return 0;
|
||||
}
|
||||
auto implicitArgsSize = pImplicitArgs->getSize();
|
||||
|
||||
auto patchImplicitArgsBufferInCrossThread = NEO::isValidOffset<>(kernelDescriptor.payloadMappings.implicitArgs.implicitArgsBuffer);
|
||||
if (patchImplicitArgsBufferInCrossThread) {
|
||||
return alignUp(implicitArgsSize, MemoryConstants::cacheLineSize);
|
||||
} else {
|
||||
return implicitArgsSize;
|
||||
}
|
||||
return pImplicitArgs->getAlignedSize();
|
||||
}
|
||||
|
||||
uint32_t getSizeForImplicitArgsPatching(const ImplicitArgs *pImplicitArgs, const KernelDescriptor &kernelDescriptor, bool isHwLocalIdGeneration, const RootDeviceEnvironment &rootDeviceEnvironment) {
|
||||
|
@ -112,7 +105,7 @@ void *patchImplicitArgs(void *ptrToPatch, const ImplicitArgs &implicitArgs, cons
|
|||
dimensionOrder,
|
||||
false, grfSize, grfCount, rootDeviceEnvironment);
|
||||
|
||||
auto sizeForLocalIdsProgramming = totalSizeToProgram - implicitArgs.getSize();
|
||||
auto sizeForLocalIdsProgramming = totalSizeToProgram - implicitArgs.getAlignedSize();
|
||||
ptrToPatch = ptrOffset(ptrToPatch, sizeForLocalIdsProgramming);
|
||||
}
|
||||
|
||||
|
|
|
@ -11,9 +11,9 @@
|
|||
namespace ImplicitArgsTestHelper {
|
||||
constexpr uint32_t getImplicitArgsSize(uint32_t version) {
|
||||
if (version == 0) {
|
||||
return NEO::ImplicitArgsV0::getSize();
|
||||
return NEO::ImplicitArgsV0::getAlignedSize();
|
||||
} else if (version == 1) {
|
||||
return NEO::ImplicitArgsV1::getSize();
|
||||
return NEO::ImplicitArgsV1::getAlignedSize();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -2146,7 +2146,7 @@ HWTEST_F(LinkerTests, givenImplicitArgRelocationAndImplicitArgsV1WhenLinkingThen
|
|||
|
||||
auto addressToPatch = reinterpret_cast<const uint32_t *>(instructionSegment.data() + reloc.r_offset);
|
||||
|
||||
EXPECT_EQ(ImplicitArgsV1::getSize(), *addressToPatch);
|
||||
EXPECT_EQ(ImplicitArgsV1::getAlignedSize(), *addressToPatch);
|
||||
EXPECT_EQ(initData, *(addressToPatch - 1));
|
||||
EXPECT_EQ(initData, *(addressToPatch + 1));
|
||||
|
||||
|
|
|
@ -82,7 +82,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
|||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
auto localIdsSize = alignUp(PerThreadDataHelper::getPerThreadDataSizeTotal(implicitArgs.v0.simdWidth, 32u /* grfSize */, GrfConfig::defaultGrfNumber /* numGrf */, 3u /* num channels */, totalWorkgroupSize, false, rootDeviceEnvironment), MemoryConstants::cacheLineSize);
|
||||
EXPECT_EQ(localIdsSize + ImplicitArgsV0::getSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
|
||||
EXPECT_EQ(localIdsSize + ImplicitArgsV0::getAlignedSize(), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
|
||||
}
|
||||
|
||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenGettingSizeForImplicitArgsProgrammingThenCorrectSizeIsReturned) {
|
||||
|
@ -100,7 +100,8 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
|||
implicitArgs.v0.localSizeZ = 4;
|
||||
NEO::MockExecutionEnvironment mockExecutionEnvironment{};
|
||||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, MemoryConstants::cacheLineSize), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
|
||||
EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, 32), implicitArgs.getAlignedSize());
|
||||
EXPECT_EQ(alignUp(implicitArgs.v0.header.structSize, 32), ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment));
|
||||
}
|
||||
|
||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
||||
|
@ -147,13 +148,16 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithoutImplicitArgsBufferOffsetInP
|
|||
EXPECT_NE(pattern, memoryToPatch.get()[offset]) << offset;
|
||||
}
|
||||
|
||||
for (; offset < totalSizeForPatching - ImplicitArgsV0::getSize(); offset++) {
|
||||
for (; offset < totalSizeForPatching - ImplicitArgsV0::getAlignedSize(); offset++) {
|
||||
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
|
||||
}
|
||||
|
||||
for (; offset < totalSizeForPatching; offset++) {
|
||||
for (; offset < totalSizeForPatching - (ImplicitArgsV0::getAlignedSize() - ImplicitArgsV0::getSize()); offset++) {
|
||||
EXPECT_NE(pattern, memoryToPatch.get()[offset]);
|
||||
}
|
||||
for (; offset < totalSizeForPatching; offset++) {
|
||||
EXPECT_EQ(pattern, memoryToPatch.get()[offset]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayloadMappingWhenPatchingImplicitArgsThenOnlyProperRegionIsPatched) {
|
||||
|
@ -174,7 +178,7 @@ TEST(ImplicitArgsHelperTest, givenImplicitArgsWithImplicitArgsBufferOffsetInPayl
|
|||
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
|
||||
auto totalSizeForPatching = ImplicitArgsHelper::getSizeForImplicitArgsPatching(&implicitArgs, kernelDescriptor, false, rootDeviceEnvironment);
|
||||
|
||||
EXPECT_EQ(alignUp(ImplicitArgsV0::getSize(), MemoryConstants::cacheLineSize), totalSizeForPatching);
|
||||
EXPECT_EQ(ImplicitArgsV0::getAlignedSize(), totalSizeForPatching);
|
||||
|
||||
auto memoryToPatch = std::make_unique<uint8_t[]>(totalSizeForPatching);
|
||||
|
||||
|
|
Loading…
Reference in New Issue