fix: update implicitArgs versions

- fix layout of implicit args
- add enqueued local size and sync buffer ptr

Related-To: NEO-15160

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe
2025-07-22 12:33:29 +00:00
committed by Compute-Runtime-Automation
parent 09ee9bf093
commit b17ed79618
13 changed files with 234 additions and 15 deletions

View File

@@ -1328,6 +1328,7 @@ void KernelImp::patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint
NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.enqueuedLocalWorkSize, workgroupSize);
if (state.pImplicitArgs) {
state.pImplicitArgs->setLocalSize(x, y, z);
state.pImplicitArgs->setEnqueuedLocalSize(x, y, z);
}
}

View File

@@ -141,6 +141,8 @@ TEST(KernelAssert, GivenKernelWithAssertAndImplicitArgsWhenInitializingKernelThe
EXPECT_EQ(assertBufferAddress, implicitArgs->v0.assertBufferPtr);
} else if (implicitArgs->v1.header.structVersion == 1) {
EXPECT_EQ(assertBufferAddress, implicitArgs->v1.assertBufferPtr);
} else if (implicitArgs->v2.header.structVersion == 2) {
EXPECT_EQ(assertBufferAddress, implicitArgs->v2.assertBufferPtr);
}
}

View File

@@ -81,6 +81,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
localWorkSize[0] = pImplicitArgs->v1.localSizeX;
localWorkSize[1] = pImplicitArgs->v1.localSizeY;
localWorkSize[2] = pImplicitArgs->v1.localSizeZ;
} else if (pImplicitArgs->v1.header.structVersion == 2) {
localWorkSize[0] = pImplicitArgs->v2.localSizeX;
localWorkSize[1] = pImplicitArgs->v2.localSizeY;
localWorkSize[2] = pImplicitArgs->v2.localSizeZ;
} else {
UNRECOVERABLE_IF(true);
}

View File

@@ -2139,6 +2139,9 @@ void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t lo
patchVecNonPointer(getCrossThreadDataRef(),
getDescriptor().payloadMappings.dispatchTraits.enqueuedLocalWorkSize,
{localWorkSizeX, localWorkSizeY, localWorkSizeZ});
if (pImplicitArgs) {
pImplicitArgs->setEnqueuedLocalSize(localWorkSizeX, localWorkSizeY, localWorkSizeZ);
}
}
void Kernel::setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {

View File

@@ -3824,6 +3824,9 @@ TEST_F(KernelImplicitArgsTest, WhenKernelRequiresImplicitArgsThenImplicitArgsStr
} else if (pClDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
expectedImplicitArgs.v1.header.structVersion = 1;
expectedImplicitArgs.v1.header.structSize = ImplicitArgsV1::getSize();
} else if (pClDevice->getGfxCoreHelper().getImplicitArgsVersion() == 2) {
expectedImplicitArgs.v2.header.structVersion = 2;
expectedImplicitArgs.v2.header.structSize = ImplicitArgsV2::getSize();
}
expectedImplicitArgs.setSimdWidth(32);
@@ -3852,6 +3855,9 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenSettingKernelParam
} else if (pClDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
expectedImplicitArgs.v1.header.structVersion = 1;
expectedImplicitArgs.v1.header.structSize = ImplicitArgsV1::getSize();
} else if (pClDevice->getGfxCoreHelper().getImplicitArgsVersion() == 2) {
expectedImplicitArgs.v2.header.structVersion = 2;
expectedImplicitArgs.v2.header.structSize = ImplicitArgsV2::getSize();
}
expectedImplicitArgs.setNumWorkDim(3);
@@ -3936,6 +3942,9 @@ TEST_F(KernelImplicitArgsTest, givenKernelWithImplicitArgsWhenCloneKernelThenImp
} else if (pClDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
expectedImplicitArgs.v1.header.structVersion = 1;
expectedImplicitArgs.v1.header.structSize = ImplicitArgsV1::getSize();
} else if (pClDevice->getGfxCoreHelper().getImplicitArgsVersion() == 2) {
expectedImplicitArgs.v2.header.structVersion = 2;
expectedImplicitArgs.v2.header.structSize = ImplicitArgsV2::getSize();
}
expectedImplicitArgs.setNumWorkDim(3);

View File

@@ -629,6 +629,15 @@ void EncodeIndirectParams<Family>::encode(CommandContainer &container, uint64_t
setGroupCountIndirect(container, groupCountOffsetV1, implicitArgsGpuPtr, nullptr);
setGlobalWorkSizeIndirect(container, globalSizeOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
setWorkDimIndirect(container, numWorkDimOffsetV1, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
} else if (version == 2) {
constexpr CrossThreadDataOffset groupCountOffsetV2[] = {offsetof(ImplicitArgsV2, groupCountX), offsetof(ImplicitArgsV2, groupCountY), offsetof(ImplicitArgsV2, groupCountZ)};
constexpr CrossThreadDataOffset globalSizeOffsetV2[] = {offsetof(ImplicitArgsV2, globalSizeX), offsetof(ImplicitArgsV2, globalSizeY), offsetof(ImplicitArgsV2, globalSizeZ)};
constexpr auto numWorkDimOffsetV2 = offsetof(ImplicitArgsV2, numWorkDim);
setGroupCountIndirect(container, groupCountOffsetV2, implicitArgsGpuPtr, nullptr);
setGlobalWorkSizeIndirect(container, globalSizeOffsetV2, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
setWorkDimIndirect(container, numWorkDimOffsetV2, implicitArgsGpuPtr, dispatchInterface->getGroupSize(), nullptr);
} else {
UNRECOVERABLE_IF(true);
}
}
if (outArgs && !outArgs->commandsToPatch.empty()) {

View File

@@ -665,6 +665,8 @@ void Linker::resolveImplicitArgs(const KernelDescriptorsT &kernelDescriptors, De
implicitArgsSize = ImplicitArgsV0::getAlignedSize();
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 1) {
implicitArgsSize = ImplicitArgsV1::getAlignedSize();
} else if (pDevice->getGfxCoreHelper().getImplicitArgsVersion() == 2) {
implicitArgsSize = ImplicitArgsV2::getAlignedSize();
} else {
UNRECOVERABLE_IF(true);
}

View File

@@ -9,6 +9,7 @@ set(NEO_CORE_KERNEL
${CMAKE_CURRENT_SOURCE_DIR}/debug_data.h
${CMAKE_CURRENT_SOURCE_DIR}/dispatch_kernel_encoder_interface.h
${CMAKE_CURRENT_SOURCE_DIR}/grf_config.h
${CMAKE_CURRENT_SOURCE_DIR}/implicit_args.cpp
${CMAKE_CURRENT_SOURCE_DIR}/definitions${BRANCH_DIR_SUFFIX}implicit_args.h
${CMAKE_CURRENT_SOURCE_DIR}/implicit_args_helper.cpp
${CMAKE_CURRENT_SOURCE_DIR}/implicit_args_helper.h

View File

@@ -22,8 +22,6 @@ struct alignas(1) ImplicitArgsHeader {
uint8_t structVersion;
};
static_assert(sizeof(ImplicitArgsHeader) == (2 * sizeof(uint8_t)));
struct alignas(32) ImplicitArgsV0 {
ImplicitArgsHeader header;
uint8_t numWorkDim;
@@ -53,10 +51,7 @@ struct alignas(32) ImplicitArgsV0 {
}
};
static_assert(std::alignment_of_v<ImplicitArgsV0> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgsV0) == (32 * sizeof(uint32_t)));
static_assert(ImplicitArgsV0::getSize() == (28 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV0>);
struct alignas(32) ImplicitArgsV1 {
ImplicitArgsHeader header;
@@ -79,21 +74,56 @@ struct alignas(32) ImplicitArgsV1 {
uint32_t padding1;
uint64_t rtGlobalBufferPtr;
uint64_t assertBufferPtr;
uint8_t reserved[44];
uint64_t scratchPtr;
uint64_t syncBufferPtr;
uint32_t enqueuedLocalSizeX;
uint32_t enqueuedLocalSizeY;
uint32_t enqueuedLocalSizeZ;
static constexpr uint8_t getSize() { return static_cast<uint8_t>(offsetof(ImplicitArgsV1, reserved)); }
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgsV1, enqueuedLocalSizeZ) + sizeof(ImplicitArgsV1::enqueuedLocalSizeZ))); }
static constexpr uint8_t getAlignedSize() { return sizeof(ImplicitArgsV1); }
};
static_assert(std::alignment_of_v<ImplicitArgsV1> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgsV1) == (40 * sizeof(uint32_t)));
static_assert(ImplicitArgsV1::getSize() == (28 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV1>);
static_assert(ImplicitArgsV1::getSize() == (35 * sizeof(uint32_t)));
// Version 2 layout of the implicit arguments payload.
// Field order and field widths define a fixed binary layout: static_asserts in
// this code base pin getSize() to 39 dwords and sizeof(ImplicitArgsV2) to
// 40 dwords, so fields must not be reordered, resized, or removed.
struct alignas(32) ImplicitArgsV2 {
// Two-byte header carrying structSize and structVersion.
ImplicitArgsHeader header;
uint8_t numWorkDim;
// Explicit filler byte: header (2 bytes) + numWorkDim + padding0 place localSizeX at byte offset 4.
uint8_t padding0;
uint32_t localSizeX;
uint32_t localSizeY;
uint32_t localSizeZ;
uint64_t globalSizeX;
uint64_t globalSizeY;
uint64_t globalSizeZ;
uint64_t printfBufferPtr;
uint64_t globalOffsetX;
uint64_t globalOffsetY;
uint64_t globalOffsetZ;
uint64_t localIdTablePtr;
uint32_t groupCountX;
uint32_t groupCountY;
uint32_t groupCountZ;
// Explicit filler dword: brings the offset to 96 so the 64-bit members that follow start 8-byte aligned.
uint32_t padding1;
uint64_t rtGlobalBufferPtr;
uint64_t assertBufferPtr;
uint64_t syncBufferPtr;
uint32_t enqueuedLocalSizeX;
uint32_t enqueuedLocalSizeY;
uint32_t enqueuedLocalSizeZ;
// Trailing reserved bytes; they are counted by getSize().
uint8_t reserved[24];
// Size in bytes from the start of the struct through the end of `reserved`
// (excludes the tail padding added by alignas(32)). The value must fit in uint8_t.
static constexpr uint8_t getSize() { return static_cast<uint8_t>((offsetof(ImplicitArgsV2, reserved) + sizeof(ImplicitArgsV2::reserved))); }
// Full sizeof the struct, including tail padding up to the 32-byte alignment.
static constexpr uint8_t getAlignedSize() { return sizeof(ImplicitArgsV2); }
};
static_assert(ImplicitArgsV2::getSize() == (39 * sizeof(uint32_t)));
struct alignas(32) ImplicitArgs {
union {
ImplicitArgsV0 v0;
ImplicitArgsV1 v1;
ImplicitArgsV2 v2;
};
void initializeHeader(uint32_t version) {
@@ -103,6 +133,9 @@ struct alignas(32) ImplicitArgs {
} else if (version == 1) {
v1.header.structSize = NEO::ImplicitArgsV1::getSize();
v1.header.structVersion = 1;
} else if (version == 2) {
v2.header.structSize = NEO::ImplicitArgsV2::getSize();
v2.header.structVersion = 2;
}
}
@@ -112,6 +145,8 @@ struct alignas(32) ImplicitArgs {
} else if (v1.header.structVersion == 1) {
return v1.header.structSize;
} else if (v2.header.structVersion == 2) {
return v2.header.structSize;
}
DEBUG_BREAK_IF(true);
@@ -124,6 +159,8 @@ struct alignas(32) ImplicitArgs {
} else if (v1.header.structVersion == 1) {
return ImplicitArgsV1::getAlignedSize();
} else if (v2.header.structVersion == 2) {
return ImplicitArgsV2::getAlignedSize();
}
DEBUG_BREAK_IF(true);
@@ -136,6 +173,8 @@ struct alignas(32) ImplicitArgs {
} else if (v1.header.structVersion == 1) {
v1.numWorkDim = numWorkDim;
} else if (v2.header.structVersion == 2) {
v2.numWorkDim = numWorkDim;
}
}
@@ -162,6 +201,10 @@ struct alignas(32) ImplicitArgs {
v1.localSizeX = x;
v1.localSizeY = y;
v1.localSizeZ = z;
} else if (v2.header.structVersion == 2) {
v2.localSizeX = x;
v2.localSizeY = y;
v2.localSizeZ = z;
}
}
@@ -175,6 +218,10 @@ struct alignas(32) ImplicitArgs {
x = v1.localSizeX;
y = v1.localSizeY;
z = v1.localSizeZ;
} else if (v2.header.structVersion == 2) {
x = v2.localSizeX;
y = v2.localSizeY;
z = v2.localSizeZ;
}
}
@@ -188,6 +235,10 @@ struct alignas(32) ImplicitArgs {
v1.globalSizeX = x;
v1.globalSizeY = y;
v1.globalSizeZ = z;
} else if (v2.header.structVersion == 2) {
v2.globalSizeX = x;
v2.globalSizeY = y;
v2.globalSizeZ = z;
}
}
@@ -201,6 +252,10 @@ struct alignas(32) ImplicitArgs {
v1.globalOffsetX = x;
v1.globalOffsetY = y;
v1.globalOffsetZ = z;
} else if (v2.header.structVersion == 2) {
v2.globalOffsetX = x;
v2.globalOffsetY = y;
v2.globalOffsetZ = z;
}
}
void setGroupCount(uint32_t x, uint32_t y, uint32_t z) {
@@ -213,6 +268,10 @@ struct alignas(32) ImplicitArgs {
v1.groupCountX = x;
v1.groupCountY = y;
v1.groupCountZ = z;
} else if (v2.header.structVersion == 2) {
v2.groupCountX = x;
v2.groupCountY = y;
v2.groupCountZ = z;
}
}
@@ -222,6 +281,8 @@ struct alignas(32) ImplicitArgs {
} else if (v1.header.structVersion == 1) {
v1.localIdTablePtr = address;
} else if (v2.header.structVersion == 2) {
v2.localIdTablePtr = address;
}
}
void setPrintfBuffer(uint64_t address) {
@@ -230,6 +291,8 @@ struct alignas(32) ImplicitArgs {
} else if (v1.header.structVersion == 1) {
v1.printfBufferPtr = address;
} else if (v2.header.structVersion == 2) {
v2.printfBufferPtr = address;
}
}
@@ -239,6 +302,8 @@ struct alignas(32) ImplicitArgs {
} else if (v1.header.structVersion == 1) {
v1.rtGlobalBufferPtr = address;
} else if (v2.header.structVersion == 2) {
v2.rtGlobalBufferPtr = address;
}
}
@@ -248,10 +313,42 @@ struct alignas(32) ImplicitArgs {
} else if (v1.header.structVersion == 1) {
v1.assertBufferPtr = address;
} else if (v2.header.structVersion == 2) {
v2.assertBufferPtr = address;
}
}
// Writes `syncBuffer` into the syncBufferPtr field of the active layout.
// Only header versions 1 and 2 are handled; for any other version nothing is written.
void setSyncBufferPtr(uint64_t syncBuffer) {
    if (v1.header.structVersion == 1) {
        v1.syncBufferPtr = syncBuffer;
        return;
    }
    if (v2.header.structVersion == 2) {
        v2.syncBufferPtr = syncBuffer;
    }
}
// Stores the enqueued local size triple in the active layout.
// Only header versions 1 and 2 are handled; for any other version nothing is written.
void setEnqueuedLocalSize(uint32_t x, uint32_t y, uint32_t z) {
    if (v1.header.structVersion == 1) {
        v1.enqueuedLocalSizeX = x;
        v1.enqueuedLocalSizeY = y;
        v1.enqueuedLocalSizeZ = z;
        return;
    }
    if (v2.header.structVersion == 2) {
        v2.enqueuedLocalSizeX = x;
        v2.enqueuedLocalSizeY = y;
        v2.enqueuedLocalSizeZ = z;
    }
}
// Reads the enqueued local size triple from the active layout into x, y, z.
// Only header versions 1 and 2 are handled; for any other version the output
// references are left unmodified.
void getEnqueuedLocalSize(uint32_t &x, uint32_t &y, uint32_t &z) const {
    if (v1.header.structVersion == 1) {
        x = v1.enqueuedLocalSizeX;
        y = v1.enqueuedLocalSizeY;
        z = v1.enqueuedLocalSizeZ;
        return;
    }
    if (v2.header.structVersion == 2) {
        x = v2.enqueuedLocalSizeX;
        y = v2.enqueuedLocalSizeY;
        z = v2.enqueuedLocalSizeZ;
    }
}
};
static_assert(NEO::TypeTraits::isPodV<ImplicitArgs>);
} // namespace NEO

View File

@@ -0,0 +1,33 @@
/*
* Copyright (C) 2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "implicit_args.h"
#include <cstddef>
#include <cstdint>
#include <optional>
#include <type_traits>
namespace NEO {
static_assert(sizeof(ImplicitArgsHeader) == (2 * sizeof(uint8_t)));
static_assert(std::alignment_of_v<ImplicitArgsV0> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgsV0) == (32 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV0>);
static_assert(std::alignment_of_v<ImplicitArgsV1> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgsV1) == (40 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV1>);
static_assert(std::alignment_of_v<ImplicitArgsV2> == 32, "Implicit args size need to be aligned to 32");
static_assert(sizeof(ImplicitArgsV2) == (40 * sizeof(uint32_t)));
static_assert(NEO::TypeTraits::isPodV<ImplicitArgsV2>);
static_assert(NEO::TypeTraits::isPodV<ImplicitArgs>);
} // namespace NEO

View File

@@ -14,6 +14,8 @@ constexpr uint32_t getImplicitArgsSize(uint32_t version) {
return NEO::ImplicitArgsV0::getAlignedSize();
} else if (version == 1) {
return NEO::ImplicitArgsV1::getAlignedSize();
} else if (version == 2) {
return NEO::ImplicitArgsV2::getAlignedSize();
}
return 0;
}

View File

@@ -2302,7 +2302,7 @@ HWTEST_F(LinkerTests, givenImplicitArgRelocationAndImplicitArgsWithUnknownVersio
DebugManagerStateRestore restore;
struct MockGfxCoreHelper : NEO::GfxCoreHelperHw<FamilyType> {
uint32_t getImplicitArgsVersion() const override {
return 2; // unknown version
return 3; // unknown version
}
};

View File

@@ -263,6 +263,8 @@ TEST(ImplicitArgsV1Test, givenImplicitArgsV1WhenSettingFieldsThenCorrectFieldsAr
implicitArgs.setNumWorkDim(16);
implicitArgs.setRtGlobalBufferPtr(0x1000123400);
implicitArgs.setSimdWidth(32);
implicitArgs.setSyncBufferPtr(0x1234000);
implicitArgs.setEnqueuedLocalSize(2, 3, 4);
EXPECT_EQ(0x4567000u, implicitArgs.v1.assertBufferPtr);
@@ -282,10 +284,64 @@ TEST(ImplicitArgsV1Test, givenImplicitArgsV1WhenSettingFieldsThenCorrectFieldsAr
EXPECT_EQ(9u, implicitArgs.v1.localSizeY);
EXPECT_EQ(11u, implicitArgs.v1.localSizeZ);
EXPECT_EQ(2u, implicitArgs.v1.enqueuedLocalSizeX);
EXPECT_EQ(3u, implicitArgs.v1.enqueuedLocalSizeY);
EXPECT_EQ(4u, implicitArgs.v1.enqueuedLocalSizeZ);
EXPECT_EQ(0x5699000u, implicitArgs.v1.localIdTablePtr);
EXPECT_EQ(0xff000u, implicitArgs.v1.printfBufferPtr);
EXPECT_EQ(16u, implicitArgs.v1.numWorkDim);
EXPECT_EQ(0x1000123400u, implicitArgs.v1.rtGlobalBufferPtr);
EXPECT_EQ(0x1234000u, implicitArgs.v1.syncBufferPtr);
}
// Verifies that every ImplicitArgs setter routes to the V2 layout when the
// header declares structVersion == 2.
TEST(ImplicitArgsV2Test, givenImplicitArgsV2WhenSettingFieldsThenCorrectFieldsAreSet) {
    ImplicitArgs implicitArgs{};
    implicitArgs.v2.header.structSize = ImplicitArgsV2::getSize();
    implicitArgs.v2.header.structVersion = 2;

    EXPECT_EQ(ImplicitArgsV2::getSize(), implicitArgs.getSize());

    implicitArgs.setAssertBufferPtr(0x4567000);
    implicitArgs.setGlobalOffset(5, 6, 7);
    implicitArgs.setGlobalSize(1, 2, 3);
    implicitArgs.setGroupCount(10, 20, 30);
    implicitArgs.setLocalSize(8, 9, 11);
    implicitArgs.setLocalIdTablePtr(0x5699000);
    implicitArgs.setPrintfBuffer(0xff000);
    implicitArgs.setNumWorkDim(16);
    implicitArgs.setRtGlobalBufferPtr(0x1000123400);
    implicitArgs.setSimdWidth(32);
    implicitArgs.setSyncBufferPtr(0x1234000);
    implicitArgs.setEnqueuedLocalSize(2, 3, 4);

    // Check through the v2 view: the original asserted on v1.assertBufferPtr,
    // which would stop validating the V2 layout if the two layouts diverge.
    EXPECT_EQ(0x4567000u, implicitArgs.v2.assertBufferPtr);
    EXPECT_EQ(5u, implicitArgs.v2.globalOffsetX);
    EXPECT_EQ(6u, implicitArgs.v2.globalOffsetY);
    EXPECT_EQ(7u, implicitArgs.v2.globalOffsetZ);
    EXPECT_EQ(1u, implicitArgs.v2.globalSizeX);
    EXPECT_EQ(2u, implicitArgs.v2.globalSizeY);
    EXPECT_EQ(3u, implicitArgs.v2.globalSizeZ);
    EXPECT_EQ(10u, implicitArgs.v2.groupCountX);
    EXPECT_EQ(20u, implicitArgs.v2.groupCountY);
    EXPECT_EQ(30u, implicitArgs.v2.groupCountZ);
    EXPECT_EQ(8u, implicitArgs.v2.localSizeX);
    EXPECT_EQ(9u, implicitArgs.v2.localSizeY);
    EXPECT_EQ(11u, implicitArgs.v2.localSizeZ);
    EXPECT_EQ(2u, implicitArgs.v2.enqueuedLocalSizeX);
    EXPECT_EQ(3u, implicitArgs.v2.enqueuedLocalSizeY);
    EXPECT_EQ(4u, implicitArgs.v2.enqueuedLocalSizeZ);
    EXPECT_EQ(0x5699000u, implicitArgs.v2.localIdTablePtr);
    EXPECT_EQ(0xff000u, implicitArgs.v2.printfBufferPtr);
    EXPECT_EQ(16u, implicitArgs.v2.numWorkDim);
    EXPECT_EQ(0x1000123400u, implicitArgs.v2.rtGlobalBufferPtr);
    EXPECT_EQ(0x1234000u, implicitArgs.v2.syncBufferPtr);
}
TEST(ImplicitArgsV1Test, givenImplicitArgsWithUnknownVersionWhenSettingFieldsThenFieldsAreNotPopulated) {
@@ -294,7 +350,7 @@ TEST(ImplicitArgsV1Test, givenImplicitArgsWithUnknownVersionWhenSettingFieldsThe
memset(&implicitArgs, 0, sizeof(implicitArgs));
implicitArgs.v1.header.structSize = ImplicitArgsV1::getSize();
implicitArgs.v1.header.structVersion = 2; // unknown version
implicitArgs.v1.header.structVersion = 3; // unknown version
EXPECT_EQ(0u, implicitArgs.getSize());