Refactor bindless surface state offsets

Related-To: NEO-4607

Change-Id: I0b0ac275b532cf33f0292d3cf92abf2f49ff354f
Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe
2020-04-27 15:40:12 +02:00
committed by sys_ocldev
parent 1401b45018
commit d2c07cb9ed
11 changed files with 159 additions and 28 deletions

View File

@ -2447,11 +2447,13 @@ uint64_t Kernel::getKernelStartOffset(
}
void Kernel::patchBindlessSurfaceStateOffsets(const size_t sshOffset) {
const bool bindlessBuffers = DebugManager.flags.UseBindlessBuffers.get();
const bool bindlessImages = DebugManager.flags.UseBindlessImages.get();
const bool bindlessUsed = bindlessBuffers || bindlessImages;
if (bindlessUsed) {
auto &hardwareInfo = getDevice().getHardwareInfo();
auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
for (size_t i = 0; i < kernelInfo.kernelArgInfo.size(); i++) {
if ((kernelInfo.kernelArgInfo[i].isBuffer && bindlessBuffers) ||
@ -2460,12 +2462,9 @@ void Kernel::patchBindlessSurfaceStateOffsets(const size_t sshOffset) {
auto patchLocation = ptrOffset(getCrossThreadData(),
kernelInfo.kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset);
uint32_t patchSize = 4;
uint64_t patchValue = sshOffset + kernelInfo.kernelArgInfo[i].offsetHeap;
// compiler is not shifting surface offset << 6
patchValue <<= 6;
patchWithRequiredSize(patchLocation, patchSize, patchValue);
auto bindlessOffset = static_cast<uint32_t>(sshOffset) + kernelInfo.kernelArgInfo[i].offsetHeap;
auto patchValue = hwHelper.getBindlessSurfaceExtendedMessageDescriptorValue(bindlessOffset);
patchWithRequiredSize(patchLocation, sizeof(patchValue), patchValue);
}
}
}

View File

@ -872,9 +872,9 @@ HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNo
}
using AUBBindlessKernel = Test<KernelAUBFixture<BindlessKernelFixture>>;
using IsBetweenSklAndTgllp = IsWithinProducts<IGFX_SKYLAKE, IGFX_TIGERLAKE_LP>;
using IsSklPlus = IsAtLeastProduct<IGFX_SKYLAKE>;
HWTEST2_F(AUBBindlessKernel, givenBindlessCopyKernelWhenEnqueuedThenResultsValidate, IsBetweenSklAndTgllp) {
HWTEST2_F(AUBBindlessKernel, givenBindlessCopyKernelWhenEnqueuedThenResultsValidate, IsSklPlus) {
constexpr size_t bufferSize = MemoryConstants::pageSize;
cl_uint workDim = 1;
size_t globalWorkOffset[3] = {0, 0, 0};
@ -891,21 +891,24 @@ HWTEST2_F(AUBBindlessKernel, givenBindlessCopyKernelWhenEnqueuedThenResultsValid
memset(bufferDataDst, 0, bufferSize);
auto pBufferSrc = std::unique_ptr<Buffer>(Buffer::create(context,
CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
CL_MEM_READ_WRITE,
bufferSize,
bufferDataSrc,
nullptr,
retVal));
ASSERT_NE(nullptr, pBufferSrc);
auto pBufferDst = std::unique_ptr<Buffer>(Buffer::create(context,
CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
CL_MEM_READ_WRITE,
bufferSize,
bufferDataDst,
nullptr,
retVal));
ASSERT_NE(nullptr, pBufferDst);
auto simulatedCsr = AUBFixture::getSimulatedCsr<FamilyType>();
memcpy(pBufferSrc->getGraphicsAllocation()->getUnderlyingBuffer(), bufferDataSrc, bufferSize);
memcpy(pBufferDst->getGraphicsAllocation()->getUnderlyingBuffer(), bufferDataDst, bufferSize);
simulatedCsr->writeMemory(*pBufferSrc->getGraphicsAllocation());
simulatedCsr->writeMemory(*pBufferDst->getGraphicsAllocation());

View File

@ -28,7 +28,8 @@ void KernelImageArgTest::SetUp() {
pKernelInfo->heapInfo.pKernelHeader = kernelHeader.get();
pKernelInfo->usesSsh = true;
pKernelInfo->kernelArgInfo.resize(5);
constexpr int numImages = 5;
pKernelInfo->kernelArgInfo.resize(numImages);
pKernelInfo->kernelArgInfo[4].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
pKernelInfo->kernelArgInfo[3].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
pKernelInfo->kernelArgInfo[2].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
@ -67,7 +68,7 @@ void KernelImageArgTest::SetUp() {
pKernel->setKernelArgHandler(3, &Kernel::setArgImage);
pKernel->setKernelArgHandler(4, &Kernel::setArgImage);
uint32_t crossThreadData[0x44] = {};
uint32_t crossThreadData[numImages * 0x20] = {};
crossThreadData[0x20 / sizeof(uint32_t)] = 0x12344321;
pKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));

View File

@ -28,3 +28,14 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HwHelperTest, givenHwHelperWhenAskedForLowPriorityEn
auto hwHelperEngineType = HwHelperHw<FamilyType>::lowPriorityEngineType;
EXPECT_EQ(aub_stream::EngineType::ENGINE_RCS, hwHelperEngineType);
}
// Checks that the value returned by HwHelper for a surface state offset equals
// what the family's descriptor type produces for the same offset, and that for
// the offset 0x200 this is the offset shifted left by 6.
HWCMDTEST_F(IGFX_GEN8_CORE, HwHelperTest, givenHwHelperWhenGettingBindlessSurfaceExtendedMessageDescriptorValueThenCorrectValueIsReturned) {
    using ExtendedMessageDescriptor = typename FamilyType::DataPortBindlessSurfaceExtendedMessageDescriptor;

    const auto coreFamily = pDevice->getHardwareInfo().platform.eRenderCoreFamily;
    auto &helper = HwHelper::get(coreFamily);
    auto patchedValue = helper.getBindlessSurfaceExtendedMessageDescriptorValue(0x200);

    // Reference value built directly from the family's descriptor type.
    ExtendedMessageDescriptor referenceDescriptor = {};
    referenceDescriptor.setBindlessSurfaceOffset(0x200);

    EXPECT_EQ(referenceDescriptor.getBindlessSurfaceOffsetToPatch(), patchedValue);
    EXPECT_EQ(0x200u << 6, patchedValue);
}

View File

@ -5,6 +5,7 @@
*
*/
#include "shared/source/helpers/hw_cmds.h"
#include "shared/test/unit_test/helpers/debug_manager_state_restore.h"
#include "opencl/source/kernel/kernel.h"
@ -200,7 +201,8 @@ TEST_F(KernelArgBufferTest, givenNoCacheFlushBufferWhenSettingAsArgThenNotExpect
EXPECT_EQ(nullptr, pKernel->kernelArgRequiresCacheFlush[0]);
}
TEST_F(KernelArgBufferTest, givenUsedBindlessBuffersWhenPatchingSurfaceStateOffsetsThenCorrectOffsetIsPatchedInCrossThreadData) {
HWTEST_F(KernelArgBufferTest, givenUsedBindlessBuffersWhenPatchingSurfaceStateOffsetsThenCorrectOffsetIsPatchedInCrossThreadData) {
using DataPortBindlessSurfaceExtendedMessageDescriptor = typename FamilyType::DataPortBindlessSurfaceExtendedMessageDescriptor;
DebugManagerStateRestore restorer;
DebugManager.flags.UseBindlessBuffers.set(1);
@ -214,14 +216,17 @@ TEST_F(KernelArgBufferTest, givenUsedBindlessBuffersWhenPatchingSurfaceStateOffs
auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
*patchLocation = 0xdead;
uint32_t sshOffset = 4000;
uint32_t sshOffset = 0x1000;
pKernel->patchBindlessSurfaceStateOffsets(sshOffset);
auto expectedOffset = (sshOffset + pKernelInfo->kernelArgInfo[0].offsetHeap) << 6;
DataPortBindlessSurfaceExtendedMessageDescriptor extMessageDesc;
extMessageDesc.setBindlessSurfaceOffset(sshOffset + pKernelInfo->kernelArgInfo[0].offsetHeap);
auto expectedOffset = extMessageDesc.getBindlessSurfaceOffsetToPatch();
EXPECT_EQ(expectedOffset, *patchLocation);
sshOffset = static_cast<uint32_t>(maxNBitValue(20)) - 64;
sshOffset = static_cast<uint32_t>(maxNBitValue(20) + 1) - 64;
pKernel->patchBindlessSurfaceStateOffsets(sshOffset);
expectedOffset = (sshOffset + pKernelInfo->kernelArgInfo[0].offsetHeap) << 6;
extMessageDesc.setBindlessSurfaceOffset(sshOffset + pKernelInfo->kernelArgInfo[0].offsetHeap);
expectedOffset = extMessageDesc.getBindlessSurfaceOffsetToPatch();
EXPECT_EQ(expectedOffset, *patchLocation);
}

View File

@ -5,6 +5,7 @@
*
*/
#include "shared/source/helpers/hw_cmds.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/test/unit_test/helpers/debug_manager_state_restore.h"
@ -326,14 +327,15 @@ TEST_F(KernelImageArgTest, givenNoCacheFlushImageWhenSettingAsArgThenExpectAlloc
EXPECT_EQ(nullptr, pKernel->kernelArgRequiresCacheFlush[0]);
}
TEST_F(KernelImageArgTest, givenUsedBindlessImagesWhenPatchingSurfaceStateOffsetsThenCorrectOffsetIsPatchedInCrossThreadData) {
HWTEST_F(KernelImageArgTest, givenUsedBindlessImagesWhenPatchingSurfaceStateOffsetsThenCorrectOffsetIsPatchedInCrossThreadData) {
using DataPortBindlessSurfaceExtendedMessageDescriptor = typename FamilyType::DataPortBindlessSurfaceExtendedMessageDescriptor;
DebugManagerStateRestore restorer;
DebugManager.flags.UseBindlessImages.set(1);
pKernelInfo->usesSsh = true;
for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(4 * i);
pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(0x20 * i);
auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset;
auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
*patchLocation = 0xdead;
@ -341,7 +343,7 @@ TEST_F(KernelImageArgTest, givenUsedBindlessImagesWhenPatchingSurfaceStateOffset
pKernelInfo->kernelArgInfo[pKernelInfo->kernelArgInfo.size() - 1].isImage = false;
uint32_t sshOffset = 4000;
uint32_t sshOffset = 0x4000;
pKernel->patchBindlessSurfaceStateOffsets(sshOffset);
for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
@ -349,7 +351,9 @@ TEST_F(KernelImageArgTest, givenUsedBindlessImagesWhenPatchingSurfaceStateOffset
auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
if (pKernelInfo->kernelArgInfo[i].isImage) {
auto expectedOffset = (sshOffset + pKernelInfo->kernelArgInfo[i].offsetHeap) << 6;
DataPortBindlessSurfaceExtendedMessageDescriptor extMessageDesc;
extMessageDesc.setBindlessSurfaceOffset(sshOffset + pKernelInfo->kernelArgInfo[i].offsetHeap);
auto expectedOffset = extMessageDesc.getBindlessSurfaceOffsetToPatch();
EXPECT_EQ(expectedOffset, *patchLocation);
} else {
EXPECT_EQ(0xdeadu, *patchLocation);
@ -364,7 +368,7 @@ TEST_F(KernelImageArgTest, givenUsedBindlessImagesAndNonImageArgWhenPatchingSurf
pKernelInfo->usesSsh = true;
for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(4 * i);
pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(0x20 * i);
auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset;
auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
*patchLocation = 0xdead;
@ -373,7 +377,7 @@ TEST_F(KernelImageArgTest, givenUsedBindlessImagesAndNonImageArgWhenPatchingSurf
int nonImageIndex = 1;
pKernelInfo->kernelArgInfo[nonImageIndex].isImage = false;
uint32_t sshOffset = 4000;
uint32_t sshOffset = 0x4000;
pKernel->patchBindlessSurfaceStateOffsets(sshOffset);
auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[nonImageIndex].kernelArgPatchInfoVector[0].crossthreadOffset;
@ -390,7 +394,7 @@ TEST_F(KernelImageArgTest, givenNotUsedBindlessImagesAndImageArgWhenPatchingSurf
pKernelInfo->usesSsh = true;
for (size_t i = 0; i < pKernelInfo->kernelArgInfo.size(); i++) {
pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(4 * i);
pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset = 0x20 + static_cast<uint32_t>(0x20 * i);
auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset;
auto patchLocation = reinterpret_cast<uint32_t *>(ptrOffset(pKernel->getCrossThreadData(), crossThreadDataOffset));
*patchLocation = 0xdead;
@ -399,7 +403,7 @@ TEST_F(KernelImageArgTest, givenNotUsedBindlessImagesAndImageArgWhenPatchingSurf
int nonImageIndex = 1;
pKernelInfo->kernelArgInfo[nonImageIndex].isImage = true;
uint32_t sshOffset = 4000;
uint32_t sshOffset = 0x4000;
pKernel->patchBindlessSurfaceStateOffsets(sshOffset);
auto crossThreadDataOffset = pKernelInfo->kernelArgInfo[nonImageIndex].kernelArgPatchInfoVector[0].crossthreadOffset;

View File

@ -19,6 +19,31 @@ namespace NEO {
struct GEN11 {
#include "shared/source/generated/gen11/hw_cmds_generated_gen11.inl"
// One 32-bit extended message descriptor for bindless surface access.
// Fields, in declaration order: a 20-bit surface offset, 1 reserved bit and
// 11 bits of execution-unit extended message descriptor definition.
// `packed` aliases the whole dword.
struct DataPortBindlessSurfaceExtendedMessageDescriptor {
    union {
        struct {
            uint32_t bindlessSurfaceOffset : 20;
            uint32_t reserved : 1;
            uint32_t executionUnitExtendedMessageDescriptorDefinition : 11;
        };
        uint32_t packed;
    };

    // Starts with every bit of the descriptor cleared.
    DataPortBindlessSurfaceExtendedMessageDescriptor() {
        packed = 0;
    }

    // Stores a byte offset into the bindless surface heap.
    // The field keeps the offset divided by 64: the low 6 bits of the argument
    // are dropped, and bits that do not fit into the 20-bit field are truncated.
    void setBindlessSurfaceOffset(uint32_t offsetInBindlessSurfaceHeapInBytes) {
        bindlessSurfaceOffset = offsetInBindlessSurfaceHeapInBytes >> 6;
    }

    // Returns the stored offset field shifted left by 12. For a 64-byte aligned
    // offset that fits the field this equals the original byte offset << 6.
    uint32_t getBindlessSurfaceOffsetToPatch() const {
        // A 20-bit bit-field is promoted to signed int; cast first so the shift
        // is done in unsigned arithmetic even when the top field bits are set.
        return static_cast<uint32_t>(bindlessSurfaceOffset) << 12;
    }
};
static_assert(sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor) == sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor::packed), "");
};
struct ICLFamily : public GEN11 {
using PARSE = CmdParse<ICLFamily>;

View File

@ -22,6 +22,31 @@ namespace NEO {
struct GEN12LP {
#include "shared/source/generated/gen12lp/hw_cmds_generated_gen12lp.inl"
static constexpr uint32_t stateComputeModeForceNonCoherentMask = (((1 << 0) | (1 << 1)) << 3);
// One 32-bit extended message descriptor for bindless surface access.
// Fields, in declaration order: a 20-bit surface offset, 1 reserved bit and
// 11 bits of execution-unit extended message descriptor definition.
// `packed` aliases the whole dword.
struct DataPortBindlessSurfaceExtendedMessageDescriptor {
    union {
        struct {
            uint32_t bindlessSurfaceOffset : 20;
            uint32_t reserved : 1;
            uint32_t executionUnitExtendedMessageDescriptorDefinition : 11;
        };
        uint32_t packed;
    };

    // Starts with every bit of the descriptor cleared.
    DataPortBindlessSurfaceExtendedMessageDescriptor() {
        packed = 0;
    }

    // Stores a byte offset into the bindless surface heap.
    // The field keeps the offset divided by 64: the low 6 bits of the argument
    // are dropped, and bits that do not fit into the 20-bit field are truncated.
    void setBindlessSurfaceOffset(uint32_t offsetInBindlessSurfaceHeapInBytes) {
        bindlessSurfaceOffset = offsetInBindlessSurfaceHeapInBytes >> 6;
    }

    // Returns the stored offset field shifted left by 12. For a 64-byte aligned
    // offset that fits the field this equals the original byte offset << 6.
    uint32_t getBindlessSurfaceOffsetToPatch() const {
        // A 20-bit bit-field is promoted to signed int; cast first so the shift
        // is done in unsigned arithmetic even when the top field bits are set.
        return static_cast<uint32_t>(bindlessSurfaceOffset) << 12;
    }
};
static_assert(sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor) == sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor::packed), "");
};
struct TGLLPFamily : public GEN12LP {
using PARSE = CmdParse<TGLLPFamily>;

View File

@ -21,6 +21,31 @@ namespace NEO {
struct GEN8 {
#include "shared/source/generated/gen8/hw_cmds_generated_gen8.inl"
// One 32-bit extended message descriptor for bindless surface access.
// Fields, in declaration order: a 20-bit surface offset, 1 reserved bit and
// 11 bits of execution-unit extended message descriptor definition.
// `packed` aliases the whole dword.
struct DataPortBindlessSurfaceExtendedMessageDescriptor {
    union {
        struct {
            uint32_t bindlessSurfaceOffset : 20;
            uint32_t reserved : 1;
            uint32_t executionUnitExtendedMessageDescriptorDefinition : 11;
        };
        uint32_t packed;
    };

    // Starts with every bit of the descriptor cleared.
    DataPortBindlessSurfaceExtendedMessageDescriptor() {
        packed = 0;
    }

    // Stores a byte offset into the bindless surface heap.
    // The field keeps the offset divided by 64: the low 6 bits of the argument
    // are dropped, and bits that do not fit into the 20-bit field are truncated.
    void setBindlessSurfaceOffset(uint32_t offsetInBindlessSurfaceHeapInBytes) {
        bindlessSurfaceOffset = offsetInBindlessSurfaceHeapInBytes >> 6;
    }

    // Returns the stored offset field shifted left by 12. For a 64-byte aligned
    // offset that fits the field this equals the original byte offset << 6.
    uint32_t getBindlessSurfaceOffsetToPatch() const {
        // A 20-bit bit-field is promoted to signed int; cast first so the shift
        // is done in unsigned arithmetic even when the top field bits are set.
        return static_cast<uint32_t>(bindlessSurfaceOffset) << 12;
    }
};
static_assert(sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor) == sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor::packed), "");
};
struct BDWFamily : public GEN8 {
using PARSE = CmdParse<BDWFamily>;

View File

@ -21,6 +21,31 @@ namespace NEO {
struct GEN9 {
#include "shared/source/generated/gen9/hw_cmds_generated_gen9.inl"
// One 32-bit extended message descriptor for bindless surface access.
// Fields, in declaration order: a 20-bit surface offset, 1 reserved bit and
// 11 bits of execution-unit extended message descriptor definition.
// `packed` aliases the whole dword.
struct DataPortBindlessSurfaceExtendedMessageDescriptor {
    union {
        struct {
            uint32_t bindlessSurfaceOffset : 20;
            uint32_t reserved : 1;
            uint32_t executionUnitExtendedMessageDescriptorDefinition : 11;
        };
        uint32_t packed;
    };

    // Starts with every bit of the descriptor cleared.
    DataPortBindlessSurfaceExtendedMessageDescriptor() {
        packed = 0;
    }

    // Stores a byte offset into the bindless surface heap.
    // The field keeps the offset divided by 64: the low 6 bits of the argument
    // are dropped, and bits that do not fit into the 20-bit field are truncated.
    void setBindlessSurfaceOffset(uint32_t offsetInBindlessSurfaceHeapInBytes) {
        bindlessSurfaceOffset = offsetInBindlessSurfaceHeapInBytes >> 6;
    }

    // Returns the stored offset field shifted left by 12. For a 64-byte aligned
    // offset that fits the field this equals the original byte offset << 6.
    uint32_t getBindlessSurfaceOffsetToPatch() const {
        // A 20-bit bit-field is promoted to signed int; cast first so the shift
        // is done in unsigned arithmetic even when the top field bits are set.
        return static_cast<uint32_t>(bindlessSurfaceOffset) << 12;
    }
};
static_assert(sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor) == sizeof(DataPortBindlessSurfaceExtendedMessageDescriptor::packed), "");
};
struct SKLFamily : public GEN9 {

View File

@ -87,6 +87,7 @@ class HwHelper {
virtual bool isFusedEuDispatchEnabled(const HardwareInfo &hwInfo) const = 0;
virtual bool isIndependentForwardProgressSupported() = 0;
virtual uint64_t getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const = 0;
// Converts a surface state offset into the 32-bit bindless surface extended
// message descriptor value; the encoding is supplied by the concrete helper.
virtual uint32_t getBindlessSurfaceExtendedMessageDescriptorValue(uint32_t surfStateOffset) const = 0;
static uint32_t getSubDevicesCount(const HardwareInfo *pHwInfo);
static uint32_t getEnginesCount(const HardwareInfo &hwInfo);
@ -135,6 +136,13 @@ class HwHelperHw : public HwHelper {
return sizeof(RENDER_SURFACE_STATE);
}
uint32_t getBindlessSurfaceExtendedMessageDescriptorValue(uint32_t surfStateOffset) const override {
using DataPortBindlessSurfaceExtendedMessageDescriptor = typename GfxFamily::DataPortBindlessSurfaceExtendedMessageDescriptor;
DataPortBindlessSurfaceExtendedMessageDescriptor messageExtDescriptor = {};
messageExtDescriptor.setBindlessSurfaceOffset(surfStateOffset);
return messageExtDescriptor.getBindlessSurfaceOffsetToPatch();
}
const AubMemDump::LrcaHelper &getCsTraits(aub_stream::EngineType engineType) const override;
size_t getMaxBarrierRegisterPerSlice() const override;