fix: opencl support for bindless kernels

Related-To: NEO-11156
Signed-off-by: Fabian Zwoliński <fabian.zwolinski@intel.com>
This commit is contained in:
Fabian Zwoliński 2024-04-29 15:53:09 +00:00 committed by Compute-Runtime-Automation
parent 5e57bb2a32
commit ee71157f7f
10 changed files with 872 additions and 11 deletions

View File

@ -266,6 +266,18 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
if (EncodeSurfaceState<GfxFamily>::doBindingTablePrefetch()) {
bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
}
const bool isBindlessKernel = NEO::KernelDescriptor::isBindlessAddressingKernel(kernel.getKernelInfo().kernelDescriptor);
if (isBindlessKernel) {
uint64_t bindlessSurfaceStateBaseOffset = ptrDiff(ssh.getSpace(0), ssh.getCpuBase());
auto sshHeapSize = kernel.getSurfaceStateHeapSize();
// Allocate space for new ssh data
auto dstSurfaceState = ssh.getSpace(sshHeapSize);
memcpy_s(dstSurfaceState, sshHeapSize, kernel.getSurfaceStateHeap(), sshHeapSize);
kernel.patchBindlessOffsetsInCrossThreadData(bindlessSurfaceStateBaseOffset);
}
}
auto &gfxCoreHelper = device.getGfxCoreHelper();

View File

@ -140,12 +140,41 @@ void Kernel::patchWithImplicitSurface(uint64_t ptrToPatchInCrossThreadData, Grap
}
void *ssh = getSurfaceStateHeap();
if ((nullptr != ssh) && isValidOffset(arg.bindful)) {
auto surfaceState = ptrOffset(ssh, arg.bindful);
if (nullptr != ssh) {
void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
size_t sizeToPatch = allocation.getUnderlyingBufferSize();
Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
areMultipleSubDevicesInContext());
if (isValidOffset(arg.bindful)) {
auto surfaceState = ptrOffset(ssh, arg.bindful);
Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
areMultipleSubDevicesInContext());
} else if (isValidOffset(arg.bindless)) {
auto &gfxCoreHelper = clDevice.getDevice().getGfxCoreHelper();
void *surfaceState = nullptr;
auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
if (clDevice.getDevice().getBindlessHeapsHelper()) {
auto ssInHeap = allocation.getBindlessInfo();
surfaceState = ssInHeap.ssPtr;
auto patchLocation = ptrOffset(crossThreadData, arg.bindless);
auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(ssInHeap.surfaceStateOffset));
patchWithRequiredSize(reinterpret_cast<uint8_t *>(patchLocation), sizeof(patchValue), patchValue);
} else {
auto index = std::numeric_limits<uint32_t>::max();
const auto &iter = kernelInfo.kernelDescriptor.getBindlessOffsetToSurfaceState().find(arg.bindless);
if (iter != kernelInfo.kernelDescriptor.getBindlessOffsetToSurfaceState().end()) {
index = iter->second;
}
if (index < std::numeric_limits<uint32_t>::max()) {
surfaceState = ptrOffset(ssh, index * surfaceStateSize);
}
}
if (surfaceState) {
Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
areMultipleSubDevicesInContext());
}
}
}
}
@ -223,7 +252,13 @@ cl_int Kernel::initialize() {
// copy the ssh into our local copy
memcpy_s(pSshLocal.get(), sshLocalSize,
heapInfo.pSsh, heapInfo.surfaceStateHeapSize);
} else if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelDescriptor)) {
auto surfaceStateSize = static_cast<uint32_t>(gfxCoreHelper.getRenderSurfaceStateSize());
sshLocalSize = kernelDescriptor.kernelAttributes.numArgsStateful * surfaceStateSize;
DEBUG_BREAK_IF(kernelDescriptor.kernelAttributes.numArgsStateful != kernelDescriptor.getBindlessOffsetToSurfaceState().size());
pSshLocal = std::make_unique<char[]>(sshLocalSize);
}
numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries;
localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;
@ -233,7 +268,8 @@ cl_int Kernel::initialize() {
return status;
}
if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless) ||
isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless)) {
DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
uint64_t constMemory = isBuiltIn ? castToUint64(program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer()) : program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();
@ -241,7 +277,8 @@ cl_int Kernel::initialize() {
patchWithImplicitSurface(constMemory, *program->getConstantSurface(rootDeviceIndex), arg);
}
if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless) ||
isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless)) {
DEBUG_BREAK_IF(program->getGlobalSurface(rootDeviceIndex) == nullptr);
uint64_t globalMemory = isBuiltIn ? castToUint64(program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer()) : program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();
@ -932,6 +969,16 @@ cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, G
auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0,
areMultipleSubDevicesInContext());
} else if (isValidOffset(argAsPtr.bindless)) {
auto &gfxCoreHelper = this->getGfxCoreHelper();
auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
auto ssIndex = getSurfaceStateIndexForBindlessOffset(argAsPtr.bindless);
if (ssIndex < std::numeric_limits<uint32_t>::max()) {
auto surfaceState = ptrOffset(getSurfaceStateHeap(), ssIndex * surfaceStateSize);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0,
areMultipleSubDevicesInContext());
}
}
storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags);
@ -987,6 +1034,24 @@ cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocatio
}
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, forceNonAuxMode, disableL3, allocSize, ptrToPatch, offset, svmAlloc, 0, 0,
areMultipleSubDevicesInContext());
} else if (isValidOffset(argAsPtr.bindless)) {
size_t allocSize = 0;
size_t offset = 0;
if (svmAlloc != nullptr) {
allocSize = svmAlloc->getUnderlyingBufferSize();
offset = ptrDiff(ptrToPatch, svmAlloc->getGpuAddressToPatch());
allocSize -= offset;
}
auto &gfxCoreHelper = this->getGfxCoreHelper();
auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
auto ssIndex = getSurfaceStateIndexForBindlessOffset(argAsPtr.bindless);
if (ssIndex < std::numeric_limits<uint32_t>::max()) {
auto surfaceState = ptrOffset(getSurfaceStateHeap(), ssIndex * surfaceStateSize);
Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, forceNonAuxMode, disableL3, allocSize, ptrToPatch, offset, svmAlloc, 0, 0,
areMultipleSubDevicesInContext());
}
}
storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t));
@ -1297,10 +1362,20 @@ void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
if (program->getConstantSurface(rootDeviceIndex)) {
commandStreamReceiver.makeResident(*(program->getConstantSurface(rootDeviceIndex)));
auto bindlessHeapAllocation = program->getConstantSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation;
if (bindlessHeapAllocation) {
commandStreamReceiver.makeResident(*bindlessHeapAllocation);
}
}
if (program->getGlobalSurface(rootDeviceIndex)) {
commandStreamReceiver.makeResident(*(program->getGlobalSurface(rootDeviceIndex)));
auto bindlessHeapAllocation = program->getGlobalSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation;
if (bindlessHeapAllocation) {
commandStreamReceiver.makeResident(*bindlessHeapAllocation);
}
}
if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
@ -1510,11 +1585,14 @@ cl_int Kernel::setArgBuffer(uint32_t argIndex,
} else if (isValidOffset(argAsPtr.bindless)) {
auto &gfxCoreHelper = this->getGfxCoreHelper();
auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
auto surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex);
buffer->setArgStateful(surfaceState, forceNonAuxMode,
disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
areMultipleSubDevicesInContext());
auto ssIndex = getSurfaceStateIndexForBindlessOffset(argAsPtr.bindless);
if (ssIndex < std::numeric_limits<uint32_t>::max()) {
auto surfaceState = ptrOffset(getSurfaceStateHeap(), ssIndex * surfaceStateSize);
buffer->setArgStateful(surfaceState, forceNonAuxMode,
disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
areMultipleSubDevicesInContext());
}
}
kernelArguments[argIndex].isStatelessUncacheable = argAsPtr.isPureStateful() ? false : buffer->isMemObjUncacheable();
@ -2080,6 +2158,68 @@ void *Kernel::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t
return ssInHeap.ssPtr;
}
// Looks up the surface-state index that the kernel descriptor associates with the given
// bindless cross-thread-data offset.
// Returns std::numeric_limits<uint32_t>::max() (after hitting DEBUG_BREAK_IF) when the
// offset has no entry in the descriptor's bindless-offset-to-surface-state map.
uint32_t Kernel::getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const {
    const auto &offsetToSurfaceState = kernelInfo.kernelDescriptor.getBindlessOffsetToSurfaceState();
    const auto entry = offsetToSurfaceState.find(bindlessOffset);
    if (entry == offsetToSurfaceState.end()) {
        DEBUG_BREAK_IF(true);
        return std::numeric_limits<uint32_t>::max();
    }
    return entry->second;
}
// For every implicit-argument candidate with a valid bindless offset, writes into cross-thread
// data (at that offset) the extended message descriptor value computed from
// bindlessSurfaceStateBaseOffset + surfaceStateIndex * renderSurfaceStateSize.
// Candidates whose offset has no surface-state index are left untouched.
void Kernel::patchBindlessOffsetsForImplicitArgs(uint64_t bindlessSurfaceStateBaseOffset) const {
    constexpr auto invalidIndex = std::numeric_limits<uint32_t>::max();

    const auto candidates = kernelInfo.kernelDescriptor.getImplicitArgBindlessCandidatesVec();
    auto &helper = this->getGfxCoreHelper();
    const auto renderSurfaceStateSize = helper.getRenderSurfaceStateSize();

    for (const auto &candidate : candidates) {
        if (!NEO::isValidOffset(candidate->bindless)) {
            continue;
        }

        auto patchLocation = ptrOffset(getCrossThreadData(), candidate->bindless);
        const auto ssIndex = getSurfaceStateIndexForBindlessOffset(candidate->bindless);
        if (ssIndex == invalidIndex) {
            continue;
        }

        // The heap offset is deliberately truncated to 32 bits before it is encoded.
        const auto ssOffset = static_cast<uint32_t>(bindlessSurfaceStateBaseOffset + ssIndex * renderSurfaceStateSize);
        auto patchValue = helper.getBindlessSurfaceExtendedMessageDescriptorValue(ssOffset);
        patchWithRequiredSize(reinterpret_cast<uint8_t *>(patchLocation), sizeof(patchValue), patchValue);
    }
}
void Kernel::patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const {
auto &gfxCoreHelper = this->getGfxCoreHelper();
auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
for (size_t argIndex = 0; argIndex < kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size(); argIndex++) {
const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
auto crossThreadOffset = NEO::undefined<NEO::CrossThreadDataOffset>;
if (arg.type == NEO::ArgDescriptor::argTPointer) {
crossThreadOffset = arg.as<NEO::ArgDescPointer>().bindless;
} else if (arg.type == NEO::ArgDescriptor::argTImage) {
crossThreadOffset = arg.as<NEO::ArgDescImage>().bindless;
} else {
continue;
}
if (NEO::isValidOffset(crossThreadOffset)) {
auto patchLocation = ptrOffset(getCrossThreadData(), crossThreadOffset);
auto index = getSurfaceStateIndexForBindlessOffset(crossThreadOffset);
if (index < std::numeric_limits<uint32_t>::max()) {
auto surfaceStateOffset = static_cast<uint32_t>(bindlessSurfaceStateBaseOffset + index * surfaceStateSize);
auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(surfaceStateOffset));
patchWithRequiredSize(reinterpret_cast<uint8_t *>(patchLocation), sizeof(patchValue), patchValue);
}
}
}
patchBindlessOffsetsForImplicitArgs(bindlessSurfaceStateBaseOffset);
}
// Stores the given value in this kernel's additionalKernelExecInfo member.
void Kernel::setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo) {
this->additionalKernelExecInfo = additionalKernelExecInfo;
}

View File

@ -218,6 +218,9 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
bool usesSyncBuffer() const;
void patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset);
void *patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless);
uint32_t getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const;
void patchBindlessOffsetsForImplicitArgs(uint64_t bindlessSurfaceStateBaseOffset) const;
void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const;
// Helpers
cl_int setArg(uint32_t argIndex, uint32_t argValue);

View File

@ -265,16 +265,35 @@ cl_int Program::processProgramInfo(ProgramInfo &src, const ClDevice &clDevice) {
}
kernelInfoArray = std::move(src.kernelInfos);
bool isBindlessKernelPresent = false;
for (auto &kernelInfo : kernelInfoArray) {
if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelInfo->kernelDescriptor)) {
isBindlessKernelPresent = true;
break;
}
}
auto svmAllocsManager = context ? context->getSVMAllocsManager() : nullptr;
auto globalConstDataSize = src.globalConstants.size + src.globalConstants.zeroInitSize;
if (globalConstDataSize != 0) {
buildInfos[rootDeviceIndex].constantSurface = allocateGlobalsSurface(svmAllocsManager, clDevice.getDevice(), globalConstDataSize, src.globalConstants.zeroInitSize, true, linkerInput, src.globalConstants.initData);
if (isBindlessKernelPresent) {
if (!clDevice.getMemoryManager()->allocateBindlessSlot(buildInfos[rootDeviceIndex].constantSurface)) {
return CL_OUT_OF_HOST_MEMORY;
}
}
}
auto globalVariablesDataSize = src.globalVariables.size + src.globalVariables.zeroInitSize;
buildInfos[rootDeviceIndex].globalVarTotalSize = globalVariablesDataSize;
if (globalVariablesDataSize != 0) {
buildInfos[rootDeviceIndex].globalSurface = allocateGlobalsSurface(svmAllocsManager, clDevice.getDevice(), globalVariablesDataSize, src.globalVariables.zeroInitSize, false, linkerInput, src.globalVariables.initData);
if (isBindlessKernelPresent) {
if (!clDevice.getMemoryManager()->allocateBindlessSlot(buildInfos[rootDeviceIndex].globalSurface)) {
return CL_OUT_OF_HOST_MEMORY;
}
}
if (clDevice.areOcl21FeaturesEnabled() == false) {
buildInfos[rootDeviceIndex].globalVarTotalSize = 0u;
}

View File

@ -1080,6 +1080,82 @@ HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenInd
delete[] mockDsh;
}
// Checks that sendIndirectState, for a kernel using bindless addressing, copies the kernel's
// surface state heap into the command queue's surface-state indirect heap and patches the
// bindless slot in cross-thread data with the descriptor value for the copied surface state.
HWTEST2_F(HardwareCommandsTest, givenBindlessKernelWithBufferArgWhenSendIndirectStateThenSurfaceStateIsCopiedToHeapAndCrossThreadDataIsCorrectlyPatched, IsAtLeastXeHpCore) {
using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
using DefaultWalkerType = typename FamilyType::DefaultWalkerType;
CommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, 0, false);
auto &commandStream = cmdQ.getCS(1024);
auto pWalkerCmd = static_cast<DefaultWalkerType *>(commandStream.getSpace(sizeof(DefaultWalkerType)));
// define kernel info
std::unique_ptr<MockKernelInfo> pKernelInfo = std::make_unique<MockKernelInfo>();
pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;
pKernelInfo->addArgBuffer(0, 0x30, sizeof(void *), 0x0);
pKernelInfo->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::AddressingMode::BindlessAndStateless;
// Give argument 0 a bindless slot at cross-thread offset 0x10, then rebuild the
// bindless-offset-to-surface-state map so the new offset is registered.
const auto bindlessOffset = 0x10;
pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();
pKernelInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize = 1024;
MockKernel mockKernel(mockKernelWithInternal->mockKernel->getProgram(), *pKernelInfo, *pClDevice);
auto retVal = mockKernel.initialize();
EXPECT_EQ(0, retVal);
// Fill the kernel-local surface state heap with a recognizable pattern and zero the
// cross-thread data, so both the copy and the patch can be detected below.
memset(mockKernel.getSurfaceStateHeap(), 0x22, mockKernel.getSurfaceStateHeapSize());
memset(mockKernel.getCrossThreadData(), 0x00, mockKernel.getCrossThreadDataSize());
auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::Type::dynamicState, 8192);
auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::Type::indirectObject, 8192);
auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::Type::surfaceState, 8192);
// Record the heap position and its offset from the heap base BEFORE sendIndirectState runs.
// NOTE(review): this relies on getSpace(0) returning the current position without
// consuming space - confirm against the IndirectHeap/LinearStream implementation.
const auto expectedDestinationInHeap = ssh.getSpace(0);
const uint64_t bindlessSurfaceStateBaseOffset = ptrDiff(ssh.getSpace(0), ssh.getCpuBase());
const size_t localWorkSize = 256;
const size_t localWorkSizes[3]{localWorkSize, 1, 1};
const uint32_t threadGroupCount = 1u;
uint32_t interfaceDescriptorIndex = 0;
auto isCcsUsed = EngineHelpers::isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(mockKernel);
INTERFACE_DESCRIPTOR_DATA interfaceDescriptorData;
HardwareCommandsHelper<FamilyType>::template sendIndirectState<DefaultWalkerType, INTERFACE_DESCRIPTOR_DATA>(
commandStream,
dsh,
ioh,
ssh,
mockKernel,
mockKernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
pKernelInfo->getMaxSimdSize(),
localWorkSizes,
threadGroupCount,
0,
interfaceDescriptorIndex,
pDevice->getPreemptionMode(),
pWalkerCmd,
&interfaceDescriptorData,
true,
0,
*pDevice);
// The kernel's surface state heap must now be present at the recorded heap position.
EXPECT_EQ(0, std::memcmp(expectedDestinationInHeap, mockKernel.getSurfaceStateHeap(), mockKernel.getSurfaceStateHeapSize()));
const auto &gfxCoreHelper = mockKernel.getGfxCoreHelper();
const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
// Recompute the expected patch value the same way the kernel does:
// descriptor value of (recorded base offset + surface-state index * surface-state size).
const auto ssIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second;
const auto surfaceStateOffset = static_cast<uint32_t>(bindlessSurfaceStateBaseOffset + ssIndex * surfaceStateSize);
const auto expectedPatchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(surfaceStateOffset));
const auto expectedPatchLocation = reinterpret_cast<uint32_t *>(ptrOffset(mockKernel.getCrossThreadData(), bindlessOffset));
EXPECT_EQ(expectedPatchValue, *expectedPatchLocation);
}
HWTEST_F(HardwareCommandsTest, whenNumLocalIdsIsBiggerThanZeroThenExpectLocalIdsInUseIsTrue) {
mockKernelWithInternal->kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels = 1;
EXPECT_TRUE(HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel));

View File

@ -651,6 +651,8 @@ class KernelArgBufferFixtureBindless : public KernelArgBufferFixture {
pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
pKernelInfo->argAsPtr(0).stateless = undefined<CrossThreadDataOffset>;
pKernelInfo->argAsPtr(0).bindful = undefined<SurfaceStateHeapOffset>;
pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();
}
void tearDown() {
delete pBuffer;
@ -674,6 +676,46 @@ HWTEST_F(KernelArgBufferTestBindless, givenUsedBindlessBuffersWhenSettingKernelA
EXPECT_EQ(0xdeadu, *patchLocation);
}
// Verifies that setting a buffer argument on a bindless kernel encodes the buffer's GPU address
// into the surface state selected through the bindless-offset-to-surface-state map, and that the
// kernel's surface state heap is sized as numArgsStateful * renderSurfaceStateSize.
HWTEST_F(KernelArgBufferTestBindless, givenBindlessArgBufferWhenSettingKernelArgThenSurfaceStateIsEncodedAtProperOffset) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
    const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize();
    EXPECT_EQ(pKernelInfo->kernelDescriptor.kernelAttributes.numArgsStateful * surfaceStateSize, surfaceStateHeapSize);

    cl_mem memObj = pBuffer;
    retVal = pKernel->setArg(0, sizeof(memObj), &memObj);
    // The result was previously stored but never checked.
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Fail cleanly instead of dereferencing end() if the offset was never registered.
    auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto ssIndexIt = bindlessArgsMap.find(bindlessOffset);
    ASSERT_TRUE(ssIndexIt != bindlessArgsMap.end());
    const auto ssIndex = ssIndexIt->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    const auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));
    const auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
    const auto bufferAddress = pBuffer->getGraphicsAllocation(pDevice->getRootDeviceIndex())->getGpuAddress();
    EXPECT_EQ(bufferAddress, surfaceAddress);
}
// Verifies that when the bindless-offset-to-surface-state map is empty, setting a buffer
// argument leaves the kernel's surface state heap byte-for-byte unchanged.
HWTEST_F(KernelArgBufferTestBindless, givenBindlessArgBufferAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) {
    const auto surfaceStateHeap = pKernel->getSurfaceStateHeap();
    const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize();

    // Snapshot the heap so it can be compared after the argument is set.
    auto ssHeapDataInitial = std::make_unique<char[]>(surfaceStateHeapSize);
    std::memcpy(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize);

    // Drop every offset -> surface-state mapping to simulate an uninitialized map.
    pKernelInfo->kernelDescriptor.bindlessArgsMap.clear();

    cl_mem memObj = pBuffer;
    retVal = pKernel->setArg(0, sizeof(memObj), &memObj);
    // The result was previously stored but never checked; the equivalent SVM tests expect success here.
    EXPECT_EQ(CL_SUCCESS, retVal);

    EXPECT_EQ(0, std::memcmp(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize));
}
HWTEST_F(KernelArgBufferTestBindless, givenBindlessBuffersWhenPatchBindlessOffsetCalledThenBindlessOffsetToSurfaceStateWrittenInCrossThreadData) {
pClDevice->getExecutionEnvironment()->rootDeviceEnvironments[pClDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(pDevice,

View File

@ -6,6 +6,7 @@
*/
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "opencl/source/kernel/kernel.h"
@ -116,6 +117,61 @@ HWTEST_F(KernelArgSvmTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArgumentsA
delete[] svmPtr;
}
// Verifies that setArgSvm on a bindless pointer argument programs the SVM pointer as the base
// address of the surface state selected through the bindless-offset-to-surface-state map.
HWTEST_F(KernelArgSvmTest, GivenSvmPtrBindlessWhenSettingKernelArgThenArgumentsAreSetCorrectly) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    if (pClDevice->getDeviceInfo().svmCapabilities == 0) {
        GTEST_SKIP();
    }

    auto hostMemory = std::make_unique<char[]>(256);
    const auto renderSurfaceStateSize = pKernel->getGfxCoreHelper().getRenderSurfaceStateSize();

    // Mark argument 0 as bindless and register its offset in the descriptor's map.
    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    const auto status = pKernel->setArgSvm(0, 256, hostMemory.get(), nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, status);
    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());

    const auto surfaceStateIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second;
    const auto offsetInHeap = surfaceStateIndex * renderSurfaceStateSize;
    const auto encodedState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), offsetInHeap));

    void *programmedAddress = reinterpret_cast<void *>(encodedState->getSurfaceBaseAddress());
    EXPECT_EQ(hostMemory.get(), programmedAddress);
}
// Verifies that setArgSvm succeeds but leaves the surface state heap untouched when the bindless
// offset of the argument has no entry in the bindless-offset-to-surface-state map.
HWTEST_F(KernelArgSvmTest, GivenSvmPtrBindlessAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) {
    if (pClDevice->getDeviceInfo().svmCapabilities == 0) {
        GTEST_SKIP();
    }

    auto hostMemory = std::make_unique<char[]>(256);
    const auto heap = pKernel->getSurfaceStateHeap();
    const auto heapSize = pKernel->getSurfaceStateHeapSize();

    // Argument 0 is bindless, but the map is deliberately left without a matching entry.
    pKernelInfo->argAsPtr(0).bindless = 0x10;

    // Snapshot of the heap contents taken before the argument is set.
    auto heapBefore = std::make_unique<char[]>(heapSize);
    std::memcpy(heapBefore.get(), heap, heapSize);

    pKernelInfo->kernelDescriptor.bindlessArgsMap.clear();

    const auto status = pKernel->setArgSvm(0, 256, hostMemory.get(), nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, status);
    EXPECT_EQ(0, std::memcmp(heapBefore.get(), heap, heapSize));
}
TEST_F(KernelArgSvmTest, GivenValidSvmAllocWhenSettingKernelArgThenArgumentsAreSetCorrectly) {
const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
if (devInfo.svmCapabilities == 0) {
@ -221,6 +277,100 @@ HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerWhenSetArgSvmAllocIsCalledThen
EXPECT_EQ(offsetedPtr, surfaceAddress);
}
// Verifies that setArgSvmAlloc on a bindless pointer argument programs the SVM pointer as the
// base address of the surface state selected through the bindless-offset-to-surface-state map.
HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocBindlessWhenSettingKernelArgThenArgumentsAreSetCorrectly) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    if (pClDevice->getDeviceInfo().svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto renderSurfaceStateSize = pKernel->getGfxCoreHelper().getRenderSurfaceStateSize();

    auto hostMemory = std::make_unique<char[]>(256);
    MockGraphicsAllocation svmAlloc(hostMemory.get(), 256);

    // Mark argument 0 as bindless and register its offset in the descriptor's map.
    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    const auto status = pKernel->setArgSvmAlloc(0, hostMemory.get(), &svmAlloc, 0u);
    EXPECT_EQ(CL_SUCCESS, status);
    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());

    const auto surfaceStateIndex = pKernelInfo->kernelDescriptor.bindlessArgsMap.find(bindlessOffset)->second;
    const auto offsetInHeap = surfaceStateIndex * renderSurfaceStateSize;
    const auto encodedState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), offsetInHeap));

    void *programmedAddress = reinterpret_cast<void *>(encodedState->getSurfaceBaseAddress());
    EXPECT_EQ(hostMemory.get(), programmedAddress);
}
// Verifies that setArgSvmAlloc, given a pointer that lies 4 bytes inside the SVM allocation,
// programs that offset pointer (not the allocation start) as the base address of the surface
// state selected through the bindless-offset-to-surface-state map.
HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerBindlessWhenSetArgSvmAllocIsCalledThenProperSvmAddressIsPatched) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
    if (devInfo.svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    // make_unique instead of a raw new[], matching the neighbouring SVM tests.
    auto svmPtr = std::make_unique<char[]>(256);
    auto offsetedPtr = svmPtr.get() + 4;
    MockGraphicsAllocation svmAlloc(svmPtr.get(), 256);

    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    // The return value was previously discarded; a failing call would otherwise only show up
    // indirectly through the address comparison below.
    const auto retVal = pKernel->setArgSvmAlloc(0, offsetedPtr, &svmAlloc, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Fail cleanly instead of dereferencing end() if the offset was never registered.
    auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto ssIndexIt = bindlessArgsMap.find(bindlessOffset);
    ASSERT_TRUE(ssIndexIt != bindlessArgsMap.end());
    const auto ssIndex = ssIndexIt->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));
    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
    EXPECT_EQ(offsetedPtr, surfaceAddress);
}
// Verifies that setArgSvmAlloc succeeds but leaves the surface state heap untouched when the
// bindless offset of the argument has no entry in the bindless-offset-to-surface-state map.
HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocBindlessAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) {
    if (pClDevice->getDeviceInfo().svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto heap = pKernel->getSurfaceStateHeap();
    const auto heapSize = pKernel->getSurfaceStateHeapSize();

    auto hostMemory = std::make_unique<char[]>(256);
    MockGraphicsAllocation svmAlloc(hostMemory.get(), 256);

    // Argument 0 is bindless, but the map is deliberately left without a matching entry.
    pKernelInfo->argAsPtr(0).bindless = 0x10;

    // Snapshot of the heap contents taken before the argument is set.
    auto heapBefore = std::make_unique<char[]>(heapSize);
    std::memcpy(heapBefore.get(), heap, heapSize);

    pKernelInfo->kernelDescriptor.bindlessArgsMap.clear();

    const auto status = pKernel->setArgSvmAlloc(0, hostMemory.get(), &svmAlloc, 0u);
    EXPECT_EQ(CL_SUCCESS, status);
    EXPECT_EQ(0, std::memcmp(heapBefore.get(), heap, heapSize));
}
HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetArgSvmIsCalledWithSurfaceStateThenSizeIsMaxAndAddressIsProgrammed) {
const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
if (devInfo.svmCapabilities == 0) {
@ -246,6 +396,42 @@ HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetAr
EXPECT_EQ(16384u, surfaceState->getHeight());
}
// Verifies that, with shared-system-memory capabilities enabled on the device, calling
// setArgSvmAlloc with a raw system pointer and no allocation programs that pointer as the
// surface base address and encodes width 128, depth 2048 and height 16384 in the surface state
// selected through the bindless-offset-to-surface-state map.
HWTEST_F(KernelArgSvmTest, givenBindlessArgAndDeviceSupportingSharedSystemAllocationsWhenSetArgSvmIsCalledWithSurfaceStateThenSizeIsMaxAndAddressIsProgrammed) {
    // Declared once; the original repeated this typedef twice in the same scope.
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
    if (devInfo.svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    this->pClDevice->deviceInfo.sharedSystemMemCapabilities = CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL;

    auto systemPointer = reinterpret_cast<void *>(0xfeedbac);

    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    // The return value was previously discarded.
    const auto retVal = pKernel->setArgSvmAlloc(0, systemPointer, nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Fail cleanly instead of dereferencing end() if the offset was never registered.
    auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto ssIndexIt = bindlessArgsMap.find(bindlessOffset);
    ASSERT_TRUE(ssIndexIt != bindlessArgsMap.end());
    const auto ssIndex = ssIndexIt->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));
    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
    EXPECT_EQ(systemPointer, surfaceAddress);
    EXPECT_EQ(128u, surfaceState->getWidth());
    EXPECT_EQ(2048u, surfaceState->getDepth());
    EXPECT_EQ(16384u, surfaceState->getHeight());
}
TEST_F(KernelArgSvmTest, WhenSettingKernelArgImmediateThenInvalidArgValueErrorIsReturned) {
const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
if (devInfo.svmCapabilities == 0) {

View File

@ -24,6 +24,7 @@
#include "shared/test/common/helpers/gtest_helpers.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
#include "shared/test/common/mocks/mock_cpu_page_fault_manager.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
@ -395,6 +396,137 @@ TEST_F(KernelTests, WhenIsSingleSubdevicePreferredIsCalledThenCorrectValuesAreRe
}
}
using BindlessKernelTests = KernelTests;
// Verifies that initializing a kernel that uses bindless buffer addressing creates a local
// surface state heap sized as numArgsStateful * renderSurfaceStateSize.
TEST_F(BindlessKernelTests, GivenBindlessAddressingKernelWhenInitializeThenSurfaceStateIsCreatedWithCorrectSize) {
    KernelInfo kernelInfo{};
    auto &attributes = kernelInfo.kernelDescriptor.kernelAttributes;
    attributes.simdSize = 32;
    attributes.bufferAddressingMode = KernelDescriptor::Bindless;
    attributes.numArgsStateful = 3;

    MockKernel kernel(pProgram, kernelInfo, *pClDevice);
    const auto status = kernel.initialize();
    EXPECT_EQ(CL_SUCCESS, status);

    const auto renderSurfaceStateSize = static_cast<uint32_t>(pClDevice->getGfxCoreHelper().getRenderSurfaceStateSize());
    const auto expectedHeapSize = attributes.numArgsStateful * renderSurfaceStateSize;

    const auto heap = kernel.getSurfaceStateHeap();
    const auto heapSize = kernel.getSurfaceStateHeapSize();
    EXPECT_EQ(expectedHeapSize, heapSize);
    EXPECT_NE(nullptr, heap);
}
// Verifies that patchBindlessOffsetsInCrossThreadData() writes the extended message
// descriptor value of consecutive surface states (starting at the given base offset)
// into the cross-thread data slot of every bindless argument, and leaves the slot of
// a stateless-only argument untouched.
TEST_F(BindlessKernelTests, givenBindlessKernelWhenPatchingCrossThreadDataThenCorrectBindlessOffsetsAreWritten) {
    // Explicit arg 0: bindless buffer, patched at cross-thread offset 0.
    auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    argDescriptor.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
    argDescriptor.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptor.as<NEO::ArgDescPointer>().bindless = 0x0;

    // Explicit arg 1: bindless image, patched at cross-thread offset 8.
    auto argDescriptorImg = NEO::ArgDescriptor(NEO::ArgDescriptor::argTImage);
    argDescriptorImg.as<NEO::ArgDescImage>() = NEO::ArgDescImage();
    argDescriptorImg.as<NEO::ArgDescImage>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptorImg.as<NEO::ArgDescImage>().bindless = sizeof(uint64_t);

    // Explicit arg 2: stateless-only buffer at cross-thread offset 16; it has no bindless offset.
    auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    argDescriptor2.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
    argDescriptor2.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptor2.as<NEO::ArgDescPointer>().stateless = 2 * sizeof(uint64_t);

    auto &kernelDescriptor = pProgram->mockKernelInfo.kernelDescriptor;
    kernelDescriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
    kernelDescriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;

    kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor);
    kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptorImg);
    kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor2);

    // Implicit bindless surfaces occupy cross-thread offsets 24 and 32.
    kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = 3 * sizeof(uint64_t);
    kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless = 4 * sizeof(uint64_t);

    MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice);

    // Build the bindless-offset -> surface-state-index map consulted while patching.
    kernelDescriptor.initBindlessOffsetToSurfaceState();

    // Five zero-initialized 64-bit cross-thread slots, one per offset used above.
    mockKernel.crossThreadData = new char[5 * sizeof(uint64_t)];
    mockKernel.crossThreadDataSize = 5 * sizeof(uint64_t);
    memset(mockKernel.crossThreadData, 0x00, mockKernel.crossThreadDataSize);

    const uint64_t baseAddress = 0x1000;
    auto &gfxCoreHelper = pClDevice->getGfxCoreHelper();
    auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    // Expected values: surface state N lives at baseAddress + N * surfaceStateSize.
    auto patchValue1 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress));
    auto patchValue2 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress + 1 * surfaceStateSize));
    auto patchValue3 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress + 2 * surfaceStateSize));
    auto patchValue4 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress + 3 * surfaceStateSize));

    mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress);

    // Copy the raw bytes out so they can be inspected as 64-bit slots.
    auto crossThreadData = std::make_unique<uint64_t[]>(mockKernel.crossThreadDataSize / sizeof(uint64_t));
    memcpy(crossThreadData.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize);

    EXPECT_EQ(patchValue1, crossThreadData[0]);
    EXPECT_EQ(patchValue2, crossThreadData[1]);
    EXPECT_EQ(0u, crossThreadData[2]); // stateless argument slot must stay untouched
    EXPECT_EQ(patchValue3, crossThreadData[3]);
    EXPECT_EQ(patchValue4, crossThreadData[4]);
}
// Verifies that when the bindless-offset map was never initialized (no call to
// initBindlessOffsetToSurfaceState()), patching leaves the cross-thread data untouched.
TEST_F(BindlessKernelTests, givenNoEntryInBindlessOffsetsMapWhenPatchingCrossThreadDataThenMemoryIsNotPatched) {
    auto &descriptor = pProgram->mockKernelInfo.kernelDescriptor;
    descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
    descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;

    // A single bindless buffer argument at cross-thread offset 0.
    auto bufferArg = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    bufferArg.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
    bufferArg.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    bufferArg.as<NEO::ArgDescPointer>().bindless = 0x0;
    descriptor.payloadMappings.explicitArgs.push_back(bufferArg);

    descriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = sizeof(uint64_t);

    MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice);

    // Zero-filled cross-thread data; any patch would show up as a non-zero slot.
    mockKernel.crossThreadDataSize = 4 * sizeof(uint64_t);
    mockKernel.crossThreadData = new char[4 * sizeof(uint64_t)];
    memset(mockKernel.crossThreadData, 0, mockKernel.crossThreadDataSize);

    const uint64_t baseAddress = 0x1000;
    mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress);

    const auto slotCount = mockKernel.crossThreadDataSize / sizeof(uint64_t);
    auto patchedSlots = std::make_unique<uint64_t[]>(slotCount);
    memcpy(patchedSlots.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize);

    EXPECT_EQ(0u, patchedSlots[0]);
}
// Verifies that a kernel whose only explicit argument is a by-value argument
// gets no bindless offsets patched into its cross-thread data.
TEST_F(BindlessKernelTests, givenNoStatefulArgsWhenPatchingBindlessOffsetsInCrossThreadDataThenMemoryIsNotPatched) {
    auto &descriptor = pProgram->mockKernelInfo.kernelDescriptor;
    descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
    descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;

    // The only explicit argument is a plain value; there is nothing stateful to patch.
    auto valueArg = NEO::ArgDescriptor(NEO::ArgDescriptor::argTValue);
    valueArg.as<NEO::ArgDescValue>() = NEO::ArgDescValue();
    valueArg.as<NEO::ArgDescValue>().elements.push_back(NEO::ArgDescValue::Element{0, 8, 0, false});
    descriptor.payloadMappings.explicitArgs.push_back(valueArg);

    MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice);

    // One zero-filled 64-bit slot of cross-thread data.
    mockKernel.crossThreadDataSize = sizeof(uint64_t);
    mockKernel.crossThreadData = new char[sizeof(uint64_t)];
    memset(mockKernel.crossThreadData, 0, mockKernel.crossThreadDataSize);

    const uint64_t baseAddress = 0x1000;
    mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress);

    const auto slotCount = mockKernel.crossThreadDataSize / sizeof(uint64_t);
    auto patchedSlots = std::make_unique<uint64_t[]>(slotCount);
    memcpy(patchedSlots.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize);

    EXPECT_EQ(0u, patchedSlots[0]);
}
class KernelFromBinaryTest : public ProgramSimpleFixture {
public:
void setUp() {
@ -1218,6 +1350,42 @@ HWTEST_F(KernelResidencyTest, givenKernelWhenMakeResidentIsCalledThenGlobalBuffe
memoryManager->freeGraphicsMemory(pKernelInfo->kernelAllocation);
}
// Verifies that, with a bindless heaps helper installed and bindless slots allocated
// for the program's global and constant surfaces, Kernel::makeResident() makes both
// the surfaces and the heap allocations backing their bindless slots resident.
HWTEST_F(KernelResidencyTest, givenBindlessHeapsHelperAndGlobalAndConstantBuffersWhenMakeResidentIsCalledThenGlobalAndConstantBufferHeapAllocationsAreMadeResident) {
    // Install the mock helper; ownership is taken over by the root device environment.
    auto mockHeapsHelper = new MockBindlesHeapsHelper(pDevice, false);
    pDevice->getExecutionEnvironment()->rootDeviceEnvironments[pDevice->getRootDeviceIndex()]->bindlessHeapsHelper.reset(mockHeapsHelper);

    auto pKernelInfo = std::make_unique<KernelInfo>();
    pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;

    // Record every allocation passed to makeResident so it can be queried below.
    auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver<FamilyType>();
    commandStreamReceiver.storeMakeResidentAllocations = true;

    auto memoryManager = commandStreamReceiver.getMemoryManager();
    pKernelInfo->kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize});

    MockProgram program(toClDeviceVector(*pClDevice));
    MockContext ctx;
    program.setContext(&ctx);

    // Attach global and constant surfaces to the program and give each a bindless slot.
    auto &buildInfo = program.buildInfos[pDevice->getRootDeviceIndex()];
    buildInfo.globalSurface = new MockGraphicsAllocation();
    buildInfo.constantSurface = new MockGraphicsAllocation();
    EXPECT_TRUE(memoryManager->allocateBindlessSlot(buildInfo.globalSurface));
    EXPECT_TRUE(memoryManager->allocateBindlessSlot(buildInfo.constantSurface));

    std::unique_ptr<MockKernel> kernel(new MockKernel(&program, *pKernelInfo, *pClDevice));
    ASSERT_EQ(CL_SUCCESS, kernel->initialize());

    EXPECT_EQ(0u, commandStreamReceiver.makeResidentAllocations.size());

    kernel->makeResident(pDevice->getGpgpuCommandStreamReceiver());

    EXPECT_TRUE(commandStreamReceiver.isMadeResident(buildInfo.globalSurface));
    EXPECT_TRUE(commandStreamReceiver.isMadeResident(program.getGlobalSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation));
    EXPECT_TRUE(commandStreamReceiver.isMadeResident(buildInfo.constantSurface));
    EXPECT_TRUE(commandStreamReceiver.isMadeResident(program.getConstantSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation));

    memoryManager->freeGraphicsMemory(pKernelInfo->kernelAllocation);
}
HWTEST_F(KernelResidencyTest, givenKernelWhenItUsesIndirectUnifiedMemoryDeviceAllocationThenTheyAreMadeResident) {
MockKernelWithInternals mockKernel(*this->pClDevice);
auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver<FamilyType>();
@ -2962,6 +3130,108 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionDisabledWhenPatchWithImplicit
EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size());
}
// Verifies that patchWithImplicitSurface() for a bindless buffer argument encodes the
// surface state inside the kernel's own surface state heap, at the slot selected by
// the bindless-offset -> surface-state-index map.
HWTEST_F(KernelTest, givenBindlessArgBufferWhenPatchWithImplicitSurfaceThenSurfaceStateIsEncodedAtProperOffset) {
    auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
    MockKernelWithInternals kernel(*device);

    // Mock allocation whose CPU pointer and GPU address share the same numeric value.
    uint64_t gpuAddress = 0x1200;
    const void *cpuPtr = reinterpret_cast<const void *>(gpuAddress);
    size_t allocSize = 0x1000;
    MockGraphicsAllocation mockAllocation(const_cast<void *>(cpuPtr), gpuAddress, allocSize);

    kernel.kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;

    const CrossThreadDataOffset bindlessOffset = 0x10;
    kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined<CrossThreadDataOffset>, bindlessOffset);
    kernel.kernelInfo.kernelDescriptor.initBindlessOffsetToSurfaceState();

    uint64_t crossThreadData = 0;
    kernel.mockKernel->patchWithImplicitSurface(castToUint64(&crossThreadData), mockAllocation, kernel.kernelInfo.argAsPtr(0));

    const auto &gfxCoreHelper = device->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    // Guard the lookup: dereferencing end() would be undefined behavior if the map
    // has no entry for this bindless offset.
    const auto &bindlessArgsMap = kernel.kernelInfo.kernelDescriptor.bindlessArgsMap;
    const auto ssIndexIt = bindlessArgsMap.find(bindlessOffset);
    ASSERT_NE(bindlessArgsMap.end(), ssIndexIt);

    const auto ssIndex = ssIndexIt->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
    const auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(kernel.mockKernel->getSurfaceStateHeap(), ssOffset));
    const auto surfaceAddress = surfaceState->getSurfaceBaseAddress();

    // The encoded surface base address must match the allocation's address to patch.
    const auto bufferAddress = mockAllocation.getGpuAddressToPatch();
    EXPECT_EQ(bufferAddress, surfaceAddress);
}
// Verifies that patchWithImplicitSurface() leaves the kernel's surface state heap
// byte-for-byte unchanged when the bindless-offset map has no entry for the argument.
HWTEST_F(KernelTest, givenBindlessArgBufferAndNotInitializedBindlessOffsetToSurfaceStateWhenPatchWithImplicitSurfaceThenSurfaceStateIsNotEncoded) {
    auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
    MockKernelWithInternals kernel(*device);

    // Mock allocation whose CPU pointer and GPU address share the same numeric value.
    uint64_t gpuAddress = 0x1200;
    size_t allocSize = 0x1000;
    const void *cpuPtr = reinterpret_cast<const void *>(gpuAddress);
    MockGraphicsAllocation mockAllocation(const_cast<void *>(cpuPtr), gpuAddress, allocSize);

    auto &descriptor = kernel.kernelInfo.kernelDescriptor;
    descriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;

    const CrossThreadDataOffset bindlessOffset = 0x10;
    kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined<CrossThreadDataOffset>, bindlessOffset);

    // Snapshot the heap contents before patching.
    const auto heapPtr = kernel.mockKernel->getSurfaceStateHeap();
    const auto heapSize = kernel.mockKernel->getSurfaceStateHeapSize();
    auto heapSnapshot = std::make_unique<char[]>(heapSize);
    std::memcpy(heapSnapshot.get(), heapPtr, heapSize);

    // Make sure no bindless offset maps to a surface state index.
    descriptor.bindlessArgsMap.clear();

    uint64_t crossThreadData = 0;
    kernel.mockKernel->patchWithImplicitSurface(castToUint64(&crossThreadData), mockAllocation, kernel.kernelInfo.argAsPtr(0));

    EXPECT_EQ(0, std::memcmp(heapSnapshot.get(), heapPtr, heapSize));
}
// Verifies that, with a bindless heaps helper installed and a bindless slot allocated
// for the allocation, patchWithImplicitSurface() writes the slot's descriptor value
// into the kernel's cross-thread data at the bindless offset and encodes the surface
// state stored in that slot.
HWTEST_F(KernelTest, givenBindlessHeapsHelperAndBindlessArgBufferWhenPatchWithImplicitSurfaceThenCrossThreadDataIsPatchedAndSurfaceStateIsEncoded) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
    auto &neoDevice = device->getDevice();

    // Install the mock helper; ownership is taken over by the root device environment.
    auto mockHeapsHelper = new MockBindlesHeapsHelper(&neoDevice, false);
    neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->bindlessHeapsHelper.reset(mockHeapsHelper);

    MockKernelWithInternals kernel(*device);

    // Mock allocation whose CPU pointer and GPU address share the same numeric value.
    uint64_t gpuAddress = 0x1200;
    size_t allocSize = 0x1000;
    const void *cpuPtr = reinterpret_cast<const void *>(gpuAddress);
    MockGraphicsAllocation mockAllocation(const_cast<void *>(cpuPtr), gpuAddress, allocSize);

    kernel.kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;

    EXPECT_TRUE(device->getMemoryManager()->allocateBindlessSlot(&mockAllocation));

    const CrossThreadDataOffset bindlessOffset = 0x10;
    kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined<CrossThreadDataOffset>, bindlessOffset);
    kernel.kernelInfo.kernelDescriptor.initBindlessOffsetToSurfaceState();

    uint64_t crossThreadData = 0;
    kernel.mockKernel->patchWithImplicitSurface(castToUint64(&crossThreadData), mockAllocation, kernel.kernelInfo.argAsPtr(0));

    // The value is read back from the kernel's own cross-thread data at the bindless offset.
    auto bindlessSlotInfo = mockAllocation.getBindlessInfo();
    auto patchedLocation = reinterpret_cast<uint32_t *>(ptrOffset(kernel.mockKernel->crossThreadData, bindlessOffset));
    auto expectedPatchValue = device->getGfxCoreHelper().getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(bindlessSlotInfo.surfaceStateOffset));
    EXPECT_EQ(expectedPatchValue, *patchedLocation);

    // The surface state in the allocation's bindless slot must point at the buffer.
    const auto encodedSurfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(bindlessSlotInfo.ssPtr);
    const auto encodedAddress = encodedSurfaceState->getSurfaceBaseAddress();
    const auto expectedAddress = mockAllocation.getGpuAddressToPatch();
    EXPECT_EQ(expectedAddress, encodedAddress);
}
TEST(KernelTest, givenDefaultKernelWhenItIsCreatedThenItReportsStatelessWrites) {
auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
MockKernelWithInternals kernel(*device);

View File

@ -98,6 +98,8 @@ class MockKernel : public Kernel {
using Kernel::anyKernelArgumentUsingSystemMemory;
using Kernel::auxTranslationRequired;
using Kernel::containsStatelessWrites;
using Kernel::crossThreadData;
using Kernel::crossThreadDataSize;
using Kernel::dataParameterSimdSize;
using Kernel::executionType;
using Kernel::getDevice;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2023 Intel Corporation
* Copyright (C) 2018-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -15,6 +15,7 @@
#include "shared/test/common/device_binary_format/patchtokens_tests.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/gtest_helpers.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
#include "shared/test/common/mocks/mock_csr.h"
#include "shared/test/common/mocks/mock_execution_environment.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
@ -324,6 +325,116 @@ TEST_F(ProgramDataTest, whenGlobalVariablesAreNotExportedThenAllocateSurfacesAsN
EXPECT_EQ(nullptr, this->pContext->getSVMAllocsManager()->getSVMAlloc(reinterpret_cast<const void *>(pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex())->getGpuAddress())));
}
// Suite alias: the bindless program-data tests below reuse the ProgramDataTest fixture
// but are registered under the ProgramDataBindlessTest suite name.
using ProgramDataBindlessTest = ProgramDataTest;
// Verifies that processProgramInfo() allocates bindless slots for both the global
// constants and the global variables surfaces when the program contains a bindless
// kernel (here: one bindful kernel followed by one bindless kernel).
TEST_F(ProgramDataBindlessTest, givenBindlessKernelAndConstantsAndVariablesMemorySurfaceWhenProcessProgramInfoThenConstantsAndVariablesSurfaceBindlessSlotIsAllocated) {
    auto &neoDevice = pClDevice->getDevice();
    neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->memoryOperationsInterface =
        std::make_unique<NEO::MockMemoryOperations>();

    // Install the mock helper; ownership is taken over by the root device environment.
    auto bindlessHeapHelper = new MockBindlesHeapsHelper(&neoDevice, false);
    neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapHelper);

    ProgramInfo programInfo;

    // Non-empty init data for both surfaces so that both get allocated.
    char globalConstantsData[128] = {};
    programInfo.globalConstants.initData = globalConstantsData;
    programInfo.globalConstants.size = sizeof(globalConstantsData);

    char globalVariablesData[128] = {};
    programInfo.globalVariables.initData = globalVariablesData;
    programInfo.globalVariables.size = sizeof(globalVariablesData);

    // First kernel is bindful, second one is bindless. The second assignment used to
    // target kernelInfo1 again, leaving kernelInfo2 unconfigured.
    auto kernelInfo1 = std::make_unique<KernelInfo>();
    kernelInfo1->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::Bindful;

    auto kernelInfo2 = std::make_unique<KernelInfo>();
    kernelInfo2->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;

    // Raw pointers are released into programInfo.kernelInfos.
    programInfo.kernelInfos.push_back(kernelInfo1.release());
    programInfo.kernelInfos.push_back(kernelInfo2.release());

    std::unique_ptr<WhiteBox<NEO::LinkerInput>> mockLinkerInput = std::make_unique<WhiteBox<NEO::LinkerInput>>();
    programInfo.linkerInput = std::move(mockLinkerInput);

    this->pProgram->processProgramInfo(programInfo, *pClDevice);

    ASSERT_NE(nullptr, pProgram->getConstantSurface(pContext->getDevice(0)->getRootDeviceIndex()));
    ASSERT_NE(nullptr, pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex()));

    // A non-null heapAllocation in the bindless info means a slot was allocated.
    auto globalConstantsAlloc = pProgram->getConstantSurface(pContext->getDevice(0)->getRootDeviceIndex());
    auto ssInHeap1 = globalConstantsAlloc->getBindlessInfo();
    EXPECT_NE(nullptr, ssInHeap1.heapAllocation);

    auto globalVariablesAlloc = pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex());
    auto ssInHeap2 = globalVariablesAlloc->getBindlessInfo();
    EXPECT_NE(nullptr, ssInHeap2.heapAllocation);
}
// Verifies that processProgramInfo() returns CL_OUT_OF_HOST_MEMORY and leaves the
// global constants surface without a bindless slot when the heaps helper fails to
// allocate a surface state.
TEST_F(ProgramDataBindlessTest, givenBindlessKernelAndGlobalConstantsMemorySurfaceWhenProcessProgramInfoAndSSAllocationFailsThenGlobalConstantsSurfaceBindlessSlotIsNotAllocatedAndReturnOutOfHostMemory) {
    auto &neoDevice = pClDevice->getDevice();
    auto &rootDeviceEnvironment = *neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()];
    rootDeviceEnvironment.memoryOperationsInterface = std::make_unique<NEO::MockMemoryOperations>();

    // Mock helper configured to fail surface state allocation; ownership is taken
    // over by the root device environment.
    auto failingHeapsHelper = new MockBindlesHeapsHelper(&neoDevice, false);
    failingHeapsHelper->failAllocateSS = true;
    rootDeviceEnvironment.bindlessHeapsHelper.reset(failingHeapsHelper);

    ProgramInfo programInfo;

    char constantsInitData[128] = {};
    programInfo.globalConstants.initData = constantsInitData;
    programInfo.globalConstants.size = sizeof(constantsInitData);

    // A single bindless kernel; the raw pointer is released into programInfo.kernelInfos.
    auto bindlessKernelInfo = std::make_unique<KernelInfo>();
    bindlessKernelInfo->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;
    programInfo.kernelInfos.push_back(bindlessKernelInfo.release());

    programInfo.linkerInput = std::make_unique<WhiteBox<NEO::LinkerInput>>();

    auto ret = this->pProgram->processProgramInfo(programInfo, *pClDevice);
    EXPECT_EQ(ret, CL_OUT_OF_HOST_MEMORY);

    // The surface itself still exists, but no bindless slot is attached to it.
    auto constantsSurface = pProgram->getConstantSurface(pContext->getDevice(0)->getRootDeviceIndex());
    ASSERT_NE(nullptr, constantsSurface);

    auto bindlessInfo = constantsSurface->getBindlessInfo();
    EXPECT_EQ(nullptr, bindlessInfo.heapAllocation);
}
// Verifies that processProgramInfo() returns CL_OUT_OF_HOST_MEMORY and leaves the
// global variables surface without a bindless slot when the heaps helper fails to
// allocate a surface state.
TEST_F(ProgramDataBindlessTest, givenBindlessKernelAndGlobalVariablesMemorySurfaceWhenProcessProgramInfoAndSSAllocationFailsThenGlobalVariablesSurfaceBindlessSlotIsNotAllocatedAndReturnOutOfHostMemory) {
    auto &neoDevice = pClDevice->getDevice();
    auto &rootDeviceEnvironment = *neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()];
    rootDeviceEnvironment.memoryOperationsInterface = std::make_unique<NEO::MockMemoryOperations>();

    // Mock helper configured to fail surface state allocation; ownership is taken
    // over by the root device environment.
    auto failingHeapsHelper = new MockBindlesHeapsHelper(&neoDevice, false);
    failingHeapsHelper->failAllocateSS = true;
    rootDeviceEnvironment.bindlessHeapsHelper.reset(failingHeapsHelper);

    ProgramInfo programInfo;

    char variablesInitData[128] = {};
    programInfo.globalVariables.initData = variablesInitData;
    programInfo.globalVariables.size = sizeof(variablesInitData);

    // A single bindless kernel; the raw pointer is released into programInfo.kernelInfos.
    auto bindlessKernelInfo = std::make_unique<KernelInfo>();
    bindlessKernelInfo->kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;
    programInfo.kernelInfos.push_back(bindlessKernelInfo.release());

    programInfo.linkerInput = std::make_unique<WhiteBox<NEO::LinkerInput>>();

    auto ret = this->pProgram->processProgramInfo(programInfo, *pClDevice);
    EXPECT_EQ(ret, CL_OUT_OF_HOST_MEMORY);

    // The surface itself still exists, but no bindless slot is attached to it.
    auto variablesSurface = pProgram->getGlobalSurface(pContext->getDevice(0)->getRootDeviceIndex());
    ASSERT_NE(nullptr, variablesSurface);

    auto bindlessInfo = variablesSurface->getBindlessInfo();
    EXPECT_EQ(nullptr, bindlessInfo.heapAllocation);
}
TEST_F(ProgramDataTest, givenConstantAllocationThatIsInUseByGpuWhenProgramIsBeingDestroyedThenItIsAddedToTemporaryAllocationList) {
setupConstantAllocation();