fix: opencl support for bindless kernels

Related-To: NEO-11156
Signed-off-by: Fabian Zwoliński <fabian.zwolinski@intel.com>
This commit is contained in:
Fabian Zwoliński
2024-04-29 15:53:09 +00:00
committed by Compute-Runtime-Automation
parent 5e57bb2a32
commit ee71157f7f
10 changed files with 872 additions and 11 deletions

View File

@@ -651,6 +651,8 @@ class KernelArgBufferFixtureBindless : public KernelArgBufferFixture {
pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
pKernelInfo->argAsPtr(0).stateless = undefined<CrossThreadDataOffset>;
pKernelInfo->argAsPtr(0).bindful = undefined<SurfaceStateHeapOffset>;
pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();
}
void tearDown() {
delete pBuffer;
@@ -674,6 +676,46 @@ HWTEST_F(KernelArgBufferTestBindless, givenUsedBindlessBuffersWhenSettingKernelA
EXPECT_EQ(0xdeadu, *patchLocation);
}
// Setting a buffer on a bindless pointer argument must encode a RENDER_SURFACE_STATE for
// that buffer inside the kernel's surface state heap, in the slot that bindlessArgsMap
// associates with the argument's bindless offset.
HWTEST_F(KernelArgBufferTestBindless, givenBindlessArgBufferWhenSettingKernelArgThenSurfaceStateIsEncodedAtProperOffset) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
    const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize();

    // The heap is expected to hold exactly one surface state per stateful argument.
    EXPECT_EQ(pKernelInfo->kernelDescriptor.kernelAttributes.numArgsStateful * surfaceStateSize, surfaceStateHeapSize);

    cl_mem memObj = pBuffer;
    retVal = pKernel->setArg(0, sizeof(memObj), &memObj);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Abort the test instead of dereferencing end() if the offset was never registered.
    const auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto mapEntry = bindlessArgsMap.find(bindlessOffset);
    ASSERT_NE(bindlessArgsMap.end(), mapEntry);

    const auto ssIndex = mapEntry->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    const auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));
    const auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
    const auto bufferAddress = pBuffer->getGraphicsAllocation(pDevice->getRootDeviceIndex())->getGpuAddress();
    EXPECT_EQ(bufferAddress, surfaceAddress);
}
// When bindlessArgsMap holds no entry for the argument, setting the buffer argument must
// leave the kernel's surface state heap byte-for-byte unchanged.
HWTEST_F(KernelArgBufferTestBindless, givenBindlessArgBufferAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) {
    const auto surfaceStateHeap = pKernel->getSurfaceStateHeap();
    const auto surfaceStateHeapSize = pKernel->getSurfaceStateHeapSize();

    // Snapshot of the heap contents taken before the argument is set.
    auto ssHeapDataInitial = std::make_unique<char[]>(surfaceStateHeapSize);
    std::memcpy(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize);

    // Drop every offset-to-surface-state mapping to simulate the uninitialized state.
    pKernelInfo->kernelDescriptor.bindlessArgsMap.clear();

    cl_mem memObj = pBuffer;
    retVal = pKernel->setArg(0, sizeof(memObj), &memObj);
    EXPECT_EQ(CL_SUCCESS, retVal);

    EXPECT_EQ(0, std::memcmp(ssHeapDataInitial.get(), surfaceStateHeap, surfaceStateHeapSize));
}
HWTEST_F(KernelArgBufferTestBindless, givenBindlessBuffersWhenPatchBindlessOffsetCalledThenBindlessOffsetToSurfaceStateWrittenInCrossThreadData) {
pClDevice->getExecutionEnvironment()->rootDeviceEnvironments[pClDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(pDevice,

View File

@@ -6,6 +6,7 @@
*/
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/test/common/test_macros/hw_test.h"
#include "opencl/source/kernel/kernel.h"
@@ -116,6 +117,61 @@ HWTEST_F(KernelArgSvmTest, GivenSvmPtrStatefulWhenSettingKernelArgThenArgumentsA
delete[] svmPtr;
}
// Setting an SVM pointer on a bindless pointer argument must encode a surface state whose
// base address is the SVM pointer, in the heap slot mapped to the argument's bindless offset.
HWTEST_F(KernelArgSvmTest, GivenSvmPtrBindlessWhenSettingKernelArgThenArgumentsAreSetCorrectly) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
    if (devInfo.svmCapabilities == 0) {
        GTEST_SKIP();
    }

    auto svmPtr = std::make_unique<char[]>(256);

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    // The lookup below relies on this call having registered the offset in bindlessArgsMap.
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    auto retVal = pKernel->setArgSvm(0, 256, svmPtr.get(), nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());

    // Abort the test instead of dereferencing end() if the offset was never registered.
    const auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto mapEntry = bindlessArgsMap.find(bindlessOffset);
    ASSERT_NE(bindlessArgsMap.end(), mapEntry);

    const auto ssIndex = mapEntry->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
        ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
    EXPECT_EQ(svmPtr.get(), surfaceAddress);
}
// With bindlessArgsMap emptied, setArgSvm is still expected to succeed, but it must not
// write anything into the kernel's surface state heap.
HWTEST_F(KernelArgSvmTest, GivenSvmPtrBindlessAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) {
    if (pClDevice->getDeviceInfo().svmCapabilities == 0) {
        GTEST_SKIP();
    }

    constexpr size_t svmSize = 256;
    auto svmMemory = std::make_unique<char[]>(svmSize);

    const auto heapPtr = pKernel->getSurfaceStateHeap();
    const auto heapSize = pKernel->getSurfaceStateHeapSize();

    pKernelInfo->argAsPtr(0).bindless = 0x10;

    // Keep a copy of the heap as it looked before the argument was set.
    auto heapSnapshot = std::make_unique<char[]>(heapSize);
    std::memcpy(heapSnapshot.get(), heapPtr, heapSize);

    // No offset-to-surface-state mapping exists from here on.
    pKernelInfo->kernelDescriptor.bindlessArgsMap.clear();

    const auto status = pKernel->setArgSvm(0, svmSize, svmMemory.get(), nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, status);

    EXPECT_EQ(0, std::memcmp(heapSnapshot.get(), heapPtr, heapSize));
}
TEST_F(KernelArgSvmTest, GivenValidSvmAllocWhenSettingKernelArgThenArgumentsAreSetCorrectly) {
const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
if (devInfo.svmCapabilities == 0) {
@@ -221,6 +277,100 @@ HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerWhenSetArgSvmAllocIsCalledThen
EXPECT_EQ(offsetedPtr, surfaceAddress);
}
// Setting an SVM allocation on a bindless pointer argument must encode a surface state whose
// base address is the SVM pointer, in the heap slot mapped to the argument's bindless offset.
HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocBindlessWhenSettingKernelArgThenArgumentsAreSetCorrectly) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
    if (devInfo.svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    auto svmPtr = std::make_unique<char[]>(256);
    MockGraphicsAllocation svmAlloc(svmPtr.get(), 256);

    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    // The lookup below relies on this call having registered the offset in bindlessArgsMap.
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    auto retVal = pKernel->setArgSvmAlloc(0, svmPtr.get(), &svmAlloc, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_NE(0u, pKernel->getSurfaceStateHeapSize());

    // Abort the test instead of dereferencing end() if the offset was never registered.
    const auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto mapEntry = bindlessArgsMap.find(bindlessOffset);
    ASSERT_NE(bindlessArgsMap.end(), mapEntry);

    const auto ssIndex = mapEntry->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
        ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
    EXPECT_EQ(svmPtr.get(), surfaceAddress);
}
// When the pointer passed to setArgSvmAlloc is offset from the start of the SVM allocation,
// the surface state encoded for the bindless argument must carry the offset pointer.
HWTEST_F(KernelArgSvmTest, givenOffsetedSvmPointerBindlessWhenSetArgSvmAllocIsCalledThenProperSvmAddressIsPatched) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
    if (devInfo.svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    auto svmPtr = std::make_unique<char[]>(256);
    auto offsetedPtr = svmPtr.get() + 4;
    MockGraphicsAllocation svmAlloc(svmPtr.get(), 256);

    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    // The lookup below relies on this call having registered the offset in bindlessArgsMap.
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    auto retVal = pKernel->setArgSvmAlloc(0, offsetedPtr, &svmAlloc, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Abort the test instead of dereferencing end() if the offset was never registered.
    const auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto mapEntry = bindlessArgsMap.find(bindlessOffset);
    ASSERT_NE(bindlessArgsMap.end(), mapEntry);

    const auto ssIndex = mapEntry->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
        ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
    EXPECT_EQ(offsetedPtr, surfaceAddress);
}
// With bindlessArgsMap emptied, setArgSvmAlloc is still expected to succeed, but it must not
// write anything into the kernel's surface state heap.
HWTEST_F(KernelArgSvmTest, GivenValidSvmAllocBindlessAndNotInitializedBindlessOffsetToSurfaceStateWhenSettingKernelArgThenSurfaceStateIsNotEncoded) {
    if (pClDevice->getDeviceInfo().svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto heapPtr = pKernel->getSurfaceStateHeap();
    const auto heapSize = pKernel->getSurfaceStateHeapSize();

    constexpr size_t svmSize = 256;
    auto svmMemory = std::make_unique<char[]>(svmSize);
    MockGraphicsAllocation svmGraphicsAllocation(svmMemory.get(), svmSize);

    pKernelInfo->argAsPtr(0).bindless = 0x10;

    // Keep a copy of the heap as it looked before the argument was set.
    auto heapSnapshot = std::make_unique<char[]>(heapSize);
    std::memcpy(heapSnapshot.get(), heapPtr, heapSize);

    // No offset-to-surface-state mapping exists from here on.
    pKernelInfo->kernelDescriptor.bindlessArgsMap.clear();

    const auto status = pKernel->setArgSvmAlloc(0, svmMemory.get(), &svmGraphicsAllocation, 0u);
    EXPECT_EQ(CL_SUCCESS, status);

    EXPECT_EQ(0, std::memcmp(heapSnapshot.get(), heapPtr, heapSize));
}
HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetArgSvmIsCalledWithSurfaceStateThenSizeIsMaxAndAddressIsProgrammed) {
const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
if (devInfo.svmCapabilities == 0) {
@@ -246,6 +396,42 @@ HWTEST_F(KernelArgSvmTest, givenDeviceSupportingSharedSystemAllocationsWhenSetAr
EXPECT_EQ(16384u, surfaceState->getHeight());
}
// On a device reporting shared system memory capabilities, passing a system pointer with no
// backing allocation to setArgSvmAlloc must program a surface state at the bindless slot with
// that pointer as base address and the width/depth/height values asserted below.
HWTEST_F(KernelArgSvmTest, givenBindlessArgAndDeviceSupportingSharedSystemAllocationsWhenSetArgSvmIsCalledWithSurfaceStateThenSizeIsMaxAndAddressIsProgrammed) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
    if (devInfo.svmCapabilities == 0) {
        GTEST_SKIP();
    }

    const auto &gfxCoreHelper = pKernel->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    // Advertise every shared-system-memory capability bit on the device under test.
    this->pClDevice->deviceInfo.sharedSystemMemCapabilities = CL_UNIFIED_SHARED_MEMORY_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_ATOMIC_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ACCESS_INTEL | CL_UNIFIED_SHARED_MEMORY_CONCURRENT_ATOMIC_ACCESS_INTEL;

    auto systemPointer = reinterpret_cast<void *>(0xfeedbac);

    const auto bindlessOffset = 0x10;
    pKernelInfo->argAsPtr(0).bindless = bindlessOffset;
    // The lookup below relies on this call having registered the offset in bindlessArgsMap.
    pKernelInfo->kernelDescriptor.initBindlessOffsetToSurfaceState();

    auto retVal = pKernel->setArgSvmAlloc(0, systemPointer, nullptr, 0u);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Abort the test instead of dereferencing end() if the offset was never registered.
    const auto &bindlessArgsMap = pKernelInfo->kernelDescriptor.bindlessArgsMap;
    const auto mapEntry = bindlessArgsMap.find(bindlessOffset);
    ASSERT_NE(bindlessArgsMap.end(), mapEntry);

    const auto ssIndex = mapEntry->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(
        ptrOffset(pKernel->getSurfaceStateHeap(), ssOffset));

    void *surfaceAddress = reinterpret_cast<void *>(surfaceState->getSurfaceBaseAddress());
    EXPECT_EQ(systemPointer, surfaceAddress);
    EXPECT_EQ(128u, surfaceState->getWidth());
    EXPECT_EQ(2048u, surfaceState->getDepth());
    EXPECT_EQ(16384u, surfaceState->getHeight());
}
TEST_F(KernelArgSvmTest, WhenSettingKernelArgImmediateThenInvalidArgValueErrorIsReturned) {
const ClDeviceInfo &devInfo = pClDevice->getDeviceInfo();
if (devInfo.svmCapabilities == 0) {

View File

@@ -24,6 +24,7 @@
#include "shared/test/common/helpers/gtest_helpers.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/mocks/mock_bindless_heaps_helper.h"
#include "shared/test/common/mocks/mock_cpu_page_fault_manager.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
#include "shared/test/common/mocks/mock_memory_manager.h"
@@ -395,6 +396,137 @@ TEST_F(KernelTests, WhenIsSingleSubdevicePreferredIsCalledThenCorrectValuesAreRe
}
}
using BindlessKernelTests = KernelTests;

// Initializing a kernel that uses bindless buffer addressing must create a surface state heap
// sized for one render surface state per stateful argument.
TEST_F(BindlessKernelTests, GivenBindlessAddressingKernelWhenInitializeThenSurfaceStateIsCreatedWithCorrectSize) {
    KernelInfo bindlessKernelInfo = {};
    auto &attrs = bindlessKernelInfo.kernelDescriptor.kernelAttributes;
    attrs.simdSize = 32;
    attrs.bufferAddressingMode = KernelDescriptor::Bindless;
    attrs.numArgsStateful = 3;

    MockKernel kernel(pProgram, bindlessKernelInfo, *pClDevice);
    EXPECT_EQ(CL_SUCCESS, kernel.initialize());

    const auto renderSurfaceStateSize = static_cast<uint32_t>(pClDevice->getGfxCoreHelper().getRenderSurfaceStateSize());
    const auto expectedHeapSize = attrs.numArgsStateful * renderSurfaceStateSize;

    EXPECT_EQ(expectedHeapSize, kernel.getSurfaceStateHeapSize());
    EXPECT_NE(nullptr, kernel.getSurfaceStateHeap());
}
// patchBindlessOffsetsInCrossThreadData must write, for every bindless explicit and implicit
// argument, the extended message descriptor value of that argument's surface state
// (base offset + slot index * surface state size) at the argument's bindless offset.
// A stateless-only argument must be left untouched.
TEST_F(BindlessKernelTests, givenBindlessKernelWhenPatchingCrossThreadDataThenCorrectBindlessOffsetsAreWritten) {
    // Explicit arg 0: bindless pointer patched at cross-thread qword 0.
    auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    argDescriptor.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
    argDescriptor.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptor.as<NEO::ArgDescPointer>().bindless = 0x0;

    // Explicit arg 1: bindless image patched at cross-thread qword 1.
    auto argDescriptorImg = NEO::ArgDescriptor(NEO::ArgDescriptor::argTImage);
    argDescriptorImg.as<NEO::ArgDescImage>() = NEO::ArgDescImage();
    argDescriptorImg.as<NEO::ArgDescImage>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptorImg.as<NEO::ArgDescImage>().bindless = sizeof(uint64_t);

    // Explicit arg 2: stateless pointer at qword 2; it has no bindless offset.
    auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    argDescriptor2.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
    argDescriptor2.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptor2.as<NEO::ArgDescPointer>().stateless = 2 * sizeof(uint64_t);

    pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
    pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;

    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor);
    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptorImg);
    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptor2);

    // Implicit surfaces are patched at qwords 3 and 4.
    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = 3 * sizeof(uint64_t);
    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless = 4 * sizeof(uint64_t);

    MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice);

    pProgram->mockKernelInfo.kernelDescriptor.initBindlessOffsetToSurfaceState();

    // NOTE(review): the raw buffer is handed to MockKernel; presumably the kernel releases it - confirm.
    mockKernel.crossThreadData = new char[5 * sizeof(uint64_t)];
    mockKernel.crossThreadDataSize = 5 * sizeof(uint64_t);
    memset(mockKernel.crossThreadData, 0x00, mockKernel.crossThreadDataSize);

    const uint64_t baseAddress = 0x1000;
    auto &gfxCoreHelper = pClDevice->getGfxCoreHelper();
    auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    // Expected values for surface state slots 0..3, counted from baseAddress.
    auto patchValue1 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress));
    auto patchValue2 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress + 1 * surfaceStateSize));
    auto patchValue3 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress + 2 * surfaceStateSize));
    auto patchValue4 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(baseAddress + 3 * surfaceStateSize));

    mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress);

    // Copy into a uint64_t array so the qwords can be read without aliasing the char buffer.
    auto crossThreadData = std::make_unique<uint64_t[]>(mockKernel.crossThreadDataSize / sizeof(uint64_t));
    memcpy(crossThreadData.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize);

    EXPECT_EQ(patchValue1, crossThreadData[0]);
    EXPECT_EQ(patchValue2, crossThreadData[1]);
    EXPECT_EQ(0u, crossThreadData[2]);
    EXPECT_EQ(patchValue3, crossThreadData[3]);
    EXPECT_EQ(patchValue4, crossThreadData[4]);
}
// initBindlessOffsetToSurfaceState is deliberately not called here, so the bindless offsets
// have no surface state mapping; patching must then leave the first cross-thread qword zeroed.
TEST_F(BindlessKernelTests, givenNoEntryInBindlessOffsetsMapWhenPatchingCrossThreadDataThenMemoryIsNotPatched) {
    auto &descriptor = pProgram->mockKernelInfo.kernelDescriptor;
    descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
    descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;

    // One bindless pointer argument located at cross-thread offset 0.
    auto pointerArg = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    auto &pointerDesc = pointerArg.as<NEO::ArgDescPointer>();
    pointerDesc = NEO::ArgDescPointer();
    pointerDesc.bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    pointerDesc.bindless = 0x0;
    descriptor.payloadMappings.explicitArgs.push_back(pointerArg);

    descriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = sizeof(uint64_t);

    MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice);

    mockKernel.crossThreadDataSize = 4 * sizeof(uint64_t);
    mockKernel.crossThreadData = new char[mockKernel.crossThreadDataSize];
    memset(mockKernel.crossThreadData, 0, mockKernel.crossThreadDataSize);

    constexpr uint64_t surfaceStateBaseOffset = 0x1000;
    mockKernel.patchBindlessOffsetsInCrossThreadData(surfaceStateBaseOffset);

    const auto qwordCount = mockKernel.crossThreadDataSize / sizeof(uint64_t);
    auto patchedQwords = std::make_unique<uint64_t[]>(qwordCount);
    memcpy(patchedQwords.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize);

    EXPECT_EQ(0u, patchedQwords[0]);
}
// A kernel whose only explicit argument is a by-value argument has nothing to patch, so
// patchBindlessOffsetsInCrossThreadData must leave the cross-thread data zeroed.
TEST_F(BindlessKernelTests, givenNoStatefulArgsWhenPatchingBindlessOffsetsInCrossThreadDataThenMemoryIsNotPatched) {
    auto &descriptor = pProgram->mockKernelInfo.kernelDescriptor;
    descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
    descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;

    // Single value argument described by one element.
    auto valueArg = NEO::ArgDescriptor(NEO::ArgDescriptor::argTValue);
    auto &valueDesc = valueArg.as<NEO::ArgDescValue>();
    valueDesc = NEO::ArgDescValue();
    valueDesc.elements.push_back(NEO::ArgDescValue::Element{0, 8, 0, false});
    descriptor.payloadMappings.explicitArgs.push_back(valueArg);

    MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice);

    mockKernel.crossThreadDataSize = sizeof(uint64_t);
    mockKernel.crossThreadData = new char[mockKernel.crossThreadDataSize];
    memset(mockKernel.crossThreadData, 0, mockKernel.crossThreadDataSize);

    constexpr uint64_t surfaceStateBaseOffset = 0x1000;
    mockKernel.patchBindlessOffsetsInCrossThreadData(surfaceStateBaseOffset);

    const auto qwordCount = mockKernel.crossThreadDataSize / sizeof(uint64_t);
    auto patchedQwords = std::make_unique<uint64_t[]>(qwordCount);
    memcpy(patchedQwords.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize);

    EXPECT_EQ(0u, patchedQwords[0]);
}
class KernelFromBinaryTest : public ProgramSimpleFixture {
public:
void setUp() {
@@ -1218,6 +1350,42 @@ HWTEST_F(KernelResidencyTest, givenKernelWhenMakeResidentIsCalledThenGlobalBuffe
memoryManager->freeGraphicsMemory(pKernelInfo->kernelAllocation);
}
// With a bindless heaps helper installed and bindless slots allocated for the program's
// global and constant surfaces, Kernel::makeResident must make resident both surfaces and
// the heap allocations recorded in their bindless info.
HWTEST_F(KernelResidencyTest, givenBindlessHeapsHelperAndGlobalAndConstantBuffersWhenMakeResidentIsCalledThenGlobalAndConstantBufferHeapAllocationsAreMadeResident) {
    const auto deviceRootIndex = pDevice->getRootDeviceIndex();

    // The root device environment takes ownership of the helper through reset().
    auto bindlessHeapHelper = new MockBindlesHeapsHelper(pDevice, false);
    pDevice->getExecutionEnvironment()->rootDeviceEnvironments[deviceRootIndex]->bindlessHeapsHelper.reset(bindlessHeapHelper);

    auto residencyKernelInfo = std::make_unique<KernelInfo>();
    residencyKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;

    auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
    ultCsr.storeMakeResidentAllocations = true;

    auto memoryManager = ultCsr.getMemoryManager();
    residencyKernelInfo->kernelAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{deviceRootIndex, MemoryConstants::pageSize});

    MockProgram program(toClDeviceVector(*pClDevice));
    MockContext ctx;
    program.setContext(&ctx);

    auto &programBuildInfo = program.buildInfos[deviceRootIndex];
    programBuildInfo.globalSurface = new MockGraphicsAllocation();
    programBuildInfo.constantSurface = new MockGraphicsAllocation();
    EXPECT_TRUE(memoryManager->allocateBindlessSlot(programBuildInfo.globalSurface));
    EXPECT_TRUE(memoryManager->allocateBindlessSlot(programBuildInfo.constantSurface));

    auto kernel = std::make_unique<MockKernel>(&program, *residencyKernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel->initialize());

    // Nothing may have been recorded as resident before makeResident is called.
    EXPECT_EQ(0u, ultCsr.makeResidentAllocations.size());

    kernel->makeResident(pDevice->getGpgpuCommandStreamReceiver());

    EXPECT_TRUE(ultCsr.isMadeResident(programBuildInfo.globalSurface));
    EXPECT_TRUE(ultCsr.isMadeResident(program.getGlobalSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation));
    EXPECT_TRUE(ultCsr.isMadeResident(programBuildInfo.constantSurface));
    EXPECT_TRUE(ultCsr.isMadeResident(program.getConstantSurface(rootDeviceIndex)->getBindlessInfo().heapAllocation));

    memoryManager->freeGraphicsMemory(residencyKernelInfo->kernelAllocation);
}
HWTEST_F(KernelResidencyTest, givenKernelWhenItUsesIndirectUnifiedMemoryDeviceAllocationThenTheyAreMadeResident) {
MockKernelWithInternals mockKernel(*this->pClDevice);
auto &commandStreamReceiver = this->pDevice->getUltCommandStreamReceiver<FamilyType>();
@@ -2962,6 +3130,108 @@ TEST(KernelTest, givenKernelWithPatchInfoCollectionDisabledWhenPatchWithImplicit
EXPECT_EQ(0u, kernel.mockKernel->getPatchInfoDataList().size());
}
// patchWithImplicitSurface on a bindless pointer argument must encode a surface state for
// the allocation in the kernel's surface state heap, at the slot mapped to the bindless offset.
HWTEST_F(KernelTest, givenBindlessArgBufferWhenPatchWithImplicitSurfaceThenSurfaceStateIsEncodedAtProperOffset) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
    MockKernelWithInternals kernel(*device);

    // The mock allocation uses the same numeric value for its CPU pointer and GPU address.
    uint64_t gpuAddress = 0x1200;
    const void *cpuPtr = reinterpret_cast<const void *>(gpuAddress);
    size_t allocSize = 0x1000;
    MockGraphicsAllocation mockAllocation(const_cast<void *>(cpuPtr), gpuAddress, allocSize);

    kernel.kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;

    const CrossThreadDataOffset bindlessOffset = 0x10;
    kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined<CrossThreadDataOffset>, bindlessOffset);
    // The lookup below relies on this call having registered the offset in bindlessArgsMap.
    kernel.kernelInfo.kernelDescriptor.initBindlessOffsetToSurfaceState();

    uint64_t crossThreadData = 0;
    kernel.mockKernel->patchWithImplicitSurface(castToUint64(&crossThreadData), mockAllocation, kernel.kernelInfo.argAsPtr(0));

    const auto &gfxCoreHelper = device->getGfxCoreHelper();
    const auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();

    // Abort the test instead of dereferencing end() if the offset was never registered.
    const auto &bindlessArgsMap = kernel.kernelInfo.kernelDescriptor.bindlessArgsMap;
    const auto mapEntry = bindlessArgsMap.find(bindlessOffset);
    ASSERT_NE(bindlessArgsMap.end(), mapEntry);

    const auto ssIndex = mapEntry->second;
    const auto ssOffset = ssIndex * surfaceStateSize;

    const auto surfaceState = reinterpret_cast<const RENDER_SURFACE_STATE *>(ptrOffset(kernel.mockKernel->getSurfaceStateHeap(), ssOffset));
    const auto surfaceAddress = surfaceState->getSurfaceBaseAddress();
    const auto bufferAddress = mockAllocation.getGpuAddressToPatch();
    EXPECT_EQ(bufferAddress, surfaceAddress);
}
// With bindlessArgsMap emptied, patchWithImplicitSurface must not write anything into the
// kernel's surface state heap.
HWTEST_F(KernelTest, givenBindlessArgBufferAndNotInitializedBindlessOffsetToSurfaceStateWhenPatchWithImplicitSurfaceThenSurfaceStateIsNotEncoded) {
    auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
    MockKernelWithInternals kernel(*device);

    // The mock allocation uses the same numeric value for its CPU pointer and GPU address.
    uint64_t allocationGpuVa = 0x1200;
    size_t allocationSize = 0x1000;
    auto allocationCpuPtr = reinterpret_cast<void *>(allocationGpuVa);
    MockGraphicsAllocation implicitSurface(allocationCpuPtr, allocationGpuVa, allocationSize);

    auto &descriptor = kernel.kernelInfo.kernelDescriptor;
    descriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;

    const CrossThreadDataOffset bindlessOffset = 0x10;
    kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined<CrossThreadDataOffset>, bindlessOffset);

    // Keep a copy of the heap as it looked before patching.
    const auto heapPtr = kernel.mockKernel->getSurfaceStateHeap();
    const auto heapSize = kernel.mockKernel->getSurfaceStateHeapSize();
    auto heapSnapshot = std::make_unique<char[]>(heapSize);
    std::memcpy(heapSnapshot.get(), heapPtr, heapSize);

    // No offset-to-surface-state mapping exists from here on.
    descriptor.bindlessArgsMap.clear();

    uint64_t statelessPatchTarget = 0;
    kernel.mockKernel->patchWithImplicitSurface(castToUint64(&statelessPatchTarget), implicitSurface, kernel.kernelInfo.argAsPtr(0));

    EXPECT_EQ(0, std::memcmp(heapSnapshot.get(), heapPtr, heapSize));
}
// With a bindless heaps helper installed and a bindless slot allocated for the allocation,
// patchWithImplicitSurface must (a) write the extended message descriptor value of the slot's
// surface state offset into the kernel's cross-thread data at the bindless offset and
// (b) encode the surface state at the pointer recorded in the allocation's bindless info.
HWTEST_F(KernelTest, givenBindlessHeapsHelperAndBindlessArgBufferWhenPatchWithImplicitSurfaceThenCrossThreadDataIsPatchedAndSurfaceStateIsEncoded) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
    auto &neoDevice = device->getDevice();

    // The root device environment takes ownership of the helper through reset().
    auto bindlessHeapHelper = new MockBindlesHeapsHelper(&neoDevice, false);
    neoDevice.getExecutionEnvironment()->rootDeviceEnvironments[neoDevice.getRootDeviceIndex()]->bindlessHeapsHelper.reset(bindlessHeapHelper);

    MockKernelWithInternals kernel(*device);

    // The mock allocation uses the same numeric value for its CPU pointer and GPU address.
    uint64_t allocationGpuVa = 0x1200;
    size_t allocationSize = 0x1000;
    auto allocationCpuPtr = reinterpret_cast<void *>(allocationGpuVa);
    MockGraphicsAllocation implicitSurface(allocationCpuPtr, allocationGpuVa, allocationSize);

    auto &descriptor = kernel.kernelInfo.kernelDescriptor;
    descriptor.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;

    EXPECT_TRUE(device->getMemoryManager()->allocateBindlessSlot(&implicitSurface));

    const CrossThreadDataOffset bindlessOffset = 0x10;
    kernel.kernelInfo.addArgBuffer(0, 0, sizeof(void *), undefined<CrossThreadDataOffset>, bindlessOffset);
    descriptor.initBindlessOffsetToSurfaceState();

    uint64_t statelessPatchTarget = 0;
    kernel.mockKernel->patchWithImplicitSurface(castToUint64(&statelessPatchTarget), implicitSurface, kernel.kernelInfo.argAsPtr(0));

    auto slotInfo = implicitSurface.getBindlessInfo();

    // Cross-thread data check: the value at the bindless offset matches the slot's descriptor value.
    auto patchedDword = reinterpret_cast<uint32_t *>(ptrOffset(kernel.mockKernel->crossThreadData, bindlessOffset));
    auto expectedDword = device->getGfxCoreHelper().getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(slotInfo.surfaceStateOffset));
    EXPECT_EQ(expectedDword, *patchedDword);

    // Surface state check: the state stored in the slot points at the allocation.
    const auto encodedState = reinterpret_cast<const RENDER_SURFACE_STATE *>(slotInfo.ssPtr);
    const auto encodedBaseAddress = encodedState->getSurfaceBaseAddress();
    const auto expectedBaseAddress = implicitSurface.getGpuAddressToPatch();
    EXPECT_EQ(expectedBaseAddress, encodedBaseAddress);
}
TEST(KernelTest, givenDefaultKernelWhenItIsCreatedThenItReportsStatelessWrites) {
auto device = clUniquePtr(new MockClDevice(MockDevice::createWithNewExecutionEnvironment<MockDevice>(defaultHwInfo.get())));
MockKernelWithInternals kernel(*device);