feature: use bindless samplers in heapless ocl

Related-To: NEO-12741
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2025-01-22 10:25:34 +00:00
committed by Compute-Runtime-Automation
parent 47fc1ce34b
commit 6926582915
4 changed files with 121 additions and 2 deletions

View File

@ -264,6 +264,11 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
samplerCount, samplerTable.borderColor,
kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
device.getRootDeviceEnvironment());
if constexpr (heaplessModeEnabled) {
uint64_t bindlessSamplerStateAddress = samplerStateOffset;
bindlessSamplerStateAddress += dsh.getGraphicsAllocation()->getGpuAddress();
kernel.patchBindlessSamplerStatesInCrossThreadData(bindlessSamplerStateAddress);
}
}
if constexpr (bindfulAllowed) {

View File

@ -68,6 +68,7 @@
#include <algorithm>
#include <cstdint>
#include <ranges>
#include <vector>
using namespace iOpenCL;
@ -1335,7 +1336,18 @@ void Kernel::setInlineSamplers() {
errCode));
UNRECOVERABLE_IF(errCode != CL_SUCCESS);
auto samplerState = ptrOffset(getDynamicStateHeap(), static_cast<size_t>(inlineSampler.getSamplerBindfulOffset()));
void *samplerState = nullptr;
auto dsh = const_cast<void *>(getDynamicStateHeap());
if (isValidOffset(inlineSampler.bindless)) {
auto samplerStateIndex = inlineSampler.samplerIndex;
auto &gfxCoreHelper = this->getGfxCoreHelper();
auto samplerStateSize = gfxCoreHelper.getSamplerStateSize();
auto offset = inlineSampler.borderColorStateSize;
samplerState = ptrOffset(dsh, (samplerStateIndex * samplerStateSize) + offset);
} else {
samplerState = ptrOffset(dsh, static_cast<size_t>(inlineSampler.getSamplerBindfulOffset()));
}
sampler->setArg(const_cast<void *>(samplerState), clDevice.getRootDeviceEnvironment());
}
}
@ -1792,8 +1804,19 @@ cl_int Kernel::setArgSampler(uint32_t argIndex,
storeKernelArg(argIndex, SAMPLER_OBJ, clSamplerObj, argVal, argSize);
void *samplerState = nullptr;
auto dsh = getDynamicStateHeap();
auto samplerState = ptrOffset(dsh, argAsSmp.bindful);
if (isValidOffset(argAsSmp.bindless)) {
auto samplerStateIndex = argAsSmp.index;
auto &gfxCoreHelper = this->getGfxCoreHelper();
auto samplerStateSize = gfxCoreHelper.getSamplerStateSize();
const auto offset = kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset;
samplerState = ptrOffset(const_cast<void *>(dsh), (samplerStateIndex * samplerStateSize) + offset);
} else {
DEBUG_BREAK_IF(isUndefinedOffset(argAsSmp.bindful));
samplerState = ptrOffset(const_cast<void *>(dsh), argAsSmp.bindful);
}
pSampler->setArg(const_cast<void *>(samplerState), clDevice.getRootDeviceEnvironment());
@ -2187,6 +2210,35 @@ void Kernel::patchBindlessSurfaceStatesInCrossThreadData(uint64_t bindlessSurfac
}
}
// Patches the address of every bindless sampler state into the kernel's cross-thread data.
//
// For each explicit sampler argument and each inline sampler that has a valid bindless offset,
// the value (bindlessSamplerStatesBaseAddress + samplerIndex * samplerStateSize) is written at
// that bindless offset inside the cross-thread data, using the size recorded for the sampler.
// Entries without a valid bindless offset are left untouched.
void Kernel::patchBindlessSamplerStatesInCrossThreadData(uint64_t bindlessSamplerStatesBaseAddress) const {
    const auto samplerStateSize = this->getGfxCoreHelper().getSamplerStateSize();
    auto *crossThreadDataPtr = reinterpret_cast<uint8_t *>(getCrossThreadData());

    // Shared by both loops below: compute the sampler state address and patch it in place.
    const auto patchSamplerStateAddress = [&](auto bindlessOffset, auto samplerIndex, auto patchSize) {
        auto destination = ptrOffset(crossThreadDataPtr, bindlessOffset);
        const auto stateAddress = static_cast<uint64_t>(bindlessSamplerStatesBaseAddress + samplerIndex * samplerStateSize);
        patchWithRequiredSize(destination, patchSize, stateAddress);
    };

    // Explicit kernel arguments: only sampler arguments are inspected.
    for (const auto &explicitArg : kernelInfo.kernelDescriptor.payloadMappings.explicitArgs) {
        if (explicitArg.type != NEO::ArgDescriptor::argTSampler) {
            continue;
        }
        const auto &samplerArg = explicitArg.as<NEO::ArgDescSampler>();
        if (!NEO::isValidOffset(samplerArg.bindless)) {
            continue;
        }
        patchSamplerStateAddress(samplerArg.bindless, samplerArg.index, samplerArg.size);
    }

    // Inline samplers declared by the kernel.
    for (const auto &inlineSampler : kernelInfo.kernelDescriptor.inlineSamplers) {
        if (!NEO::isValidOffset(inlineSampler.bindless)) {
            continue;
        }
        patchSamplerStateAddress(inlineSampler.bindless, inlineSampler.samplerIndex, inlineSampler.size);
    }
}
// Stores the given value in this kernel's additionalKernelExecInfo member.
void Kernel::setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo) {
this->additionalKernelExecInfo = additionalKernelExecInfo;
}

View File

@ -220,6 +220,8 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
template <bool heaplessEnabled>
void patchBindlessSurfaceStatesInCrossThreadData(uint64_t bindlessSurfaceStatesBaseAddress) const;
// Writes, for every explicit sampler argument and inline sampler with a valid bindless offset,
// the value (bindlessSamplerStatesBaseAddress + sampler index * sampler state size) into the
// cross-thread data at that bindless offset.
void patchBindlessSamplerStatesInCrossThreadData(uint64_t bindlessSamplerStatesBaseAddress) const;
// Helpers
cl_int setArg(uint32_t argIndex, uint32_t argValue);
cl_int setArg(uint32_t argIndex, uint64_t argValue);

View File

@ -571,6 +571,66 @@ TEST_F(BindlessKernelTests, givenBindlessKernelWhenPatchBindlessSurfaceStatesInC
EXPECT_EQ(globalConstantsSurfaceAddress, crossThreadData[4]);
}
// Checks that patchBindlessSamplerStatesInCrossThreadData writes the expected sampler state
// address (baseAddress + samplerIndex * samplerStateSize) into the cross-thread data slot of
// two bindless sampler arguments and one bindless inline sampler.
HWTEST_F(BindlessKernelTests, givenBindlessKernelAndSamplersWhenPatchBindlessSamplerStatesInCrossThreadDataThenCorrectAddressesAreWritten) {
    // First sampler argument: bindless slot at cross-thread offset 0, sampler index 0.
    auto argDescriptorSampler1 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTSampler);
    argDescriptorSampler1.as<NEO::ArgDescSampler>() = NEO::ArgDescSampler();
    argDescriptorSampler1.as<NEO::ArgDescSampler>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptorSampler1.as<NEO::ArgDescSampler>().bindless = 0x0;
    argDescriptorSampler1.as<NEO::ArgDescSampler>().size = 8;
    argDescriptorSampler1.as<NEO::ArgDescSampler>().index = 0;

    // Second sampler argument: bindless slot right after the first one, sampler index 1.
    auto argDescriptorSampler2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTSampler);
    argDescriptorSampler2.as<NEO::ArgDescSampler>() = NEO::ArgDescSampler();
    argDescriptorSampler2.as<NEO::ArgDescSampler>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptorSampler2.as<NEO::ArgDescSampler>().bindless = sizeof(uint64_t);
    argDescriptorSampler2.as<NEO::ArgDescSampler>().size = 8;
    argDescriptorSampler2.as<NEO::ArgDescSampler>().index = 1;

    pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.samplerAddressingMode = NEO::KernelDescriptor::Bindless;
    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptorSampler1);
    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.explicitArgs.push_back(argDescriptorSampler2);

    // Inline sampler: third cross-thread slot, sampler index 2.
    auto &inlineSampler = pProgram->mockKernelInfo.kernelDescriptor.inlineSamplers.emplace_back();
    inlineSampler.addrMode = NEO::KernelDescriptor::InlineSampler::AddrMode::repeat;
    inlineSampler.filterMode = NEO::KernelDescriptor::InlineSampler::FilterMode::nearest;
    inlineSampler.isNormalized = false;
    inlineSampler.bindless = 2 * sizeof(uint64_t);
    inlineSampler.samplerIndex = 2;
    inlineSampler.size = 8;

    // The sampler table is placed directly after the border color state in the DSH.
    const uint32_t borderColorSize = inlineSampler.borderColorStateSize;
    pProgram->mockKernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset = borderColorSize;

    MockKernel mockKernel(pProgram, pProgram->mockKernelInfo, *pClDevice);

    // Backing storage for the dynamic state heap: 64 bytes followed by three sampler states.
    using SamplerState = typename FamilyType::SAMPLER_STATE;
    std::array<uint8_t, 64 + 3 * sizeof(SamplerState)> dsh = {0};
    pProgram->mockKernelInfo.heapInfo.pDsh = dsh.data();
    pProgram->mockKernelInfo.heapInfo.dynamicStateHeapSize = static_cast<uint32_t>(dsh.size());

    // One 8-byte cross-thread slot per sampler, zero-initialized.
    // NOTE(review): the buffer is handed to mockKernel as a raw pointer; presumably the kernel
    // releases it on destruction - confirm.
    mockKernel.crossThreadData = new char[3 * sizeof(uint64_t)];
    mockKernel.crossThreadDataSize = 3 * sizeof(uint64_t);
    memset(mockKernel.crossThreadData, 0x00, mockKernel.crossThreadDataSize);

    const uint64_t baseAddress = reinterpret_cast<uint64_t>(dsh.data()) + borderColorSize;

    auto &gfxCoreHelper = pClDevice->getGfxCoreHelper();
    auto samplerStateSize = gfxCoreHelper.getSamplerStateSize();

    auto bindlessSamplerState1Address = baseAddress;
    auto bindlessSamplerState2Address = baseAddress + 1 * samplerStateSize;
    auto bindlessInlineSamplerStateAddress = baseAddress + inlineSampler.samplerIndex * samplerStateSize;

    mockKernel.setInlineSamplers();
    mockKernel.patchBindlessSamplerStatesInCrossThreadData(baseAddress);

    // Copy the patched bytes into uint64_t storage so each slot can be compared as a whole value.
    auto crossThreadData = std::make_unique<uint64_t[]>(mockKernel.crossThreadDataSize / sizeof(uint64_t));
    memcpy(crossThreadData.get(), mockKernel.crossThreadData, mockKernel.crossThreadDataSize);

    EXPECT_EQ(bindlessSamplerState1Address, crossThreadData[0]);
    EXPECT_EQ(bindlessSamplerState2Address, crossThreadData[1]);
    EXPECT_EQ(bindlessInlineSamplerStateAddress, crossThreadData[2]);
}
TEST_F(BindlessKernelTests, givenNoEntryInBindlessOffsetsMapWhenPatchingCrossThreadDataThenMemoryIsNotPatched) {
pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
pProgram->mockKernelInfo.kernelDescriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;