feature: add bindless samplers support to level zero

- samplers using bindless addressing require patching bindless offsets to
sampler states in the kernel's cross-thread data

Related-To: NEO-10505

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe 2024-03-29 06:44:27 +00:00 committed by Compute-Runtime-Automation
parent 420e1391b2
commit f86d4220a5
13 changed files with 210 additions and 14 deletions

View File

@ -859,8 +859,16 @@ ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void
ze_result_t KernelImp::setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal) {
const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescSampler>();
const auto sampler = Sampler::fromHandle(*static_cast<const ze_sampler_handle_t *>(argVal));
if (NEO::isValidOffset(arg.bindful)) {
sampler->copySamplerStateToDSH(dynamicStateHeapData.get(), dynamicStateHeapDataSize, arg.bindful);
} else if (NEO::isValidOffset(arg.bindless)) {
const auto offset = kernelImmData->getDescriptor().payloadMappings.samplerTable.tableOffset;
auto &gfxCoreHelper = this->module->getDevice()->getNEODevice()->getRootDeviceEnvironmentRef().getHelper<NEO::GfxCoreHelper>();
const auto stateSize = gfxCoreHelper.getSamplerStateSize();
auto heapOffset = offset + static_cast<uint32_t>(stateSize) * arg.index;
sampler->copySamplerStateToDSH(dynamicStateHeapData.get(), dynamicStateHeapDataSize, heapOffset);
}
auto samplerDesc = sampler->getSamplerDesc();
NEO::patchNonPointer<uint32_t, uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg.metadataPayload.samplerSnapWa, (samplerDesc.addressMode == ZE_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER && samplerDesc.filterMode == ZE_SAMPLER_FILTER_MODE_NEAREST) ? std::numeric_limits<uint32_t>::max() : 0u);
@ -1352,6 +1360,36 @@ void KernelImp::patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceSt
patchBindlessOffsetsForImplicitArgs(bindlessSurfaceStateBaseOffset);
}
void KernelImp::patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerStateOffset) const {
if (this->module == nullptr) {
return;
}
const auto &gfxCoreHelper = this->module->getDevice()->getGfxCoreHelper();
const auto samplerStateSize = gfxCoreHelper.getSamplerStateSize();
auto crossThreadData = getCrossThreadData();
for (size_t index = 0; index < kernelImmData->getDescriptor().payloadMappings.explicitArgs.size(); index++) {
const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[index];
auto crossThreadOffset = NEO::undefined<NEO::CrossThreadDataOffset>;
if (arg.type == NEO::ArgDescriptor::argTSampler) {
crossThreadOffset = arg.as<NEO::ArgDescSampler>().bindless;
} else {
continue;
}
auto samplerIndex = arg.as<NEO::ArgDescSampler>().index;
if (NEO::isValidOffset(crossThreadOffset)) {
auto patchLocation = ptrOffset(crossThreadData, crossThreadOffset);
if (samplerIndex < std::numeric_limits<uint8_t>::max()) {
auto surfaceStateOffset = static_cast<uint64_t>(samplerStateOffset + samplerIndex * samplerStateSize);
auto patchValue = surfaceStateOffset;
patchWithRequiredSize(const_cast<uint8_t *>(patchLocation), arg.as<NEO::ArgDescSampler>().size, patchValue);
}
}
}
}
uint32_t KernelImp::getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const {
const auto &iter = getKernelDescriptor().getBindlessOffsetToSurfaceState().find(bindlessOffset);
if (iter != getKernelDescriptor().getBindlessOffsetToSurfaceState().end()) {

View File

@ -175,6 +175,7 @@ struct KernelImp : Kernel {
void patchCrossthreadDataWithPrivateAllocation(NEO::GraphicsAllocation *privateAllocation) override;
void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const override;
void patchBindlessOffsetsForImplicitArgs(uint64_t bindlessSurfaceStateBaseOffset) const;
void patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerStateOffset) const override;
NEO::GraphicsAllocation *getPrivateMemoryGraphicsAllocation() override {
return privateMemoryGraphicsAllocation;

View File

@ -1023,10 +1023,6 @@ int main(int argc, char *argv[]) {
i = testCase;
}
if (testCase == -1 && i == 2) {
continue;
}
switch (i) {
default:
case 0:

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -53,6 +53,7 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp {
using ::L0::KernelImp::numThreadsPerThreadGroup;
using ::L0::KernelImp::patchBindlessOffsetsInCrossThreadData;
using ::L0::KernelImp::patchBindlessSurfaceState;
using ::L0::KernelImp::patchSamplerBindlessOffsetsInCrossThreadData;
using ::L0::KernelImp::perThreadDataForWholeThreadGroup;
using ::L0::KernelImp::perThreadDataSize;
using ::L0::KernelImp::perThreadDataSizeForWholeThreadGroup;

View File

@ -3894,5 +3894,70 @@ TEST(KernelImmutableDataTest, givenBindlessKernelWhenInitializingImmDataThenSshT
}
}
// Verifies patchSamplerBindlessOffsetsInCrossThreadData on a kernel whose explicit
// arguments are: a bindless pointer (slot 0), a bindless sampler with index 1 (slot 1),
// a bindless sampler with an undefined index (slot 2) and a stateless pointer.
// Expected outcome:
//  - slot 1 receives baseAddress + 1 * samplerStateSize,
//  - slot 2 stays zero because its sampler index is undefined,
//  - slot 0 stays zero because it does not belong to a sampler argument.
TEST_F(BindlessKernelTest, givenBindlessKernelWhenPatchingSamplerOffsetsInCrossThreadDataThenCorrectBindlessOffsetsAreWritten) {
    Mock<Module> mockModule(this->device, nullptr);
    Mock<KernelImp> mockKernel;
    mockKernel.module = &mockModule;

    mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
    mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;

    // Explicit arg 0: bindless pointer occupying cross-thread slot 0.
    auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    argDescriptor.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
    argDescriptor.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptor.as<NEO::ArgDescPointer>().bindless = 0x0;
    mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor);

    // Explicit arg 1: bindless sampler with a defined index, patched into slot 1.
    auto argDescriptorSampler = NEO::ArgDescriptor(NEO::ArgDescriptor::argTSampler);
    argDescriptorSampler.as<NEO::ArgDescSampler>() = NEO::ArgDescSampler();
    argDescriptorSampler.as<NEO::ArgDescSampler>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptorSampler.as<NEO::ArgDescSampler>().bindless = sizeof(uint64_t);
    argDescriptorSampler.as<NEO::ArgDescSampler>().size = sizeof(uint64_t);
    argDescriptorSampler.as<NEO::ArgDescSampler>().index = 1;
    mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptorSampler);

    // Explicit arg 2: bindless sampler with an undefined index, must not be patched.
    auto argDescriptorSampler2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTSampler);
    argDescriptorSampler2.as<NEO::ArgDescSampler>() = NEO::ArgDescSampler();
    argDescriptorSampler2.as<NEO::ArgDescSampler>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptorSampler2.as<NEO::ArgDescSampler>().bindless = 2 * sizeof(uint64_t);
    argDescriptorSampler2.as<NEO::ArgDescSampler>().size = sizeof(uint64_t);
    argDescriptorSampler2.as<NEO::ArgDescSampler>().index = undefined<uint8_t>;
    mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptorSampler2);

    // Explicit arg 3: stateless pointer.
    auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
    argDescriptor2.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
    argDescriptor2.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
    argDescriptor2.as<NEO::ArgDescPointer>().stateless = 2 * sizeof(uint64_t);
    mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor2);

    mockKernel.descriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = 3 * sizeof(uint64_t);
    mockKernel.descriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless = 4 * sizeof(uint64_t);

    mockKernel.isBindlessOffsetSet.resize(2, 0);
    mockKernel.usingSurfaceStateHeap.resize(2, 0);
    mockKernel.descriptor.initBindlessOffsetToSurfaceState();

    // Five zero-initialised 64-bit slots of cross-thread data.
    mockKernel.crossThreadData = std::make_unique<uint8_t[]>(5 * sizeof(uint64_t));
    mockKernel.crossThreadDataSize = 5 * sizeof(uint64_t);
    memset(mockKernel.crossThreadData.get(), 0, mockKernel.crossThreadDataSize);

    const uint64_t baseAddress = 0x1000;
    auto &gfxCoreHelper = this->device->getGfxCoreHelper();
    auto samplerStateSize = gfxCoreHelper.getSamplerStateSize();

    // Expected values are kept 64-bit wide to match the 8-byte patch size of the sampler args.
    const uint64_t expectedSampler1Offset = baseAddress + 1 * samplerStateSize;
    const uint64_t expectedUntouchedSlot = 0u;

    mockKernel.patchSamplerBindlessOffsetsInCrossThreadData(baseAddress);

    // Copy into a uint64_t array so each slot can be compared as a whole value.
    auto crossThreadData = std::make_unique<uint64_t[]>(mockKernel.crossThreadDataSize / sizeof(uint64_t));
    memcpy(crossThreadData.get(), mockKernel.crossThreadData.get(), mockKernel.crossThreadDataSize);

    EXPECT_EQ(expectedUntouchedSlot, crossThreadData[0]);
    EXPECT_EQ(expectedSampler1Offset, crossThreadData[1]);
    EXPECT_EQ(expectedUntouchedSlot, crossThreadData[2]);
}
} // namespace ult
} // namespace L0

View File

@ -339,8 +339,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHpCommandStreamReceiverFlushTaskTests, whenFlushi
ASSERT_NE(nullptr, hwParserCsr.cmdStateBaseAddress);
auto stateBaseAddress = static_cast<STATE_BASE_ADDRESS *>(hwParserCsr.cmdStateBaseAddress);
EXPECT_TRUE(stateBaseAddress->getBindlessSamplerStateBaseAddressModifyEnable());
EXPECT_EQ(0u, stateBaseAddress->getBindlessSamplerStateBaseAddress());
EXPECT_EQ(0u, stateBaseAddress->getBindlessSamplerStateBufferSize());
EXPECT_EQ(dsh.getHeapGpuBase(), stateBaseAddress->getBindlessSamplerStateBaseAddress());
EXPECT_EQ(dsh.getHeapSizeInPages(), stateBaseAddress->getBindlessSamplerStateBufferSize());
}
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHpCommandStreamReceiverFlushTaskTests, givenDebugKeysThatOverrideMultiGpuSettingWhenStateBaseAddressIsProgrammedThenValuesMatch) {

View File

@ -217,6 +217,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
}
idd.setSamplerStatePointer(samplerStateOffset);
args.dispatchInterface->patchSamplerBindlessOffsetsInCrossThreadData(samplerStateOffset);
}
}

View File

@ -1195,6 +1195,7 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern
static constexpr auto maxIndirectSamplerStateSize = 64U;
auto &sampler = dst.payloadMappings.explicitArgs[src.argIndex].as<ArgDescSampler>();
sampler.bindful = maxIndirectSamplerStateSize + maxSamplerStateSize * src.samplerIndex;
sampler.index = src.samplerIndex;
dst.payloadMappings.samplerTable.numSamplers = std::max<uint8_t>(dst.payloadMappings.samplerTable.numSamplers, static_cast<uint8_t>(src.samplerIndex + 1));
} else {
dst.kernelAttributes.numArgsStateful++;
@ -1212,12 +1213,16 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern
case Types::Kernel::PayloadArgument::memoryAddressingModeBindless:
if (dst.payloadMappings.explicitArgs[src.argIndex].is<NEO::ArgDescriptor::argTPointer>()) {
dst.payloadMappings.explicitArgs[src.argIndex].as<ArgDescPointer>(false).bindless = src.offset;
dst.kernelAttributes.numArgsStateful++;
} else if (dst.payloadMappings.explicitArgs[src.argIndex].is<NEO::ArgDescriptor::argTImage>()) {
dst.payloadMappings.explicitArgs[src.argIndex].as<ArgDescImage>(false).bindless = src.offset;
dst.kernelAttributes.numArgsStateful++;
} else {
dst.payloadMappings.explicitArgs[src.argIndex].as<ArgDescSampler>(false).bindless = src.offset;
dst.payloadMappings.explicitArgs[src.argIndex].as<ArgDescSampler>(false).index = src.samplerIndex;
dst.payloadMappings.explicitArgs[src.argIndex].as<ArgDescSampler>(false).size = src.size;
dst.payloadMappings.samplerTable.numSamplers = std::max<uint8_t>(dst.payloadMappings.samplerTable.numSamplers, static_cast<uint8_t>(src.samplerIndex + 1));
}
dst.kernelAttributes.numArgsStateful++;
break;
case Types::Kernel::PayloadArgument::memoryAddressingModeSharedLocalMemory:
dst.payloadMappings.explicitArgs[src.argIndex].as<ArgDescPointer>(false).slmOffset = src.offset;

View File

@ -55,9 +55,17 @@ void StateBaseAddressHelper<GfxFamily>::appendStateBaseAddressParameters(
const auto surfaceStateCount = args.ssh->getMaxAvailableSpace() / sizeof(RENDER_SURFACE_STATE);
args.stateBaseAddressCmd->setBindlessSurfaceStateSize(static_cast<uint32_t>(surfaceStateCount - 1));
}
}
if (args.dsh) {
args.stateBaseAddressCmd->setBindlessSamplerStateBaseAddress(args.dsh->getHeapGpuBase());
args.stateBaseAddressCmd->setBindlessSamplerStateBufferSize(args.dsh->getHeapSizeInPages());
args.stateBaseAddressCmd->setBindlessSamplerStateBaseAddressModifyEnable(true);
}
} else {
args.stateBaseAddressCmd->setBindlessSamplerStateBaseAddressModifyEnable(true);
args.stateBaseAddressCmd->setBindlessSamplerStateBaseAddress(args.globalHeapsBaseAddress);
args.stateBaseAddressCmd->setBindlessSamplerStateBufferSize(MemoryConstants::sizeOf4GBinPageEntities);
}
auto &productHelper = args.gmmHelper->getRootDeviceEnvironment().template getHelper<ProductHelper>();

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -48,5 +48,6 @@ struct DispatchKernelEncoderI {
virtual ImplicitArgs *getImplicitArgs() const = 0;
virtual void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const = 0;
virtual void patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerStateOffset) const = 0;
};
} // namespace NEO

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -106,6 +106,8 @@ struct ArgDescSampler final {
CrossThreadDataOffset samplerAddressingMode = undefined<CrossThreadDataOffset>;
CrossThreadDataOffset samplerNormalizedCoords = undefined<CrossThreadDataOffset>;
} metadataPayload;
uint8_t index = undefined<uint8_t>;
uint8_t size = undefined<uint8_t>;
};
struct ArgDescValue final {

View File

@ -1757,6 +1757,37 @@ kernels:
EXPECT_EQ(1u, kernelDescriptor->kernelAttributes.numArgsStateful);
}
// Decodes a zeinfo kernel entry that declares two bindless arguments - a sampler
// (arg_index 0) and a global pointer (arg_index 1) - and expects numArgsStateful
// to be 1. Per the test name, the bindless sampler is the argument left uncounted.
TEST_F(decodeZeInfoKernelEntryTest, GivenBindlessSamplerWhenDecodingZeInfoThenNumberOfStatefulArgsDoesNotCountSampler) {
ConstStringRef zeinfo = R"===(
kernels:
- name : some_kernel
execution_env:
simd_size: 8
payload_arguments:
- arg_type: arg_bypointer
offset: 0
size: 8
arg_index: 0
addrmode: bindless
addrspace: sampler
access_type: readwrite
sampler_index: 0
sampler_type: texture
- arg_type: arg_bypointer
offset: 8
size: 4
arg_index: 1
addrmode: bindless
addrspace: global
access_type: readwrite
...
)===";
auto err = decodeZeInfoKernelEntry(zeinfo);
EXPECT_EQ(NEO::DecodeError::success, err);
// Two bindless arguments were declared, yet only one stateful argument is expected.
EXPECT_EQ(1u, kernelDescriptor->kernelAttributes.numArgsStateful);
}
TEST_F(decodeZeInfoKernelEntryTest, GivenBindlessImageAddressingWhenDecodingZeInfoThenImageAddressingModeIsBindless) {
ConstStringRef zeinfo = R"===(
kernels:
@ -5643,10 +5674,14 @@ TEST_F(decodeZeInfoKernelEntryTest, GivenValidSamplerArgumentWithMetadataThenPop
auto &sampler0 = args[0].as<ArgDescSampler>();
EXPECT_EQ(64U, sampler0.bindful);
EXPECT_EQ(0U, sampler0.index);
EXPECT_EQ(undefined<uint8_t>, sampler0.size);
auto &sampler1 = args[1].as<ArgDescSampler>();
EXPECT_TRUE(args[1].getExtendedTypeInfo().isAccelerator);
EXPECT_EQ(80U, sampler1.bindful);
EXPECT_EQ(1U, sampler1.index);
EXPECT_EQ(undefined<uint8_t>, sampler1.size);
auto &sampler2 = args[2].as<ArgDescSampler>();
EXPECT_TRUE(args[2].getExtendedTypeInfo().isAccelerator);
@ -5654,6 +5689,8 @@ TEST_F(decodeZeInfoKernelEntryTest, GivenValidSamplerArgumentWithMetadataThenPop
EXPECT_EQ(0U, sampler2.metadataPayload.samplerSnapWa);
EXPECT_EQ(4U, sampler2.metadataPayload.samplerNormalizedCoords);
EXPECT_EQ(8U, sampler2.metadataPayload.samplerAddressingMode);
EXPECT_EQ(2U, sampler2.index);
EXPECT_EQ(undefined<uint8_t>, sampler2.size);
auto &sampler3 = args[3].as<ArgDescSampler>();
EXPECT_TRUE(args[3].getExtendedTypeInfo().isAccelerator);
@ -5669,6 +5706,46 @@ TEST_F(decodeZeInfoKernelEntryTest, GivenValidSamplerArgumentWithMetadataThenPop
EXPECT_TRUE(kd.kernelAttributes.flags.usesVme);
}
// Decodes a zeinfo kernel entry with a bindless sampler argument (offset 88, size 8,
// sampler_index 3) followed by a bindless global pointer, then checks that the decoded
// sampler descriptor carries those values: bindful stays undefined, bindless equals the
// declared offset, and index / size equal the declared sampler_index / size.
TEST_F(decodeZeInfoKernelEntryTest, GivenBindlessSamplerArgumentWithMetadataThenKernelDescriptorIsPopulated) {
ConstStringRef zeinfo = R"===(
kernels:
- name : some_kernel
execution_env:
simd_size: 8
payload_arguments:
- arg_type: arg_bypointer
offset: 88
size: 8
arg_index: 0
addrmode: bindless
addrspace: sampler
access_type: readwrite
sampler_index: 3
sampler_type: texture
- arg_type: arg_bypointer
offset: 8
size: 4
arg_index: 1
addrmode: bindless
addrspace: global
access_type: readwrite
...
)===";
auto err = decodeZeInfoKernelEntry(zeinfo);
EXPECT_EQ(NEO::DecodeError::success, err);
// Two bindless arguments were declared, yet only one stateful argument is expected.
EXPECT_EQ(1u, kernelDescriptor->kernelAttributes.numArgsStateful);
const auto &kd = *this->kernelDescriptor;
auto &args = kd.payloadMappings.explicitArgs;
// Explicit argument 0 is the bindless sampler declared first in the zeinfo above.
auto &sampler0 = args[0].as<ArgDescSampler>();
EXPECT_EQ(undefined<DynamicStateHeapOffset>, sampler0.bindful);
EXPECT_EQ(88u, sampler0.bindless);
EXPECT_EQ(3U, sampler0.index);
EXPECT_EQ(8U, sampler0.size);
}
class IntelGTNotesFixture : public ::testing::Test {
protected:
void SetUp() override {

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2020-2023 Intel Corporation
* Copyright (C) 2020-2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -30,6 +30,7 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI {
NEO::ImplicitArgs *getImplicitArgs() const override { return nullptr; }
void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const override { return; };
void patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerStateOffset) const override { return; };
MockGraphicsAllocation mockAllocation{};
static constexpr uint32_t crossThreadSize = 0x40;