feature: add inline samplers bindless addressing support

- inline samplers in bindless addressing mode require a bindless offset
to be passed in cross-thread data

Related-To: NEO-11748

Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
Mateusz Hoppe 2024-06-23 16:17:27 +00:00 committed by Compute-Runtime-Automation
parent 7136dfbd38
commit 4c49a08017
7 changed files with 169 additions and 3 deletions

View File

@ -1414,6 +1414,26 @@ void KernelImp::patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerSta
}
}
}
for (size_t index = 0; index < kernelImmData->getDescriptor().inlineSamplers.size(); index++) {
const auto &sampler = kernelImmData->getDescriptor().inlineSamplers[index];
auto crossThreadOffset = NEO::undefined<NEO::CrossThreadDataOffset>;
if (sampler.bindless != NEO::undefined<NEO::CrossThreadDataOffset>) {
crossThreadOffset = sampler.bindless;
} else {
continue;
}
auto samplerIndex = sampler.samplerIndex;
if (samplerIndex < std::numeric_limits<uint8_t>::max()) {
auto patchLocation = ptrOffset(crossThreadData, crossThreadOffset);
auto surfaceStateOffset = static_cast<uint64_t>(samplerStateOffset + samplerIndex * samplerStateSize);
auto patchValue = surfaceStateOffset;
patchWithRequiredSize(const_cast<uint8_t *>(patchLocation), sampler.size, patchValue);
}
}
}
uint32_t KernelImp::getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const {

View File

@ -4018,5 +4018,68 @@ TEST_F(BindlessKernelTest, givenBindlessKernelWhenPatchingSamplerOffsetsInCrossT
EXPECT_EQ(0u, crossThreadData[2]);
}
TEST_F(BindlessKernelTest, givenBindlessKernelWithInlineSamplersWhenPatchingSamplerOffsetsInCrossThreadDataThenCorrectBindlessOffsetsAreWritten) {
Mock<Module> mockModule(this->device, nullptr);
Mock<KernelImp> mockKernel;
mockKernel.module = &mockModule;
mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless;
mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless;
auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
argDescriptor.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
argDescriptor.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
argDescriptor.as<NEO::ArgDescPointer>().bindless = 0x0;
mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor);
auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer);
argDescriptor2.as<NEO::ArgDescPointer>() = NEO::ArgDescPointer();
argDescriptor2.as<NEO::ArgDescPointer>().bindful = NEO::undefined<NEO::SurfaceStateHeapOffset>;
argDescriptor2.as<NEO::ArgDescPointer>().stateless = 2 * sizeof(uint64_t);
mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor2);
mockKernel.descriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = 3 * sizeof(uint64_t);
mockKernel.descriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless = 4 * sizeof(uint64_t);
NEO::KernelDescriptor::InlineSampler inlineSampler = {};
inlineSampler.samplerIndex = 0;
inlineSampler.addrMode = NEO::KernelDescriptor::InlineSampler::AddrMode::clampBorder;
inlineSampler.filterMode = NEO::KernelDescriptor::InlineSampler::FilterMode::linear;
inlineSampler.isNormalized = true;
inlineSampler.bindless = 5 * sizeof(uint64_t);
inlineSampler.size = sizeof(uint64_t);
mockKernel.descriptor.inlineSamplers.push_back(inlineSampler);
inlineSampler.samplerIndex = 1;
inlineSampler.bindless = 6 * sizeof(uint64_t);
inlineSampler.size = sizeof(uint64_t);
mockKernel.descriptor.inlineSamplers.push_back(inlineSampler);
mockKernel.descriptor.payloadMappings.samplerTable.numSamplers = 2;
mockKernel.isBindlessOffsetSet.resize(2, 0);
mockKernel.usingSurfaceStateHeap.resize(2, 0);
mockKernel.descriptor.initBindlessOffsetToSurfaceState();
mockKernel.crossThreadData = std::make_unique<uint8_t[]>(7 * sizeof(uint64_t));
mockKernel.crossThreadDataSize = 7 * sizeof(uint64_t);
memset(mockKernel.crossThreadData.get(), 0, mockKernel.crossThreadDataSize);
const uint64_t baseAddress = 0x1000;
auto &gfxCoreHelper = this->device->getGfxCoreHelper();
auto samplerStateSize = gfxCoreHelper.getSamplerStateSize();
auto patchValue1 = (static_cast<uint32_t>(baseAddress + 0 * samplerStateSize));
auto patchValue2 = (static_cast<uint32_t>(baseAddress + 1 * samplerStateSize));
mockKernel.patchSamplerBindlessOffsetsInCrossThreadData(baseAddress);
auto crossThreadData = std::make_unique<uint64_t[]>(mockKernel.crossThreadDataSize / sizeof(uint64_t));
memcpy(crossThreadData.get(), mockKernel.crossThreadData.get(), mockKernel.crossThreadDataSize);
EXPECT_EQ(patchValue1, crossThreadData[5]);
EXPECT_EQ(patchValue2, crossThreadData[6]);
}
} // namespace ult
} // namespace L0

View File

@ -129,6 +129,7 @@ inline constexpr ConstStringRef regionGroupSize("region_group_size");
inline constexpr ConstStringRef regionGroupDimension("region_group_dimension");
inline constexpr ConstStringRef regionGroupWgCount("region_group_wg_count");
inline constexpr ConstStringRef regionGroupBarrierBuffer("region_group_barrier_buffer");
inline constexpr ConstStringRef inlineSampler("inline_sampler");
namespace Image {
inline constexpr ConstStringRef width("image_width");
@ -508,6 +509,7 @@ enum ArgType : uint8_t {
argTypeRegionGroupDimension,
argTypeRegionGroupWgCount,
argTypeRegionGroupBarrierBuffer,
argTypeInlineSampler,
argTypeMax
};

View File

@ -536,12 +536,12 @@ DecodeError decodeZeInfoKernelEntry(NEO::KernelDescriptor &dst, NEO::Yaml::YamlP
return decodeError;
}
decodeError = decodeZeInfoKernelPayloadArguments(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning);
decodeError = decodeZeInfoKernelInlineSamplers(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning);
if (DecodeError::success != decodeError) {
return decodeError;
}
decodeError = decodeZeInfoKernelInlineSamplers(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning);
decodeError = decodeZeInfoKernelPayloadArguments(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning);
if (DecodeError::success != decodeError) {
return decodeError;
}
@ -1111,6 +1111,24 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern
return DecodeError::success;
};
auto populateInlineSampler = [&src, &outErrReason, &kernelName](auto &dst, ConstStringRef typeName) {
if (dst.payloadMappings.samplerTable.numSamplers < src.samplerIndex) {
outErrReason.append("DeviceBinaryFormat::zebin : Invalid sampler index for argument of type " + typeName.str() + " in context of : " + kernelName + ".\n");
return DecodeError::invalidBinary;
}
if (src.addrmode == Types::Kernel::PayloadArgument::memoryAddressingModeBindless) {
for (auto &sampler : dst.inlineSamplers) {
if (static_cast<Types::Kernel::PayloadArgument::SamplerIndexT>(sampler.samplerIndex) == src.samplerIndex) {
sampler.bindless = src.offset;
sampler.size = src.size;
break;
}
}
}
return DecodeError::success;
};
switch (src.argType) {
default:
outErrReason.append("DeviceBinaryFormat::zebin : Invalid arg type in cross thread data section in context of : " + kernelName + ".\n");
@ -1398,6 +1416,9 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern
case Types::Kernel::argTypeRegionGroupBarrierBuffer:
dst.kernelAttributes.flags.usesRegionGroupBarrier = true;
return populateArgPointerStateless(dst.payloadMappings.implicitArgs.regionGroupBarrierBuffer);
case Types::Kernel::argTypeInlineSampler:
return populateInlineSampler(dst, Tags::Kernel::PayloadArgument::ArgType::inlineSampler);
}
UNREACHABLE();

View File

@ -22,7 +22,7 @@ using namespace Tags::Kernel::PayloadArgument::ArgType::Sampler::Vme;
using ArgType = Types::Kernel::ArgType;
inline constexpr ConstStringRef name = "argument type";
inline constexpr LookupArray<ConstStringRef, ArgType, 45> lookup({{
inline constexpr LookupArray<ConstStringRef, ArgType, 46> lookup({{
{packedLocalIds, ArgType::argTypePackedLocalIds},
{localId, ArgType::argTypeLocalId},
{localSize, ArgType::argTypeLocalSize},
@ -68,6 +68,7 @@ inline constexpr LookupArray<ConstStringRef, ArgType, 45> lookup({{
{regionGroupDimension, ArgType::argTypeRegionGroupDimension},
{regionGroupWgCount, ArgType::argTypeRegionGroupWgCount},
{regionGroupBarrierBuffer, ArgType::argTypeRegionGroupBarrierBuffer},
{inlineSampler, ArgType::argTypeInlineSampler},
}});
static_assert(lookup.size() == ArgType::argTypeMax - 1, "Every enum field must be present");
} // namespace ArgType

View File

@ -234,6 +234,9 @@ struct KernelDescriptor {
bool isNormalized;
AddrMode addrMode;
FilterMode filterMode;
CrossThreadDataOffset bindless = undefined<CrossThreadDataOffset>;
uint8_t size = undefined<uint8_t>;
constexpr uint32_t getSamplerBindfulOffset() const {
return borderColorStateSize + samplerStateSize * samplerIndex;
}

View File

@ -6814,6 +6814,9 @@ kernels:
EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::AddrMode::clampEdge, inlineSampler.addrMode);
EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::FilterMode::nearest, inlineSampler.filterMode);
EXPECT_TRUE(inlineSampler.isNormalized);
EXPECT_EQ(undefined<CrossThreadDataOffset>, inlineSampler.bindless);
EXPECT_EQ(undefined<uint8_t>, inlineSampler.size);
}
TEST_F(decodeZeInfoKernelEntryTest, GivenInvalidInlineSamplersEntryThenPopulateKernelDescriptorFails) {
@ -6848,6 +6851,59 @@ kernels:
EXPECT_EQ(NEO::DecodeError::invalidBinary, err);
}
TEST_F(decodeZeInfoKernelEntryTest, GivenBindlessInlineSamplersThenPopulateKernelDescriptorSetsBindlessOffsetAndSize) {
ConstStringRef zeinfo = R"===(
kernels:
- name : some_kernel
execution_env:
simd_size: 8
has_sample: true
payload_arguments:
- arg_type: inline_sampler
offset: 40
size: 4
addrmode: bindless
addrspace: sampler
sampler_index: 0
- arg_type: inline_sampler
offset: 44
size: 4
addrmode: bindless
addrspace: sampler
sampler_index: 1
inline_samplers:
- sampler_index: 1
addrmode: none
filtermode: nearest
- sampler_index: 0
addrmode: repeat
filtermode: linear
normalized: true
...
)===";
auto err = decodeZeInfoKernelEntry(zeinfo);
EXPECT_EQ(NEO::DecodeError::success, err);
EXPECT_TRUE(errors.empty());
ASSERT_EQ(2U, kernelDescriptor->inlineSamplers.size());
const auto &inlineSampler1 = kernelDescriptor->inlineSamplers[0];
EXPECT_EQ(1U, inlineSampler1.samplerIndex);
EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::AddrMode::none, inlineSampler1.addrMode);
EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::FilterMode::nearest, inlineSampler1.filterMode);
EXPECT_FALSE(inlineSampler1.isNormalized);
EXPECT_EQ(44u, inlineSampler1.bindless);
EXPECT_EQ(4u, inlineSampler1.size);
const auto &inlineSampler0 = kernelDescriptor->inlineSamplers[1];
EXPECT_EQ(0U, inlineSampler0.samplerIndex);
EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::AddrMode::repeat, inlineSampler0.addrMode);
EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::FilterMode::linear, inlineSampler0.filterMode);
EXPECT_TRUE(inlineSampler0.isNormalized);
EXPECT_EQ(40u, inlineSampler0.bindless);
EXPECT_EQ(4u, inlineSampler0.size);
}
TEST_F(decodeZeInfoKernelEntryTest, givenGlobalBufferAndConstBufferWhenPopulatingKernelDescriptorThenPopulateThemProperly) {
NEO::ConstStringRef zeinfo = R"===(
kernels: