diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 3fbe0310f2..e0c6b805ad 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -1414,6 +1414,26 @@ void KernelImp::patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerSta } } } + + for (size_t index = 0; index < kernelImmData->getDescriptor().inlineSamplers.size(); index++) { + const auto &sampler = kernelImmData->getDescriptor().inlineSamplers[index]; + + auto crossThreadOffset = NEO::undefined; + if (sampler.bindless != NEO::undefined) { + crossThreadOffset = sampler.bindless; + } else { + continue; + } + + auto samplerIndex = sampler.samplerIndex; + + if (samplerIndex < std::numeric_limits::max()) { + auto patchLocation = ptrOffset(crossThreadData, crossThreadOffset); + auto surfaceStateOffset = static_cast(samplerStateOffset + samplerIndex * samplerStateSize); + auto patchValue = surfaceStateOffset; + patchWithRequiredSize(const_cast(patchLocation), sampler.size, patchValue); + } + } } uint32_t KernelImp::getSurfaceStateIndexForBindlessOffset(NEO::CrossThreadDataOffset bindlessOffset) const { diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index fc1bd20bb9..1ce6df277c 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -4018,5 +4018,68 @@ TEST_F(BindlessKernelTest, givenBindlessKernelWhenPatchingSamplerOffsetsInCrossT EXPECT_EQ(0u, crossThreadData[2]); } +TEST_F(BindlessKernelTest, givenBindlessKernelWithInlineSamplersWhenPatchingSamplerOffsetsInCrossThreadDataThenCorrectBindlessOffsetsAreWritten) { + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + + mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x0; + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + + auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::argTPointer); + argDescriptor2.as() = NEO::ArgDescPointer(); + argDescriptor2.as().bindful = NEO::undefined; + argDescriptor2.as().stateless = 2 * sizeof(uint64_t); + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor2); + + mockKernel.descriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.bindless = 3 * sizeof(uint64_t); + mockKernel.descriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.bindless = 4 * sizeof(uint64_t); + + NEO::KernelDescriptor::InlineSampler inlineSampler = {}; + inlineSampler.samplerIndex = 0; + inlineSampler.addrMode = NEO::KernelDescriptor::InlineSampler::AddrMode::clampBorder; + inlineSampler.filterMode = NEO::KernelDescriptor::InlineSampler::FilterMode::linear; + inlineSampler.isNormalized = true; + inlineSampler.bindless = 5 * sizeof(uint64_t); + inlineSampler.size = sizeof(uint64_t); + mockKernel.descriptor.inlineSamplers.push_back(inlineSampler); + + inlineSampler.samplerIndex = 1; + inlineSampler.bindless = 6 * sizeof(uint64_t); + inlineSampler.size = sizeof(uint64_t); + mockKernel.descriptor.inlineSamplers.push_back(inlineSampler); + + mockKernel.descriptor.payloadMappings.samplerTable.numSamplers = 2; + mockKernel.isBindlessOffsetSet.resize(2, 0); + mockKernel.usingSurfaceStateHeap.resize(2, 0); + + mockKernel.descriptor.initBindlessOffsetToSurfaceState(); + + mockKernel.crossThreadData = std::make_unique(7 * sizeof(uint64_t)); + mockKernel.crossThreadDataSize = 7 * sizeof(uint64_t); + memset(mockKernel.crossThreadData.get(), 0, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + auto &gfxCoreHelper = this->device->getGfxCoreHelper(); + auto samplerStateSize = gfxCoreHelper.getSamplerStateSize(); + + auto patchValue1 = (static_cast(baseAddress + 0 * samplerStateSize)); + auto patchValue2 = (static_cast(baseAddress + 1 * samplerStateSize)); + + mockKernel.patchSamplerBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData.get(), mockKernel.crossThreadDataSize); + + EXPECT_EQ(patchValue1, crossThreadData[5]); + EXPECT_EQ(patchValue2, crossThreadData[6]); +} + } // namespace ult } // namespace L0 diff --git a/shared/source/device_binary_format/zebin/zeinfo.h b/shared/source/device_binary_format/zebin/zeinfo.h index 026da40cb4..16bcb95210 100644 --- a/shared/source/device_binary_format/zebin/zeinfo.h +++ b/shared/source/device_binary_format/zebin/zeinfo.h @@ -129,6 +129,7 @@ inline constexpr ConstStringRef regionGroupSize("region_group_size"); inline constexpr ConstStringRef regionGroupDimension("region_group_dimension"); inline constexpr ConstStringRef regionGroupWgCount("region_group_wg_count"); inline constexpr ConstStringRef regionGroupBarrierBuffer("region_group_barrier_buffer"); +inline constexpr ConstStringRef inlineSampler("inline_sampler"); namespace Image { inline constexpr ConstStringRef width("image_width"); @@ -508,6 +509,7 @@ enum ArgType : uint8_t { argTypeRegionGroupDimension, argTypeRegionGroupWgCount, argTypeRegionGroupBarrierBuffer, + argTypeInlineSampler, argTypeMax }; diff --git a/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp b/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp index 880822afda..d3c9254c6e 100644 --- a/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp +++ b/shared/source/device_binary_format/zebin/zeinfo_decoder.cpp @@ -536,12 +536,12 @@ DecodeError decodeZeInfoKernelEntry(NEO::KernelDescriptor &dst, NEO::Yaml::YamlP return decodeError; } - decodeError = decodeZeInfoKernelPayloadArguments(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning); + decodeError = decodeZeInfoKernelInlineSamplers(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning); if (DecodeError::success != decodeError) { return decodeError; } - decodeError = decodeZeInfoKernelInlineSamplers(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning); + decodeError = decodeZeInfoKernelPayloadArguments(dst, yamlParser, zeInfokernelSections, outErrReason, outWarning); if (DecodeError::success != decodeError) { return decodeError; } @@ -1111,6 +1111,24 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern return DecodeError::success; }; + auto populateInlineSampler = [&src, &outErrReason, &kernelName](auto &dst, ConstStringRef typeName) { + if (dst.payloadMappings.samplerTable.numSamplers < src.samplerIndex) { + outErrReason.append("DeviceBinaryFormat::zebin : Invalid sampler index for argument of type " + typeName.str() + " in context of : " + kernelName + ".\n"); + return DecodeError::invalidBinary; + } + + if (src.addrmode == Types::Kernel::PayloadArgument::memoryAddressingModeBindless) { + for (auto &sampler : dst.inlineSamplers) { + if (static_cast(sampler.samplerIndex) == src.samplerIndex) { + sampler.bindless = src.offset; + sampler.size = src.size; + break; + } + } + } + return DecodeError::success; + }; + switch (src.argType) { default: outErrReason.append("DeviceBinaryFormat::zebin : Invalid arg type in cross thread data section in context of : " + kernelName + ".\n"); @@ -1398,6 +1416,9 @@ DecodeError populateKernelPayloadArgument(NEO::KernelDescriptor &dst, const Kern case Types::Kernel::argTypeRegionGroupBarrierBuffer: dst.kernelAttributes.flags.usesRegionGroupBarrier = true; return populateArgPointerStateless(dst.payloadMappings.implicitArgs.regionGroupBarrierBuffer); + + case Types::Kernel::argTypeInlineSampler: + return populateInlineSampler(dst, Tags::Kernel::PayloadArgument::ArgType::inlineSampler); } UNREACHABLE(); diff --git a/shared/source/device_binary_format/zebin/zeinfo_enum_lookup.h b/shared/source/device_binary_format/zebin/zeinfo_enum_lookup.h index 91eaf07172..5c249e709a 100644 --- a/shared/source/device_binary_format/zebin/zeinfo_enum_lookup.h +++ b/shared/source/device_binary_format/zebin/zeinfo_enum_lookup.h @@ -22,7 +22,7 @@ using namespace Tags::Kernel::PayloadArgument::ArgType::Sampler::Vme; using ArgType = Types::Kernel::ArgType; inline constexpr ConstStringRef name = "argument type"; -inline constexpr LookupArray lookup({{ +inline constexpr LookupArray lookup({{ {packedLocalIds, ArgType::argTypePackedLocalIds}, {localId, ArgType::argTypeLocalId}, {localSize, ArgType::argTypeLocalSize}, @@ -68,6 +68,7 @@ inline constexpr LookupArray lookup({{ {regionGroupDimension, ArgType::argTypeRegionGroupDimension}, {regionGroupWgCount, ArgType::argTypeRegionGroupWgCount}, {regionGroupBarrierBuffer, ArgType::argTypeRegionGroupBarrierBuffer}, + {inlineSampler, ArgType::argTypeInlineSampler}, }}); static_assert(lookup.size() == ArgType::argTypeMax - 1, "Every enum field must be present"); } // namespace ArgType diff --git a/shared/source/kernel/kernel_descriptor.h b/shared/source/kernel/kernel_descriptor.h index 7f93735c52..2c9ae83d1f 100644 --- a/shared/source/kernel/kernel_descriptor.h +++ b/shared/source/kernel/kernel_descriptor.h @@ -234,6 +234,9 @@ struct KernelDescriptor { bool isNormalized; AddrMode addrMode; FilterMode filterMode; + CrossThreadDataOffset bindless = undefined; + uint8_t size = undefined; + constexpr uint32_t getSamplerBindfulOffset() const { return borderColorStateSize + samplerStateSize * samplerIndex; } diff --git a/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp b/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp index e181f21805..246284852b 100644 --- a/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp +++ b/shared/test/unit_test/device_binary_format/zebin_decoder_tests.cpp @@ -6814,6 +6814,9 @@ kernels: EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::AddrMode::clampEdge, inlineSampler.addrMode); EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::FilterMode::nearest, inlineSampler.filterMode); EXPECT_TRUE(inlineSampler.isNormalized); + + EXPECT_EQ(undefined, inlineSampler.bindless); + EXPECT_EQ(undefined, inlineSampler.size); } TEST_F(decodeZeInfoKernelEntryTest, GivenInvalidInlineSamplersEntryThenPopulateKernelDescriptorFails) { @@ -6848,6 +6851,59 @@ kernels: EXPECT_EQ(NEO::DecodeError::invalidBinary, err); } +TEST_F(decodeZeInfoKernelEntryTest, GivenBindlessInlineSamplersThenPopulateKernelDescriptorSetsBindlessOffsetAndSize) { + ConstStringRef zeinfo = R"===( +kernels: + - name : some_kernel + execution_env: + simd_size: 8 + has_sample: true + payload_arguments: + - arg_type: inline_sampler + offset: 40 + size: 4 + addrmode: bindless + addrspace: sampler + sampler_index: 0 + - arg_type: inline_sampler + offset: 44 + size: 4 + addrmode: bindless + addrspace: sampler + sampler_index: 1 + inline_samplers: + - sampler_index: 1 + addrmode: none + filtermode: nearest + - sampler_index: 0 + addrmode: repeat + filtermode: linear + normalized: true +... +)==="; + auto err = decodeZeInfoKernelEntry(zeinfo); + EXPECT_EQ(NEO::DecodeError::success, err); + EXPECT_TRUE(errors.empty()); + + ASSERT_EQ(2U, kernelDescriptor->inlineSamplers.size()); + + const auto &inlineSampler1 = kernelDescriptor->inlineSamplers[0]; + EXPECT_EQ(1U, inlineSampler1.samplerIndex); + EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::AddrMode::none, inlineSampler1.addrMode); + EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::FilterMode::nearest, inlineSampler1.filterMode); + EXPECT_FALSE(inlineSampler1.isNormalized); + EXPECT_EQ(44u, inlineSampler1.bindless); + EXPECT_EQ(4u, inlineSampler1.size); + + const auto &inlineSampler0 = kernelDescriptor->inlineSamplers[1]; + EXPECT_EQ(0U, inlineSampler0.samplerIndex); + EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::AddrMode::repeat, inlineSampler0.addrMode); + EXPECT_EQ(NEO::KernelDescriptor::InlineSampler::FilterMode::linear, inlineSampler0.filterMode); + EXPECT_TRUE(inlineSampler0.isNormalized); + EXPECT_EQ(40u, inlineSampler0.bindless); + EXPECT_EQ(4u, inlineSampler0.size); +} + TEST_F(decodeZeInfoKernelEntryTest, givenGlobalBufferAndConstBufferWhenPopulatingKernelDescriptorThenPopulateThemProperly) { NEO::ConstStringRef zeinfo = R"===( kernels: