// Review notes for this patch. Four files are touched; the patch text below is unchanged.
//
// 1) level_zero/core/source/kernel/kernel_imp.cpp
//    KernelImp::setArgRedescribedImage and KernelImp::setArgImage:
//    the value patched into cross-thread data at arg.bindless now depends on
//    this->heaplessEnabled.
//      - heaplessEnabled:  patchSize is 8 and the value is
//        bindlessSlotOffset + bindlessHeapsHelper->getGlobalHeapsBase().
//      - otherwise:        patchSize is 4 and the value is
//        gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(bindlessSlotOffset),
//        i.e. the same value the removed lines patched (they used
//        sizeof(patchValue) as the size).
//    In setArgRedescribedImage, bindlessSlotOffset is additionally declared as
//    uint64_t instead of auto.
//    NOTE(review): bindlessHeapsHelper is declared outside the visible hunks;
//    confirm it cannot be null when heaplessEnabled is true.
//
// 2) level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp
//    Adds one HWTEST2_F case that loops over heaplessEnabled in {false, true}.
//    Each iteration creates the kernel and the bindless heaps helper, switches
//    explicit arg 3 to bindless addressing with imageArg.bindless = 0x8, calls
//    setArgRedescribedImage(3, handle), and compares the value read back from
//    crossThreadData at that offset against the same expression the kernel
//    uses. The read-back is stored in a uint64_t local when heapless and in a
//    uint32_t local otherwise.
//    NOTE(review): the locals are named "patchedValued" - presumably a typo
//    for "patchedValue".
//
// 3) shared/source/command_container/command_encoder_xehp_and_later.inl
//    EncodeDispatchKernel::encode:
//      - the "heaplessModeEnabled == false" term is dropped from the
//        if constexpr guard, so the sampler block is compiled for heapless too;
//      - samplerStateOffset becomes a uint64_t declared at the point where
//        copySamplerState is called (the earlier "uint32_t ... = 0" is removed);
//      - args.device->getBindlessHeapsHelper() is fetched once into a local;
//      - when heaplessModeEnabled and the helper is non-null, getGlobalHeapsBase()
//        is added to samplerStateOffset before it is passed to
//        patchSamplerBindlessOffsetsInCrossThreadData;
//      - idd.setSamplerStatePointer is now called only under
//        if constexpr (!heaplessModeEnabled), inside the numSamplers > 0 branch.
//    NOTE(review): the removed call sat after the numSamplers > 0 branch, so it
//    also ran (with offset 0) when there were no samplers; with this patch it
//    is not called in that case - confirm the default pointer value is acceptable.
//    NOTE(review): heaplessModeEnabled is tested with a plain "if" in one place
//    and with "if constexpr" in another - confirm this is intentional.
//
// 4) shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h
//    MockDispatchKernelEncoder::patchSamplerBindlessOffsetsInCrossThreadData now
//    stores its argument in the new mutable member samplerStateOffsetPassed
//    (initialised to 0u) instead of returning immediately.
//
// NOTE(review): casts and templates in this text carry no angle-bracket
// arguments (e.g. "static_cast(bindlessSlotOffset)", "std::make_unique>()");
// presumably they were lost when the patch was extracted - verify against the
// original commit before applying.
diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 9e981844c6..6a9d85d77b 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -619,9 +619,13 @@ ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle auto ssInHeap = image->getBindlessSlot(); auto patchLocation = ptrOffset(getCrossThreadData(), arg.bindless); // redescribed image's surface state is after image's implicit args and sampler - auto bindlessSlotOffset = ssInHeap->surfaceStateOffset + surfaceStateSize * NEO::BindlessImageSlot::redescribedImage; - auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(bindlessSlotOffset)); - patchWithRequiredSize(const_cast(patchLocation), sizeof(patchValue), patchValue); + uint64_t bindlessSlotOffset = ssInHeap->surfaceStateOffset + surfaceStateSize * NEO::BindlessImageSlot::redescribedImage; + uint32_t patchSize = this->heaplessEnabled ? 8u : 4u; + uint64_t patchValue = this->heaplessEnabled + ? 
bindlessSlotOffset + bindlessHeapsHelper->getGlobalHeapsBase() + : gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(bindlessSlotOffset)); + + patchWithRequiredSize(const_cast(patchLocation), patchSize, patchValue); image->copyRedescribedSurfaceStateToSSH(ptrOffset(ssInHeap->ssPtr, surfaceStateSize * NEO::BindlessImageSlot::redescribedImage), 0u); isBindlessOffsetSet[argIndex] = true; @@ -812,8 +816,12 @@ ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void auto ssInHeap = image->getBindlessSlot(); auto patchLocation = ptrOffset(getCrossThreadData(), arg.bindless); auto bindlessSlotOffset = ssInHeap->surfaceStateOffset; - auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(bindlessSlotOffset)); - patchWithRequiredSize(const_cast(patchLocation), sizeof(patchValue), patchValue); + uint32_t patchSize = this->heaplessEnabled ? 8u : 4u; + uint64_t patchValue = this->heaplessEnabled + ? bindlessSlotOffset + bindlessHeapsHelper->getGlobalHeapsBase() + : gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(bindlessSlotOffset)); + + patchWithRequiredSize(const_cast(patchLocation), patchSize, patchValue); image->copySurfaceStateToSSH(ssInHeap->ssPtr, 0u, isMediaBlockImage); image->copyImplicitArgsSurfaceStateToSSH(ptrOffset(ssInHeap->ssPtr, surfaceStateSize), 0u); diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 51c5bcb439..bf6039f028 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -3115,6 +3115,56 @@ HWTEST2_F(SetKernelArg, givenImageBindlessKernelAndGlobalBindlessHelperWhenSetAr EXPECT_EQ(0, std::count(kernel->argumentsResidencyContainer.begin(), kernel->argumentsResidencyContainer.end(), expectedSsInHeap.heapAllocation)); } +HWTEST2_F(SetKernelArg, 
givenHeaplessWhenPatchingImageWithBindlessEnabledCorrectSurfaceStateAddressIsPatchedInCrossThreadData, ImageSupport) { + + for (auto heaplessEnabled : {false, true}) { + + createKernel(); + kernel->heaplessEnabled = heaplessEnabled; + + neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(neoDevice, + neoDevice->getNumGenericSubDevices() > 1); + NEO::BindlessHeapsHelper *bindlessHeapsHelper = neoDevice->getBindlessHeapsHelper(); + ASSERT_NE(nullptr, bindlessHeapsHelper); + + auto &imageArg = const_cast(kernel->kernelImmData->getDescriptor().payloadMappings.explicitArgs[3].template as()); + auto &addressingMode = kernel->kernelImmData->getDescriptor().kernelAttributes.imageAddressingMode; + const_cast(addressingMode) = NEO::KernelDescriptor::Bindless; + imageArg.bindless = 0x8; + imageArg.bindful = undefined; + ze_image_desc_t desc = {}; + desc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + + auto imageHW = std::make_unique>(); + auto ret = imageHW->initialize(device, &desc); + auto handle = imageHW->toHandle(); + ASSERT_EQ(ZE_RESULT_SUCCESS, ret); + + ret = kernel->setArgRedescribedImage(3, handle); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); + + auto &gfxCoreHelper = neoDevice->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + auto ctd = kernel->crossThreadData.get(); + + auto ssInHeap = imageHW->getBindlessSlot(); + auto patchLocation = ptrOffset(ctd, imageArg.bindless); + uint64_t bindlessSlotOffset = ssInHeap->surfaceStateOffset + surfaceStateSize * NEO::BindlessImageSlot::redescribedImage; + uint64_t expectedPatchValue = kernel->heaplessEnabled + ? 
bindlessSlotOffset + bindlessHeapsHelper->getGlobalHeapsBase() + : gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(bindlessSlotOffset)); + + if (kernel->heaplessEnabled) { + uint64_t patchedValued = *(reinterpret_cast(patchLocation)); + EXPECT_EQ(expectedPatchValue, patchedValued); + } else { + uint32_t patchedValued = *(reinterpret_cast(patchLocation)); + EXPECT_EQ(static_cast(expectedPatchValue), patchedValued); + } + } +} + HWTEST2_F(SetKernelArg, givenGlobalBindlessHelperAndImageViewWhenAllocatingBindlessSlotThenViewHasDifferentSlotThanParentImage, ImageSupport) { createKernel(); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 72a1bea2b3..123db293c5 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -183,11 +183,9 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis uint32_t samplerCount = 0; - if constexpr (Family::supportsSampler && heaplessModeEnabled == false) { + if constexpr (Family::supportsSampler) { if (args.device->getDeviceInfo().imageSupport && !args.makeCommandView) { - uint32_t samplerStateOffset = 0; - if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0) { auto dsHeap = args.dynamicStateHeap; if (dsHeap == nullptr) { @@ -199,22 +197,28 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } UNRECOVERABLE_IF(!dsHeap); + auto bindlessHeapsHelper = args.device->getBindlessHeapsHelper(); samplerCount = kernelDescriptor.payloadMappings.samplerTable.numSamplers; - samplerStateOffset = EncodeStates::copySamplerState( + uint64_t samplerStateOffset = EncodeStates::copySamplerState( dsHeap, kernelDescriptor.payloadMappings.samplerTable.tableOffset, - kernelDescriptor.payloadMappings.samplerTable.numSamplers, 
kernelDescriptor.payloadMappings.samplerTable.borderColor, + kernelDescriptor.payloadMappings.samplerTable.numSamplers, + kernelDescriptor.payloadMappings.samplerTable.borderColor, args.dispatchInterface->getDynamicStateHeapData(), - args.device->getBindlessHeapsHelper(), rootDeviceEnvironment); + bindlessHeapsHelper, rootDeviceEnvironment); - if (args.device->getBindlessHeapsHelper() && !args.device->getBindlessHeapsHelper()->isGlobalDshSupported()) { + if (bindlessHeapsHelper && !bindlessHeapsHelper->isGlobalDshSupported()) { // add offset of graphics allocation base address relative to heap base address - samplerStateOffset += static_cast(ptrDiff(dsHeap->getGpuBase(), args.device->getBindlessHeapsHelper()->getGlobalHeapsBase())); + samplerStateOffset += static_cast(ptrDiff(dsHeap->getGpuBase(), bindlessHeapsHelper->getGlobalHeapsBase())); + } + if (heaplessModeEnabled && bindlessHeapsHelper) { + samplerStateOffset += bindlessHeapsHelper->getGlobalHeapsBase(); } args.dispatchInterface->patchSamplerBindlessOffsetsInCrossThreadData(samplerStateOffset); + if constexpr (!heaplessModeEnabled) { + idd.setSamplerStatePointer(static_cast(samplerStateOffset)); + } } - - idd.setSamplerStatePointer(samplerStateOffset); } } diff --git a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h index 2c5f78e1bf..537ee47530 100644 --- a/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h +++ b/shared/test/unit_test/mocks/mock_dispatch_kernel_encoder_interface.h @@ -30,7 +30,9 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI { NEO::ImplicitArgs *getImplicitArgs() const override { return nullptr; } void patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceStateBaseOffset) const override { return; }; - void patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerStateOffset) const override { return; }; + void 
patchSamplerBindlessOffsetsInCrossThreadData(uint64_t samplerStateOffset) const override { + samplerStateOffsetPassed = samplerStateOffset; + } MockGraphicsAllocation mockAllocation{}; static constexpr uint32_t crossThreadSize = 0x40; @@ -41,6 +43,8 @@ struct MockDispatchKernelEncoder : public DispatchKernelEncoderI { uint32_t requiredWalkGroupOrder = 0x0u; KernelDescriptor kernelDescriptor{}; + mutable uint64_t samplerStateOffsetPassed = 0u; + ADDMETHOD_CONST_NOBASE(getKernelDescriptor, const KernelDescriptor &, kernelDescriptor, ()); ADDMETHOD_CONST_NOBASE(getGroupSize, const uint32_t *, groupSizes, ()); ADDMETHOD_CONST_NOBASE(getSlmTotalSize, uint32_t, 0u, ());