From 1c37da280c2685ecd2a72d66d0ba4d7f8b237b6d Mon Sep 17 00:00:00 2001 From: Mateusz Hoppe Date: Mon, 6 Nov 2023 19:07:42 +0000 Subject: [PATCH] fix: fix bindless offset patching for images - usingSurfaceStateHeap indicates if any of the args is using local ssh in bindless kernels: without global allocator - ssh is used for all args with global bindless allocator - ssh used only for buffer with offset set in surface state, otherwise not used When any of the args is using ssh - getSurfaceStateHeapDataSize() returns non-zero size. Related-To: NEO-7063 Signed-off-by: Mateusz Hoppe --- level_zero/core/source/kernel/kernel_imp.cpp | 3 +- .../unit_tests/sources/kernel/test_kernel.cpp | 106 ++++++++++++++++++ .../command_encoder_bdw_and_later.inl | 10 +- .../command_encoder_xehp_and_later.inl | 10 +- 4 files changed, 120 insertions(+), 9 deletions(-) diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 9412539ec9..166a7139b3 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -580,7 +580,7 @@ ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle isBindlessOffsetSet[argIndex] = true; this->residencyContainer.push_back(ssInHeap->heapAllocation); } else { - + usingSurfaceStateHeap[argIndex] = true; auto ssPtr = ptrOffset(surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(arg.bindless) * surfaceStateSize); image->copyRedescribedSurfaceStateToSSH(ssPtr, 0u); } @@ -781,6 +781,7 @@ ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void isBindlessOffsetSet[argIndex] = true; this->residencyContainer.push_back(ssInHeap->heapAllocation); } else { + usingSurfaceStateHeap[argIndex] = true; auto ssPtr = ptrOffset(surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(arg.bindless) * surfaceStateSize); image->copySurfaceStateToSSH(ssPtr, 0u, isMediaBlockImage); } diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index 4ab54d9292..d2698cfe39 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -2044,6 +2044,32 @@ TEST_F(KernelImpPatchBindlessTest, GivenKernelImpWhenPatchBindlessOffsetCalledTh neoDevice->decRefInternal(); } +HWTEST2_F(KernelImpPatchBindlessTest, GivenBindlessKernelAndNoGlobalBindlessAllocatorWhenInitializedThenBindlessOffsetSetAndUsingSurfaceStateAreFalse, MatchAny) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + + ModuleBuildLog *moduleBuildLog = nullptr; + this->module.reset(new WhiteBox<::L0::Module>{this->device, moduleBuildLog, ModuleType::User}); + this->createModuleFromMockBinary(ModuleType::User); + + for (auto &kernelImmData : this->module->kernelImmDatas) { + auto &arg = const_cast(kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as()); + arg.bindless = 0x40; + arg.bindful = undefined; + const_cast(kernelImmData->getDescriptor()).kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + const_cast(kernelImmData->getDescriptor()).kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + } + + ze_kernel_desc_t desc = {}; + desc.pKernelName = kernelName.c_str(); + + WhiteBoxKernelHw mockKernel; + mockKernel.module = module.get(); + mockKernel.initialize(&desc); + + EXPECT_FALSE(mockKernel.isBindlessOffsetSet[0]); + EXPECT_FALSE(mockKernel.usingSurfaceStateHeap[0]); +} + HWTEST2_F(KernelImpPatchBindlessTest, GivenKernelImpWhenSetSurfaceStateBindlessThenSurfaceStateUpdated, MatchAny) { using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; @@ -2135,6 +2161,58 @@ HWTEST2_F(KernelImpPatchBindlessTest, GivenMisalignedBufferAddressWhenSettingSur EXPECT_EQ(mockKernel.surfaceStateHeapDataSize, mockKernel.getSurfaceStateHeapDataSize()); } +HWTEST2_F(KernelImpPatchBindlessTest, GivenMisalignedAndAlignedBufferAddressWhenSettingSurfaceStateThenKernelReportsNonZeroSurfaceStateHeapDataSize, MatchAny) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + + ze_kernel_desc_t desc = {}; + desc.pKernelName = kernelName.c_str(); + + WhiteBoxKernelHw mockKernel; + mockKernel.module = module.get(); + mockKernel.initialize(&desc); + auto &arg = const_cast(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as()); + arg.bindless = 0x40; + arg.bindful = undefined; + auto &arg2 = const_cast(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[1].template as()); + arg2.bindless = 0x48; + arg2.bindful = undefined; + const_cast(mockKernel.kernelImmData->getDescriptor()).kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + const_cast(mockKernel.kernelImmData->getDescriptor()).kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + const_cast(mockKernel.kernelImmData->getDescriptor()).initBindlessOffsetToSurfaceState(); + + neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(neoDevice->getMemoryManager(), + neoDevice->getNumGenericSubDevices() > 1, + neoDevice->getRootDeviceIndex(), + neoDevice->getDeviceBitfield()); + + auto &gfxCoreHelper = device->getGfxCoreHelper(); + size_t size = gfxCoreHelper.getRenderSurfaceStateSize(); + uint64_t gpuAddress = 0x2000; + void *buffer = reinterpret_cast(gpuAddress); + + NEO::MockGraphicsAllocation mockAllocation(buffer, gpuAddress, size); + auto expectedSsInHeap = device->getNEODevice()->getBindlessHeapsHelper()->allocateSSInHeap(size, &mockAllocation, NEO::BindlessHeapsHelper::GLOBAL_SSH); + mockAllocation.setBindlessInfo(expectedSsInHeap); + + memset(expectedSsInHeap.ssPtr, 0, size); + + // misaligned buffer - requires different surface state + mockKernel.setBufferSurfaceState(0, ptrOffset(buffer, 8), &mockAllocation); + // aligned buffer - using allocated bindless surface state + mockKernel.setBufferSurfaceState(1, buffer, &mockAllocation); + + auto surfaceStateOnSsh = reinterpret_cast(mockKernel.surfaceStateHeapData.get()); + + EXPECT_EQ(reinterpret_cast(ptrOffset(buffer, 8)), surfaceStateOnSsh->getSurfaceBaseAddress()); + EXPECT_FALSE(mockKernel.isBindlessOffsetSet[0]); + EXPECT_TRUE(mockKernel.usingSurfaceStateHeap[0]); + + EXPECT_TRUE(mockKernel.isBindlessOffsetSet[1]); + EXPECT_FALSE(mockKernel.usingSurfaceStateHeap[1]); + + EXPECT_EQ(mockKernel.surfaceStateHeapDataSize, mockKernel.getSurfaceStateHeapDataSize()); +} + HWTEST2_F(KernelImpPatchBindlessTest, GivenKernelImpWhenSetSurfaceStateBindfulThenSurfaceStateNotUpdated, MatchAny) { using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; ze_kernel_desc_t desc = {}; @@ -2539,6 +2617,30 @@ HWTEST2_F(SetKernelArg, givenImageAndBindlessKernelWhenSetArgImageThenCopySurfac EXPECT_EQ(imageHW->passedSurfaceStateHeap, expectedSsInHeap.ssPtr); EXPECT_EQ(imageHW->passedSurfaceStateOffset, 0u); EXPECT_TRUE(kernel->isBindlessOffsetSet[3]); + EXPECT_FALSE(kernel->usingSurfaceStateHeap[3]); +} + +HWTEST2_F(SetKernelArg, givenNoGlobalAllocatorAndBindlessKernelWhenSetArgImageThenBindlessOffsetIsNotSetAndSshIsUsed, ImageSupport) { + createKernel(); + + auto &imageArg = const_cast(kernel->kernelImmData->getDescriptor().payloadMappings.explicitArgs[3].template as()); + auto &addressingMode = kernel->kernelImmData->getDescriptor().kernelAttributes.imageAddressingMode; + const_cast(addressingMode) = NEO::KernelDescriptor::Bindless; + imageArg.bindless = 0x0; + imageArg.bindful = undefined; + ze_image_desc_t desc = {}; + desc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + + auto imageHW = std::make_unique>(); + auto ret = imageHW->initialize(device, &desc); + auto handle = imageHW->toHandle(); + ASSERT_EQ(ZE_RESULT_SUCCESS, ret); + + ret = kernel->setArgImage(3, sizeof(imageHW.get()), &handle); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); + + EXPECT_FALSE(kernel->isBindlessOffsetSet[3]); + EXPECT_TRUE(kernel->usingSurfaceStateHeap[3]); } HWTEST2_F(SetKernelArg, givenBindlessKernelAndNoAvailableSpaceOnSshWhenSetArgImageCalledThenOutOfMemoryErrorReturned, ImageSupport) { @@ -2606,6 +2708,7 @@ HWTEST2_F(SetKernelArg, givenImageBindlessKernelAndGlobalBindlessHelperWhenSetAr EXPECT_EQ(imageHW->passedRedescribedSurfaceStateHeap, ptrOffset(expectedSsInHeap.ssPtr, surfaceStateSize)); EXPECT_EQ(imageHW->passedRedescribedSurfaceStateOffset, 0u); EXPECT_TRUE(kernel->isBindlessOffsetSet[3]); + EXPECT_FALSE(kernel->usingSurfaceStateHeap[3]); } HWTEST2_F(SetKernelArg, givenGlobalBindlessHelperAndImageViewWhenAllocatingBindlessSlotThenViewHasDifferentSlotThanParentImage, ImageSupport) { @@ -2744,6 +2847,8 @@ HWTEST2_F(SetKernelArg, givenImageAndBindlessKernelWhenSetArgRedescribedImageCal mockKernel.surfaceStateHeapData = std::make_unique(surfaceStateSize); mockKernel.descriptor.initBindlessOffsetToSurfaceState(); mockKernel.residencyContainer.resize(1); + mockKernel.isBindlessOffsetSet.resize(1, 0); + mockKernel.usingSurfaceStateHeap.resize(1, false); ze_image_desc_t desc = {}; desc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; @@ -2759,6 +2864,7 @@ HWTEST2_F(SetKernelArg, givenImageAndBindlessKernelWhenSetArgRedescribedImageCal auto expectedSsInHeap = ptrOffset(mockKernel.surfaceStateHeapData.get(), mockKernel.kernelImmData->getDescriptor().getBindlessOffsetToSurfaceState().find(0x0)->second * surfaceStateSize); EXPECT_EQ(imageHW->passedRedescribedSurfaceStateHeap, expectedSsInHeap); EXPECT_EQ(imageHW->passedRedescribedSurfaceStateOffset, 0u); + EXPECT_TRUE(mockKernel.usingSurfaceStateHeap[0]); } HWTEST2_F(SetKernelArg, givenBindlessKernelAndNoAvailableSpaceOnSshWhenSetArgRedescribedImageCalledThenOutOfMemoryErrorReturned, ImageSupport) { diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 462835b21a..d870575f56 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -116,19 +116,21 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } } else { bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr; - if (args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { + auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize(); + + if (sshHeapSize > 0u) { auto ssh = args.surfaceStateHeap; if (ssh == nullptr) { container.prepareBindfulSsh(); - ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, sshHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); } uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); if (globalBindlessSsh) { bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress()); } // Allocate space for new ssh data - auto dstSurfaceState = ssh->getSpace(args.dispatchInterface->getSurfaceStateHeapDataSize()); - memcpy_s(dstSurfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), args.dispatchInterface->getSurfaceStateHeapData(), args.dispatchInterface->getSurfaceStateHeapDataSize()); + auto dstSurfaceState = ssh->getSpace(sshHeapSize); + memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize); args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset); } } diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 9f3ec60e55..49cba08baa 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -145,19 +145,21 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } } else { bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr; - if (args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { + auto sshHeapSize = args.dispatchInterface->getSurfaceStateHeapDataSize(); + + if (sshHeapSize > 0u) { auto ssh = args.surfaceStateHeap; if (ssh == nullptr) { container.prepareBindfulSsh(); - ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); + ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, sshHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); } uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); if (globalBindlessSsh) { bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress()); } // Allocate space for new ssh data - auto dstSurfaceState = ssh->getSpace(args.dispatchInterface->getSurfaceStateHeapDataSize()); - memcpy_s(dstSurfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), args.dispatchInterface->getSurfaceStateHeapData(), args.dispatchInterface->getSurfaceStateHeapDataSize()); + auto dstSurfaceState = ssh->getSpace(sshHeapSize); + memcpy_s(dstSurfaceState, sshHeapSize, args.dispatchInterface->getSurfaceStateHeapData(), sshHeapSize); args.dispatchInterface->patchBindlessOffsetsInCrossThreadData(bindlessSshBaseOffset); } }