diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index 89eda5c1f4..e0a488e81e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -70,6 +70,11 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K NEO::EncodeDispatchKernel::getSizeRequiredSsh(*kernelInfo), NEO::EncodeDispatchKernel::getDefaultSshAlignment()}; + // update SSH size - when global bindless addressing is used, kernel args may not require ssh space + if (kernel->getSurfaceStateHeapDataSize() == 0) { + sshReserveArgs.size = 0; + } + auto &dshReserveConfig = commandContainer.getDynamicStateHeapReserve(); NEO::HeapReserveArguments dshReserveArgs = { dshReserveConfig.indirectHeapReservation, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 15cd684335..a4e3b45c88 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -135,7 +135,8 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K NEO::EncodeDispatchKernel::getSizeRequiredSsh(*kernelInfo), NEO::EncodeDispatchKernel::getDefaultSshAlignment()}; - if (device->getNEODevice()->getBindlessHeapsHelper() && NEO::KernelDescriptor::isBindlessAddressingKernel(kernelImmutableData->getDescriptor())) { + // update SSH size - when global bindless addressing is used, kernel args may not require ssh space + if (kernel->getSurfaceStateHeapDataSize() == 0) { sshReserveArgs.size = 0; } diff --git a/level_zero/core/source/kernel/kernel_hw.h b/level_zero/core/source/kernel/kernel_hw.h index fb8277bd81..8a2cdbe264 100644 --- a/level_zero/core/source/kernel/kernel_hw.h +++ b/level_zero/core/source/kernel/kernel_hw.h @@ -46,11 +46,14 @@ struct KernelHw : public KernelImp { auto argInfo = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as(); bool offsetWasPatched = NEO::patchNonPointer(ArrayRef(this->crossThreadData.get(), this->crossThreadDataSize), argInfo.bufferOffset, static_cast(offset)); + bool offsetedAddress = false; if (false == offsetWasPatched) { // fallback to handling offset in surface state + offsetedAddress = baseAddress != reinterpret_cast(address); baseAddress = reinterpret_cast(address); bufferSizeForSsh -= offset; DEBUG_BREAK_IF(baseAddress != (baseAddress & sshAlignmentMask)); + offset = 0; } void *surfaceStateAddress = nullptr; @@ -61,9 +64,13 @@ struct KernelHw : public KernelImp { surfaceState = *reinterpret_cast(surfaceStateAddress); } else if (NEO::isValidOffset(argInfo.bindless)) { - if (this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper()) { + isBindlessOffsetSet[argIndex] = false; + usingSurfaceStateHeap[argIndex] = false; + if (this->module->getDevice()->getNEODevice()->getBindlessHeapsHelper() && !offsetedAddress) { surfaceStateAddress = patchBindlessSurfaceState(alloc, argInfo.bindless); + isBindlessOffsetSet[argIndex] = true; } else { + usingSurfaceStateHeap[argIndex] = true; surfaceStateAddress = ptrOffset(surfaceStateHeapData.get(), getSurfaceStateIndexForBindlessOffset(argInfo.bindless) * sizeof(typename GfxFamily::RENDER_SURFACE_STATE)); } } diff --git a/level_zero/core/source/kernel/kernel_imp.cpp b/level_zero/core/source/kernel/kernel_imp.cpp index 8daf105fc3..753cfa68dc 100644 --- a/level_zero/core/source/kernel/kernel_imp.cpp +++ b/level_zero/core/source/kernel/kernel_imp.cpp @@ -569,6 +569,7 @@ ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle patchWithRequiredSize(const_cast(patchLocation), sizeof(patchValue), patchValue); image->copyRedescribedSurfaceStateToSSH(ptrOffset(ssInHeap.ssPtr, surfaceStateSize), 0u); + isBindlessOffsetSet[argIndex] = true; this->residencyContainer.push_back(ssInHeap.heapAllocation); } else { @@ -764,6 +765,7 @@ ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void } auto ssPtr = patchBindlessSurfaceState(image->getAllocation(), arg.bindless); + isBindlessOffsetSet[argIndex] = true; image->copySurfaceStateToSSH(ssPtr, 0u, isMediaBlockImage); } else { auto &gfxCoreHelper = this->module->getDevice()->getNEODevice()->getRootDeviceEnvironmentRef().getHelper(); @@ -976,6 +978,8 @@ ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) { slmArgSizes.resize(this->kernelArgHandlers.size(), 0); kernelArgInfos.resize(this->kernelArgHandlers.size(), {}); isArgUncached.resize(this->kernelArgHandlers.size(), 0); + isBindlessOffsetSet.resize(this->kernelArgHandlers.size(), 0); + usingSurfaceStateHeap.resize(this->kernelArgHandlers.size(), 0); if (kernelImmData->getSurfaceStateHeapSize() > 0) { this->surfaceStateHeapData.reset(new uint8_t[kernelImmData->getSurfaceStateHeapSize()]); @@ -1253,7 +1257,7 @@ void KernelImp::patchBindlessOffsetsInCrossThreadData(uint64_t bindlessSurfaceSt auto patchLocation = ptrOffset(getCrossThreadData(), crossThreadOffset); auto index = getSurfaceStateIndexForBindlessOffset(crossThreadOffset); - if (index < std::numeric_limits::max()) { + if (index < std::numeric_limits::max() && !isBindlessOffsetSet[argIndex]) { auto surfaceStateOffset = static_cast(bindlessSurfaceStateBaseOffset + index * surfaceStateSize); auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(surfaceStateOffset)); diff --git a/level_zero/core/source/kernel/kernel_imp.h b/level_zero/core/source/kernel/kernel_imp.h index 437927589f..d909cd094c 100644 --- a/level_zero/core/source/kernel/kernel_imp.h +++ b/level_zero/core/source/kernel/kernel_imp.h @@ -108,7 +108,14 @@ struct KernelImp : Kernel { void patchSyncBuffer(NEO::GraphicsAllocation *gfxAllocation, size_t bufferOffset) override; const uint8_t *getSurfaceStateHeapData() const override { return surfaceStateHeapData.get(); } - uint32_t getSurfaceStateHeapDataSize() const override { return surfaceStateHeapDataSize; } + uint32_t getSurfaceStateHeapDataSize() const override { + if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelImmData->getDescriptor())) { + if (std::none_of(usingSurfaceStateHeap.cbegin(), usingSurfaceStateHeap.cend(), [](bool i) { return i; })) { + return 0; + } + } + return surfaceStateHeapDataSize; + } const uint8_t *getDynamicStateHeapData() const override { return dynamicStateHeapData.get(); } @@ -230,6 +237,8 @@ struct KernelImp : Kernel { uint32_t kernelRequiresUncachedMocsCount = 0; uint32_t kernelRequiresQueueUncachedMocsCount = 0; std::vector isArgUncached; + std::vector isBindlessOffsetSet; + std::vector usingSurfaceStateHeap; uint32_t globalOffsets[3] = {}; diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp index ac9d1088f0..a062219ba2 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.cpp @@ -124,8 +124,8 @@ void ModuleImmutableDataFixture::tearDown() { DeviceFixture::tearDown(); } -L0::Module *ModuleFixture::ProxyModuleImp::create(L0::Device *device, const ze_module_desc_t *desc, - ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result) { +ModuleFixture::ProxyModuleImp *ModuleFixture::ProxyModuleImp::create(L0::Device *device, const ze_module_desc_t *desc, + ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result) { auto module = new ProxyModuleImp(device, moduleBuildLog, type); *result = module->initialize(desc, device->getNEODevice()); diff --git a/level_zero/core/test/unit_tests/fixtures/module_fixture.h b/level_zero/core/test/unit_tests/fixtures/module_fixture.h index 18ae3a75b3..7b7252152b 100644 --- a/level_zero/core/test/unit_tests/fixtures/module_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/module_fixture.h @@ -13,6 +13,7 @@ #include "level_zero/core/source/module/module_imp.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_kernel.h" +#include "level_zero/core/test/unit_tests/mocks/mock_module.h" namespace L0 { namespace ult { @@ -122,15 +123,16 @@ struct ModuleImmutableDataFixture : public DeviceFixture { struct ModuleFixture : public DeviceFixture { - struct ProxyModuleImp : public ModuleImp { - using ModuleImp::ModuleImp; + struct ProxyModuleImp : public WhiteBox<::L0::Module> { + using BaseClass = WhiteBox<::L0::Module>; + using BaseClass::BaseClass; std::vector> &getKernelImmDatas() { return kernelImmDatas; } - static L0::Module *create(L0::Device *device, const ze_module_desc_t *desc, - ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result); + static ModuleFixture::ProxyModuleImp *create(L0::Device *device, const ze_module_desc_t *desc, + ModuleBuildLog *moduleBuildLog, ModuleType type, ze_result_t *result); }; void setUp(); @@ -145,7 +147,7 @@ struct ModuleFixture : public DeviceFixture { const std::string kernelName = "test"; const uint32_t numKernelArguments = 6; - std::unique_ptr module; + std::unique_ptr module; std::unique_ptr> kernel; std::unique_ptr zebinData; DebugManagerStateRestore restore; diff --git a/level_zero/core/test/unit_tests/mocks/mock_builtin_functions_lib_impl_timestamps.h b/level_zero/core/test/unit_tests/mocks/mock_builtin_functions_lib_impl_timestamps.h index 064372fa5c..690bc97cdf 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_builtin_functions_lib_impl_timestamps.h +++ b/level_zero/core/test/unit_tests/mocks/mock_builtin_functions_lib_impl_timestamps.h @@ -54,7 +54,7 @@ struct MockBuiltinFunctionsLibImplTimestamps : BuiltinFunctionsLibImpl { [[maybe_unused]] ze_result_t res; - Module *module; + L0::Module *module; ze_module_handle_t moduleHandle; ze_module_desc_t moduleDesc = {}; moduleDesc.format = builtInCode.type == BuiltInCodeType::Binary ? ZE_MODULE_FORMAT_NATIVE : ZE_MODULE_FORMAT_IL_SPIRV; diff --git a/level_zero/core/test/unit_tests/mocks/mock_kernel.h b/level_zero/core/test/unit_tests/mocks/mock_kernel.h index 130a6b56e5..e63817d70a 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_kernel.h +++ b/level_zero/core/test/unit_tests/mocks/mock_kernel.h @@ -48,6 +48,7 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp { using ::L0::KernelImp::dynamicStateHeapData; using ::L0::KernelImp::dynamicStateHeapDataSize; using ::L0::KernelImp::groupSize; + using ::L0::KernelImp::isBindlessOffsetSet; using ::L0::KernelImp::kernelHasIndirectAccess; using ::L0::KernelImp::kernelImmData; using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime; @@ -69,6 +70,7 @@ struct WhiteBox<::L0::KernelImp> : public ::L0::KernelImp { using ::L0::KernelImp::surfaceStateHeapData; using ::L0::KernelImp::surfaceStateHeapDataSize; using ::L0::KernelImp::unifiedMemoryControls; + using ::L0::KernelImp::usingSurfaceStateHeap; void setBufferSurfaceState(uint32_t argIndex, void *address, NEO::GraphicsAllocation *alloc) override {} diff --git a/level_zero/core/test/unit_tests/mocks/mock_module.h b/level_zero/core/test/unit_tests/mocks/mock_module.h index b172dfc33a..de70f6e007 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_module.h +++ b/level_zero/core/test/unit_tests/mocks/mock_module.h @@ -51,6 +51,7 @@ struct WhiteBox<::L0::Module> : public ::L0::ModuleImp { using BaseClass::allocatePrivateMemoryPerDispatch; using BaseClass::BaseClass; using BaseClass::builtFromSPIRv; + using BaseClass::checkIfPrivateMemoryPerDispatchIsNeeded; using BaseClass::copyPatchedSegments; using BaseClass::device; using BaseClass::exportedFunctionsSurface; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp index 253020eb0f..109e3401ee 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp @@ -1746,6 +1746,92 @@ HWTEST2_F(CommandListBindlessSshPrivateHeapTest, EXPECT_EQ(globalBindlessBase, sbaCmd->getBindlessSurfaceStateBaseAddress()); } +HWTEST2_F(CommandListBindlessSshPrivateHeapTest, + givenBindlessKernelStateBaseAddressTrackingAndGlobalBindlessEnabledWhenOneArgUsesKernelsSshThenReservedSshSizeIsNonZero, + IsAtLeastSkl) { + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + auto mockHelper = std::make_unique(device->getNEODevice()->getMemoryManager(), + device->getNEODevice()->getNumGenericSubDevices() > 1, + device->getNEODevice()->getRootDeviceIndex(), + device->getNEODevice()->getDeviceBitfield()); + mockHelper->globalBindlessDsh = false; + auto globalBindlessBase = mockHelper->getGlobalHeapsBase(); + device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getNEODevice()->getRootDeviceIndex()]->bindlessHeapsHelper.reset(mockHelper.release()); + + EXPECT_TRUE(commandList->stateBaseAddressTracking); + + auto &container = commandList->getCmdContainer(); + auto &cmdListStream = *container.getCommandStream(); + + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + + mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x0; + mockKernel.crossThreadData = std::make_unique(4 * sizeof(uint64_t)); + mockKernel.crossThreadDataSize = 4 * sizeof(uint64_t); + const auto surfStateSize = static_cast(device->getNEODevice()->getGfxCoreHelper().getRenderSurfaceStateSize()); + mockKernel.surfaceStateHeapData = std::make_unique(surfStateSize); + mockKernel.surfaceStateHeapDataSize = surfStateSize; + mockKernel.info.heapInfo.surfaceStateHeapSize = surfStateSize; + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + mockKernel.descriptor.initBindlessOffsetToSurfaceState(); + mockKernel.usingSurfaceStateHeap.resize(1, false); + mockKernel.isBindlessOffsetSet.resize(1, false); + mockKernel.usingSurfaceStateHeap[0] = true; + + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + auto result = commandList->appendLaunchKernel(mockKernel.toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + cmdListStream.getCpuBase(), + cmdListStream.getUsed())); + auto sbaCmds = findAll(cmdList.begin(), cmdList.end()); + EXPECT_EQ(1u, sbaCmds.size()); + + auto sshHeap = container.getIndirectHeap(NEO::HeapType::SURFACE_STATE); + EXPECT_NE(nullptr, sshHeap); + + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto &cmdQueueStream = commandQueue->commandStream; + size_t queueBefore = cmdQueueStream.getUsed(); + ze_command_list_handle_t cmdListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &cmdListHandle, nullptr, true); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + size_t queueAfter = cmdQueueStream.getUsed(); + + cmdList.clear(); + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, + ptrOffset(cmdQueueStream.getCpuBase(), queueBefore), + queueAfter - queueBefore)); + sbaCmds = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(expectedSbaCmds, sbaCmds.size()); + + auto sbaCmd = reinterpret_cast(*sbaCmds[0]); + EXPECT_TRUE(sbaCmd->getBindlessSurfaceStateBaseAddressModifyEnable()); + EXPECT_EQ(globalBindlessBase, sbaCmd->getBindlessSurfaceStateBaseAddress()); + + auto offsetInHeap = ptrDiff(sshHeap->getSpace(0), sshHeap->getCpuBase()) - surfStateSize; + uint64_t bindlessSshBaseOffset = ptrDiff(sshHeap->getGraphicsAllocation()->getGpuAddress(), sshHeap->getGraphicsAllocation()->getGpuBaseAddress()) + offsetInHeap; + auto patchValue = device->getNEODevice()->getGfxCoreHelper().getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(bindlessSshBaseOffset)); + auto patchLocation = reinterpret_cast(mockKernel.crossThreadData.get()); + EXPECT_EQ(patchValue, *patchLocation); +} + HWTEST2_F(CommandListStateBaseAddressPrivateHeapTest, givenStateBaseAddressTrackingWhenRegularCmdListAppendKernelChangesHeapsAndNextKernelIsAppendedThenFinalBaseAddressStateIsDispatchedInCommandListOnce, IsAtLeastSkl) { diff --git a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp index fe53c89e35..91b387e85f 100644 --- a/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/kernel/test_kernel.cpp @@ -63,6 +63,7 @@ struct WhiteBoxKernelHw : public KernelHw { using ::L0::KernelImp::dynamicStateHeapData; using ::L0::KernelImp::dynamicStateHeapDataSize; using ::L0::KernelImp::groupSize; + using ::L0::KernelImp::isBindlessOffsetSet; using ::L0::KernelImp::kernelImmData; using ::L0::KernelImp::kernelRequiresGenerationOfLocalIdsByRuntime; using ::L0::KernelImp::module; @@ -75,7 +76,9 @@ struct WhiteBoxKernelHw : public KernelHw { using ::L0::KernelImp::requiredWorkgroupOrder; using ::L0::KernelImp::residencyContainer; using ::L0::KernelImp::surfaceStateHeapData; + using ::L0::KernelImp::surfaceStateHeapDataSize; using ::L0::KernelImp::unifiedMemoryControls; + using ::L0::KernelImp::usingSurfaceStateHeap; void evaluateIfRequiresGenerationOfLocalIdsByRuntime(const NEO::KernelDescriptor &kernelDescriptor) override {} @@ -2054,6 +2057,8 @@ HWTEST2_F(KernelImpPatchBindlessTest, GivenKernelImpWhenSetSurfaceStateBindlessT auto &arg = const_cast(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as()); arg.bindless = 0x40; arg.bindful = undefined; + const_cast(mockKernel.kernelImmData->getDescriptor()).kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + const_cast(mockKernel.kernelImmData->getDescriptor()).kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(neoDevice->getMemoryManager(), neoDevice->getNumGenericSubDevices() > 1, @@ -2076,6 +2081,59 @@ HWTEST2_F(KernelImpPatchBindlessTest, GivenKernelImpWhenSetSurfaceStateBindlessT auto surfaceStateAfter = *reinterpret_cast(expectedSsInHeap.ssPtr); EXPECT_FALSE(memcmp(&surfaceStateAfter, &surfaceStateBefore, size) == 0); + EXPECT_TRUE(mockKernel.isBindlessOffsetSet[0]); + EXPECT_FALSE(mockKernel.usingSurfaceStateHeap[0]); +} + +HWTEST2_F(KernelImpPatchBindlessTest, GivenMisalignedBufferAddressWhenSettingSurfaceStateThenSurfaceStateInKernelHeapIsUsed, MatchAny) { + using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE; + + ze_kernel_desc_t desc = {}; + desc.pKernelName = kernelName.c_str(); + + WhiteBoxKernelHw mockKernel; + mockKernel.module = module.get(); + mockKernel.initialize(&desc); + auto &arg = const_cast(mockKernel.kernelImmData->getDescriptor().payloadMappings.explicitArgs[0].template as()); + arg.bindless = 0x40; + arg.bindful = undefined; + const_cast(mockKernel.kernelImmData->getDescriptor()).kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + const_cast(mockKernel.kernelImmData->getDescriptor()).kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + const_cast(mockKernel.kernelImmData->getDescriptor()).initBindlessOffsetToSurfaceState(); + + neoDevice->getExecutionEnvironment()->rootDeviceEnvironments[neoDevice->getRootDeviceIndex()]->createBindlessHeapsHelper(neoDevice->getMemoryManager(), + neoDevice->getNumGenericSubDevices() > 1, + neoDevice->getRootDeviceIndex(), + neoDevice->getDeviceBitfield()); + + auto &gfxCoreHelper = device->getGfxCoreHelper(); + size_t size = gfxCoreHelper.getRenderSurfaceStateSize(); + uint64_t gpuAddress = 0x2000; + void *buffer = reinterpret_cast(gpuAddress); + + NEO::MockGraphicsAllocation mockAllocation(buffer, gpuAddress, size); + auto expectedSsInHeap = device->getNEODevice()->getBindlessHeapsHelper()->allocateSSInHeap(size, &mockAllocation, NEO::BindlessHeapsHelper::GLOBAL_SSH); + mockAllocation.setBindlessInfo(expectedSsInHeap); + + memset(expectedSsInHeap.ssPtr, 0, size); + + EXPECT_EQ(0u, mockKernel.getSurfaceStateHeapDataSize()); + EXPECT_FALSE(mockKernel.isBindlessOffsetSet[0]); + EXPECT_FALSE(mockKernel.usingSurfaceStateHeap[0]); + + mockKernel.setBufferSurfaceState(0, buffer, &mockAllocation); + auto surfaceStateBefore = *reinterpret_cast(expectedSsInHeap.ssPtr); + + mockKernel.setBufferSurfaceState(0, ptrOffset(buffer, 8), &mockAllocation); + auto surfaceStateAfter = *reinterpret_cast(expectedSsInHeap.ssPtr); + auto surfaceStateOnSsh = *reinterpret_cast(mockKernel.surfaceStateHeapData.get()); + + EXPECT_TRUE(memcmp(&surfaceStateAfter, &surfaceStateBefore, size) == 0); + + EXPECT_EQ(reinterpret_cast(ptrOffset(buffer, 8)), surfaceStateOnSsh.getSurfaceBaseAddress()); + EXPECT_FALSE(mockKernel.isBindlessOffsetSet[0]); + EXPECT_TRUE(mockKernel.usingSurfaceStateHeap[0]); + EXPECT_EQ(mockKernel.surfaceStateHeapDataSize, mockKernel.getSurfaceStateHeapDataSize()); } HWTEST2_F(KernelImpPatchBindlessTest, GivenKernelImpWhenSetSurfaceStateBindfulThenSurfaceStateNotUpdated, MatchAny) { @@ -2481,6 +2539,7 @@ HWTEST2_F(SetKernelArg, givenImageAndBindlessKernelWhenSetArgImageThenCopySurfac auto expectedSsInHeap = imageHW->getAllocation()->getBindlessInfo(); EXPECT_EQ(imageHW->passedSurfaceStateHeap, expectedSsInHeap.ssPtr); EXPECT_EQ(imageHW->passedSurfaceStateOffset, 0u); + EXPECT_TRUE(kernel->isBindlessOffsetSet[3]); } HWTEST2_F(SetKernelArg, givenBindlessKernelAndNoAvailableSpaceOnSshWhenSetArgImageCalledThenOutOfMemoryErrorReturned, ImageSupport) { @@ -2547,6 +2606,7 @@ HWTEST2_F(SetKernelArg, givenImageBindlessKernelAndGlobalBindlessHelperWhenSetAr auto expectedSsInHeap = imageHW->getAllocation()->getBindlessInfo(); EXPECT_EQ(imageHW->passedRedescribedSurfaceStateHeap, ptrOffset(expectedSsInHeap.ssPtr, surfaceStateSize)); EXPECT_EQ(imageHW->passedRedescribedSurfaceStateOffset, 0u); + EXPECT_TRUE(kernel->isBindlessOffsetSet[3]); } HWTEST2_F(SetKernelArg, givenImageAndBindlessKernelWhenSetArgRedescribedImageCalledThenCopySurfaceStateToSSHCalledWithCorrectArgs, ImageSupport) { @@ -3265,6 +3325,9 @@ TEST_F(BindlessKernelTest, givenBindlessKernelWhenPatchingCrossThreadDataThenCor argDescriptor2.as().stateless = 2 * sizeof(uint64_t); mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor2); + mockKernel.isBindlessOffsetSet.resize(4, 0); + mockKernel.usingSurfaceStateHeap.resize(4, 0); + mockKernel.descriptor.initBindlessOffsetToSurfaceState(); mockKernel.crossThreadData = std::make_unique(4 * sizeof(uint64_t)); @@ -3287,6 +3350,59 @@ TEST_F(BindlessKernelTest, givenBindlessKernelWhenPatchingCrossThreadDataThenCor EXPECT_EQ(patchValue2, crossThreadData[1]); EXPECT_EQ(0u, crossThreadData[3]); } + +TEST_F(BindlessKernelTest, givenBindlessKernelWithPatchedBindlessOffsetsWhenPatchingCrossThreadDataThenMemoryIsNotPatched) { + Mock mockModule(this->device, nullptr); + Mock mockKernel; + mockKernel.module = &mockModule; + + mockKernel.descriptor.kernelAttributes.bufferAddressingMode = NEO::KernelDescriptor::BindlessAndStateless; + mockKernel.descriptor.kernelAttributes.imageAddressingMode = NEO::KernelDescriptor::Bindless; + + auto argDescriptor = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor.as() = NEO::ArgDescPointer(); + argDescriptor.as().bindful = NEO::undefined; + argDescriptor.as().bindless = 0x0; + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor); + + auto argDescriptorImg = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTImage); + argDescriptorImg.as() = NEO::ArgDescImage(); + argDescriptorImg.as().bindful = NEO::undefined; + argDescriptorImg.as().bindless = sizeof(uint64_t); + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptorImg); + + auto argDescriptor2 = NEO::ArgDescriptor(NEO::ArgDescriptor::ArgTPointer); + argDescriptor2.as() = NEO::ArgDescPointer(); + argDescriptor2.as().bindful = NEO::undefined; + argDescriptor2.as().stateless = 2 * sizeof(uint64_t); + mockKernel.descriptor.payloadMappings.explicitArgs.push_back(argDescriptor2); + + mockKernel.isBindlessOffsetSet.resize(4, 1); + mockKernel.isBindlessOffsetSet[1] = false; + + mockKernel.descriptor.initBindlessOffsetToSurfaceState(); + + mockKernel.crossThreadData = std::make_unique(4 * sizeof(uint64_t)); + mockKernel.crossThreadDataSize = 4 * sizeof(uint64_t); + memset(mockKernel.crossThreadData.get(), 0, mockKernel.crossThreadDataSize); + + const uint64_t baseAddress = 0x1000; + auto &gfxCoreHelper = this->device->getGfxCoreHelper(); + auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize(); + + auto patchValue2 = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast(baseAddress + surfaceStateSize)); + + mockKernel.patchBindlessOffsetsInCrossThreadData(baseAddress); + + auto crossThreadData = std::make_unique(mockKernel.crossThreadDataSize / sizeof(uint64_t)); + memcpy(crossThreadData.get(), mockKernel.crossThreadData.get(), mockKernel.crossThreadDataSize); + + EXPECT_EQ(0u, crossThreadData[0]); + EXPECT_EQ(patchValue2, crossThreadData[1]); + EXPECT_EQ(0u, crossThreadData[2]); + EXPECT_EQ(0u, crossThreadData[3]); +} + TEST_F(BindlessKernelTest, givenNoEntryInBindlessOffsetsMapWhenPatchingCrossThreadDataThenMemoryIsNotPatched) { Mock mockModule(this->device, nullptr); Mock mockKernel; diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index a9b9925683..b563d1ec70 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -115,12 +115,16 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } } else { bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr; - if (!globalBindlessSsh && args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { + if (args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { auto ssh = args.surfaceStateHeap; if (ssh == nullptr) { + container.prepareBindfulSsh(); ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); } uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); + if (globalBindlessSsh) { + bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress()); + } // Allocate space for new ssh data auto dstSurfaceState = ssh->getSpace(args.dispatchInterface->getSurfaceStateHeapDataSize()); memcpy_s(dstSurfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), args.dispatchInterface->getSurfaceStateHeapData(), args.dispatchInterface->getSurfaceStateHeapDataSize()); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index a3b45aac09..033599092f 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -140,12 +140,16 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } } else { bool globalBindlessSsh = args.device->getBindlessHeapsHelper() != nullptr; - if (!globalBindlessSsh && args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { + if (args.dispatchInterface->getSurfaceStateHeapDataSize() > 0u) { auto ssh = args.surfaceStateHeap; if (ssh == nullptr) { + container.prepareBindfulSsh(); ssh = container.getHeapWithRequiredSizeAndAlignment(HeapType::SURFACE_STATE, args.dispatchInterface->getSurfaceStateHeapDataSize(), BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); } uint64_t bindlessSshBaseOffset = ptrDiff(ssh->getSpace(0), ssh->getCpuBase()); + if (globalBindlessSsh) { + bindlessSshBaseOffset += ptrDiff(ssh->getGraphicsAllocation()->getGpuAddress(), ssh->getGraphicsAllocation()->getGpuBaseAddress()); + } // Allocate space for new ssh data auto dstSurfaceState = ssh->getSpace(args.dispatchInterface->getSurfaceStateHeapDataSize()); memcpy_s(dstSurfaceState, args.dispatchInterface->getSurfaceStateHeapDataSize(), args.dispatchInterface->getSurfaceStateHeapData(), args.dispatchInterface->getSurfaceStateHeapDataSize());