diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl index e5768f83c6..bdc7a9fcdb 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl @@ -48,7 +48,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K auto kernelInfo = kernelImmutableData->getKernelInfo(); commandContainer.ensureHeapSizePrepared( NEO::EncodeDispatchKernel::getSizeRequiredSsh(*kernelInfo), - NEO::EncodeDispatchKernel::getSizeRequiredDsh(*kernelInfo)); + NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, commandContainer.getNumIddPerBlock())); } appendEventForProfiling(event, true); auto perThreadScratchSize = std::max(this->getCommandListPerThreadScratchSize(), diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index e8d62c29d1..89a7bb8af0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -142,7 +142,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K auto kernelInfo = kernelImmutableData->getKernelInfo(); size_t dshSize = 0; if constexpr (GfxFamily::supportsSampler) { - dshSize = NEO::EncodeDispatchKernel::getSizeRequiredDsh(*kernelInfo); + dshSize = NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, commandContainer.getNumIddPerBlock()); } commandContainer.ensureHeapSizePrepared( NEO::EncodeDispatchKernel::getSizeRequiredSsh(*kernelInfo), diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp index ab6b6b8ed7..a0d2a3c827 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp @@ -895,7 +895,7 @@ HWTEST2_F(ImmediateCmdListSharedHeapsTest, 
givenMultipleCommandListsUsingSharedH } EXPECT_LT(0u, sshUsed); - size_t dshEstimated = NEO::EncodeDispatchKernel::getSizeRequiredDsh(*kernel->getImmutableData()->getKernelInfo()); + size_t dshEstimated = NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernel->getImmutableData()->getKernelInfo()->kernelDescriptor, 1); size_t sshEstimated = NEO::EncodeDispatchKernel::getSizeRequiredSsh(*kernel->getImmutableData()->getKernelInfo()); EXPECT_GE(dshEstimated, dshUsed); diff --git a/shared/source/command_container/cmdcontainer.cpp b/shared/source/command_container/cmdcontainer.cpp index 168c0de696..5ed391d8e7 100644 --- a/shared/source/command_container/cmdcontainer.cpp +++ b/shared/source/command_container/cmdcontainer.cpp @@ -188,10 +188,14 @@ size_t CommandContainer::getTotalCmdBufferSize() { void *CommandContainer::getHeapSpaceAllowGrow(HeapType heapType, size_t size) { - return getHeapWithRequiredSizeAndAlignment(heapType, size, 0)->getSpace(size); + return getHeapWithRequiredSize(heapType, size, 0, true)->getSpace(size); } IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment) { + return getHeapWithRequiredSize(heapType, sizeRequired, alignment, false); +} + +IndirectHeap *CommandContainer::getHeapWithRequiredSize(HeapType heapType, size_t sizeRequired, size_t alignment, bool allowGrow) { auto indirectHeap = getIndirectHeap(heapType); UNRECOVERABLE_IF(indirectHeap == nullptr); auto sizeRequested = sizeRequired; @@ -206,7 +210,9 @@ IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType hea } else { if (indirectHeap->getAvailableSpace() < sizeRequested) { size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace(); - newSize = std::max(newSize, indirectHeap->getAvailableSpace() + sizeRequested); + if (allowGrow) { + newSize = std::max(newSize, indirectHeap->getAvailableSpace() + sizeRequested); + } newSize = alignUp(newSize, MemoryConstants::pageSize); auto 
oldAlloc = getIndirectHeapAllocation(heapType); this->createAndAssignNewHeap(heapType, newSize); diff --git a/shared/source/command_container/cmdcontainer.h b/shared/source/command_container/cmdcontainer.h index 5149c3da46..56fe473c11 100644 --- a/shared/source/command_container/cmdcontainer.h +++ b/shared/source/command_container/cmdcontainer.h @@ -129,6 +129,7 @@ class CommandContainer : public NonCopyableOrMovableClass { protected: size_t getTotalCmdBufferSize(); + IndirectHeap *getHeapWithRequiredSize(HeapType heapType, size_t sizeRequired, size_t alignment, bool allowGrow); void createAndAssignNewHeap(HeapType heapType, size_t size); GraphicsAllocation *allocationIndirectHeaps[HeapType::NUM_TYPES] = {}; std::unique_ptr indirectHeaps[HeapType::NUM_TYPES]; diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index c15a83cb02..3e02fda3dc 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -89,7 +89,7 @@ struct EncodeDispatchKernel { static void setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const HardwareInfo &hwInfo); - static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset, const HardwareInfo &hwInfo); + static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset); static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels, const size_t *lws, @@ -128,9 +128,9 @@ struct EncodeDispatchKernel { static constexpr bool shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent); - static size_t getSizeRequiredDsh(const KernelInfo &kernelInfo); + static size_t getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t numIddsPerBlock); static size_t getSizeRequiredSsh(const KernelInfo &kernelInfo); - inline static uint32_t additionalSizeRequiredDsh(); + 
inline static uint32_t additionalSizeRequiredDsh(uint32_t numIddsPerBlock); }; template diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 82a54f56d8..36bf5e7f0f 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -543,9 +543,7 @@ template void EncodeSurfaceState::encodeImplicitScalingParams(const EncodeSurfaceStateArgs &args) {} template -void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset, const HardwareInfo &hwInfo) { - - using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS; +void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset) { if (container.nextIddInBlock == container.getNumIddPerBlock()) { if (ApiSpecificConfig::getBindlessConfiguration()) { @@ -557,26 +555,6 @@ void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &con sizeof(INTERFACE_DESCRIPTOR_DATA) * container.getNumIddPerBlock())); } container.nextIddInBlock = 0; - - if (container.isHeapDirty(HeapType::DYNAMIC_STATE)) { - PipeControlArgs syncArgs; - syncArgs.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); - syncArgs.hdcPipelineFlush = true; - MemorySynchronizationCommands::addSingleBarrier(*container.getCommandStream(), syncArgs); - - STATE_BASE_ADDRESS sba; - EncodeStateBaseAddressArgs encodeStateBaseAddressArgs = { - &container, - sba, - 0, - false, - false, - false}; - EncodeStateBaseAddress::encode(encodeStateBaseAddressArgs); - container.setDirtyStateForAllHeaps(false); - } - - EncodeMediaInterfaceDescriptorLoad::encode(container); } iddOffset = container.nextIddInBlock; @@ -745,17 +723,17 @@ template constexpr bool EncodeDispatchKernel::shouldUpdateGlobalAtomics(bool &currentVal, bool refVal, bool updateCurrent) { return false; } template -size_t EncodeDispatchKernel::getSizeRequiredDsh(const KernelInfo 
&kernelInfo) { +size_t EncodeDispatchKernel::getSizeRequiredDsh(const KernelDescriptor &kernelDescriptor, uint32_t numIddsPerBlock) { using INTERFACE_DESCRIPTOR_DATA = typename Family::INTERFACE_DESCRIPTOR_DATA; constexpr auto samplerStateSize = sizeof(typename Family::SAMPLER_STATE); - const auto numSamplers = kernelInfo.kernelDescriptor.payloadMappings.samplerTable.numSamplers; - const auto additionalDshSize = additionalSizeRequiredDsh(); + const auto numSamplers = kernelDescriptor.payloadMappings.samplerTable.numSamplers; + const auto additionalDshSize = additionalSizeRequiredDsh(numIddsPerBlock); if (numSamplers == 0U) { return alignUp(additionalDshSize, EncodeStates::alignInterfaceDescriptorData); } - size_t size = kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset - - kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor; + size_t size = kernelDescriptor.payloadMappings.samplerTable.tableOffset - + kernelDescriptor.payloadMappings.samplerTable.borderColor; size = alignUp(size, EncodeStates::alignIndirectStatePointer); size += numSamplers * samplerStateSize; diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index d512b5a209..6cc40edcfa 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -104,6 +104,15 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis PreemptionHelper::programInterfaceDescriptorDataPreemption(&idd, args.preemptionMode); + if (!ApiSpecificConfig::getBindlessConfiguration()) { + auto heap = container.getIndirectHeap(HeapType::DYNAMIC_STATE); + auto dshSizeRequired = NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, container.getNumIddPerBlock()); + if (heap->getAvailableSpace() <= dshSizeRequired) { + heap = container.getHeapWithRequiredSizeAndAlignment(HeapType::DYNAMIC_STATE, 
heap->getUsed() + heap->getAvailableSpace(), 0); + UNRECOVERABLE_IF(!heap); + } + } + uint32_t samplerStateOffset = 0; uint32_t samplerCount = 0; @@ -166,6 +175,9 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis args.dispatchInterface->getPerThreadData(), sizePerThreadDataForWholeGroup); } + uint32_t numIDD = 0u; + void *iddPtr = getInterfaceDescriptor(container, numIDD); + auto slmSizeNew = args.dispatchInterface->getSlmTotalSize(); bool dirtyHeaps = container.isAnyHeapDirty(); bool flush = container.slmSize != slmSizeNew || dirtyHeaps || args.requiresUncachedMocs; @@ -199,15 +211,12 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis if (container.slmSize != slmSizeNew) { EncodeL3State::encode(container, slmSizeNew != 0u); container.slmSize = slmSizeNew; - - if (container.nextIddInBlock != container.getNumIddPerBlock()) { - EncodeMediaInterfaceDescriptorLoad::encode(container); - } } } - uint32_t numIDD = 0u; - void *iddPtr = getInterfaceDescriptor(container, numIDD, hwInfo); + if (numIDD == 0 || flush) { + EncodeMediaInterfaceDescriptorLoad::encode(container); + } cmd.setIndirectDataStartAddress(static_cast(offsetThreadData)); cmd.setIndirectDataLength(sizeThreadData); @@ -545,8 +554,8 @@ template void EncodeDispatchKernel::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {} template -uint32_t EncodeDispatchKernel::additionalSizeRequiredDsh() { - return sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA); +uint32_t EncodeDispatchKernel::additionalSizeRequiredDsh(uint32_t numIddsPerBlock) { + return sizeof(typename Family::INTERFACE_DESCRIPTOR_DATA) * numIddsPerBlock; } } // namespace NEO diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 02658118a1..b9722693e0 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ 
b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -775,7 +775,7 @@ template void EncodeDispatchKernel::adjustWalkOrder(WALKER_TYPE &walkerCmd, uint32_t requiredWorkGroupOrder, const HardwareInfo &hwInfo) {} template -uint32_t EncodeDispatchKernel::additionalSizeRequiredDsh() { +uint32_t EncodeDispatchKernel::additionalSizeRequiredDsh(uint32_t) { return 0u; } diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp index de5f569b69..ff9d5b16c8 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp @@ -733,6 +733,110 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNextIddInBlockZeroWhenD ASSERT_NE(itorPC, commands.end()); } +HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersOneWhenHeapIsDirtyThenSamplerStateWasCopiedAndStateBaseAddressEncoded) { + using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD; + uint32_t numSamplers = 1; + SAMPLER_STATE samplerState; + memset(&samplerState, 2, sizeof(SAMPLER_STATE)); + + uint32_t dims[] = {2, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.numSamplers = numSamplers; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.tableOffset = 0U; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.borderColor = 0U; + const uint8_t *dshData = reinterpret_cast(&samplerState); + dispatchInterface->getDynamicStateHeapDataResult = const_cast(dshData); + + bool requiresUncachedMocs = false; + EncodeDispatchKernelArgs dispatchArgs = 
createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs); + + auto dshBeforeFlush = cmdContainer->getIndirectHeap(HeapType::DYNAMIC_STATE); + auto &kernelDescriptor = dispatchInterface->getKernelDescriptor(); + dshBeforeFlush->getSpace(dshBeforeFlush->getAvailableSpace() - NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, cmdContainer->getNumIddPerBlock())); + auto cpuBaseBeforeFlush = dshBeforeFlush->getCpuBase(); + + EncodeDispatchKernel::encode(*cmdContainer.get(), dispatchArgs, nullptr); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); + auto itorSBA = find(commands.begin(), commands.end()); + auto itorPC = find(commands.begin(), commands.end()); + EXPECT_NE(itorSBA, commands.end()); // flush needed + EXPECT_NE(itorPC, commands.end()); + + auto dshAfterFlush = cmdContainer->getIndirectHeap(HeapType::DYNAMIC_STATE); + EXPECT_NE(cpuBaseBeforeFlush, dshAfterFlush->getCpuBase()); + + auto interfaceDescriptorData = static_cast(cmdContainer->getIddBlock()); + + auto borderColorOffsetInDsh = 0; + samplerState.setIndirectStatePointer(static_cast(borderColorOffsetInDsh)); + + auto samplerStateOffset = interfaceDescriptorData->getSamplerStatePointer(); + + auto pSmplr = reinterpret_cast(ptrOffset(dshAfterFlush->getCpuBase(), samplerStateOffset)); + EXPECT_EQ(memcmp(pSmplr, &samplerState, sizeof(SAMPLER_STATE)), 0); +} + +HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNumSamplersOneAndNextIDDInBlockWhenHeapIsDirtyThenSamplerStateWasCopiedAndStateBaseAddressEncoded) { + using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD; + uint32_t 
numSamplers = 1; + SAMPLER_STATE samplerState; + memset(&samplerState, 2, sizeof(SAMPLER_STATE)); + + cmdContainer->getIndirectHeap(HeapType::DYNAMIC_STATE)->align(EncodeStates::alignInterfaceDescriptorData); + cmdContainer->setIddBlock(cmdContainer->getHeapSpaceAllowGrow(HeapType::DYNAMIC_STATE, sizeof(INTERFACE_DESCRIPTOR_DATA) * cmdContainer->getNumIddPerBlock())); + cmdContainer->nextIddInBlock = cmdContainer->getNumIddPerBlock(); + + uint32_t dims[] = {2, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.numSamplers = numSamplers; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.tableOffset = 0U; + dispatchInterface->kernelDescriptor.payloadMappings.samplerTable.borderColor = 0U; + const uint8_t *dshData = reinterpret_cast(&samplerState); + dispatchInterface->getDynamicStateHeapDataResult = const_cast(dshData); + + bool requiresUncachedMocs = false; + EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs); + + auto dshBeforeFlush = cmdContainer->getIndirectHeap(HeapType::DYNAMIC_STATE); + auto &kernelDescriptor = dispatchInterface->getKernelDescriptor(); + auto sizeRequiredMinusIDD = dshBeforeFlush->getAvailableSpace() - NEO::EncodeDispatchKernel::getSizeRequiredDsh(kernelDescriptor, cmdContainer->getNumIddPerBlock()) + sizeof(INTERFACE_DESCRIPTOR_DATA) * cmdContainer->getNumIddPerBlock(); + dshBeforeFlush->getSpace(sizeRequiredMinusIDD); + auto cpuBaseBeforeFlush = dshBeforeFlush->getCpuBase(); + auto usedBefore = cmdContainer->getIndirectHeap(HeapType::SURFACE_STATE)->getUsed(); + + EncodeDispatchKernel::encode(*cmdContainer.get(), dispatchArgs, nullptr); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); + auto itorSBA = find(commands.begin(), 
commands.end()); + auto itorPC = find(commands.begin(), commands.end()); + EXPECT_NE(itorSBA, commands.end()); // flush needed + EXPECT_NE(itorPC, commands.end()); + + auto dshAfterFlush = cmdContainer->getIndirectHeap(HeapType::DYNAMIC_STATE); + EXPECT_NE(cpuBaseBeforeFlush, dshAfterFlush->getCpuBase()); + + auto interfaceDescriptorData = static_cast(cmdContainer->getIddBlock()); + + auto borderColorOffsetInDsh = usedBefore; + samplerState.setIndirectStatePointer(static_cast(borderColorOffsetInDsh)); + + auto samplerStateOffset = interfaceDescriptorData->getSamplerStatePointer(); + + auto pSmplr = reinterpret_cast(ptrOffset(dshAfterFlush->getCpuBase(), samplerStateOffset)); + EXPECT_EQ(memcmp(pSmplr, &samplerState, sizeof(SAMPLER_STATE)), 0); +} + HWTEST_F(CommandEncodeStatesTest, givenPauseOnEnqueueSetToNeverWhenEncodingWalkerThenCommandsToPatchAreNotPresent) { DebugManagerStateRestore restorer; DebugManager.flags.PauseOnEnqueue.set(-1); @@ -1371,7 +1475,7 @@ HWTEST_F(CommandEncodeStatesTest, givenKernelInfoWhenGettingRequiredDshSpaceThen // no samplers kernelInfo.kernelDescriptor.payloadMappings.samplerTable.numSamplers = 0; - size_t size = EncodeDispatchKernel::getSizeRequiredDsh(kernelInfo); + size_t size = EncodeDispatchKernel::getSizeRequiredDsh(kernelInfo.kernelDescriptor, 1); EXPECT_EQ(expectedSize, size); // two samplers, no border color state @@ -1389,7 +1493,7 @@ HWTEST_F(CommandEncodeStatesTest, givenKernelInfoWhenGettingRequiredDshSpaceThen expectedSize = alignedSamplers; } - size = EncodeDispatchKernel::getSizeRequiredDsh(kernelInfo); + size = EncodeDispatchKernel::getSizeRequiredDsh(kernelInfo.kernelDescriptor, 1); EXPECT_EQ(expectedSize, size); // three samplers, border color state @@ -1405,7 +1509,7 @@ HWTEST_F(CommandEncodeStatesTest, givenKernelInfoWhenGettingRequiredDshSpaceThen } else { expectedSize = alignedSamplers; } - size = EncodeDispatchKernel::getSizeRequiredDsh(kernelInfo); + size = 
EncodeDispatchKernel::getSizeRequiredDsh(kernelInfo.kernelDescriptor, 1); EXPECT_EQ(expectedSize, size); }