From 1b9c510614a116e6e96cf9d83469809d8cdbfb3a Mon Sep 17 00:00:00 2001 From: "Tratnack, Geoffrey" Date: Fri, 23 Sep 2022 23:39:38 +0000 Subject: [PATCH] Update to command_encoder, fix bug changing dynamic state memory Adding ULT for encode and command container changes Refactor getHeapSpaceAllowGrow and getHeapWithRequiredSizeAndAlignment Signed-off-by: Tratnack, Geoffrey Related-To: LOCI-3365 --- .../source/command_container/cmdcontainer.cpp | 23 ++++-------- .../command_container/command_encoder.h | 2 +- .../command_container/command_encoder.inl | 22 +++++++++++- .../command_encoder_bdw_and_later.inl | 5 +-- .../command_container_tests.cpp | 20 +++++++++++ .../encoders/test_encode_dispatch_kernel.cpp | 35 ++++++++++++++++++- 6 files changed, 85 insertions(+), 22 deletions(-) diff --git a/shared/source/command_container/cmdcontainer.cpp b/shared/source/command_container/cmdcontainer.cpp index a40973263a..c2b0ed334f 100644 --- a/shared/source/command_container/cmdcontainer.cpp +++ b/shared/source/command_container/cmdcontainer.cpp @@ -86,7 +86,10 @@ CommandContainer::ErrorCode CommandContainer::initialize(Device *device, Allocat addToResidencyContainer(cmdBufferAllocation); } if (requireHeaps) { - constexpr size_t heapSize = 65536u; + size_t heapSize = 65536u; + if (DebugManager.flags.ForceDefaultHeapSize.get() != -1) { + heapSize = DebugManager.flags.ForceDefaultHeapSize.get() * MemoryConstants::kiloByte; + } heapHelper = std::unique_ptr(new HeapHelper(device->getMemoryManager(), device->getDefaultEngine().commandStreamReceiver->getInternalAllocationStorage(), device->getNumGenericSubDevices() > 1u)); for (uint32_t i = 0; i < IndirectHeap::Type::NUM_TYPES; i++) { @@ -186,22 +189,7 @@ size_t CommandContainer::getTotalCmdBufferSize() { void *CommandContainer::getHeapSpaceAllowGrow(HeapType heapType, size_t size) { - auto indirectHeap = getIndirectHeap(heapType); - - if (immediateCmdListSharedHeap(heapType)) { - UNRECOVERABLE_IF(indirectHeap == nullptr); - UNRECOVERABLE_IF(indirectHeap->getAvailableSpace() < size); - } else { - if (indirectHeap->getAvailableSpace() < size) { - size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace(); - newSize *= 2; - newSize = std::max(newSize, indirectHeap->getAvailableSpace() + size); - newSize = alignUp(newSize, MemoryConstants::pageSize); - this->createAndAssignNewHeap(heapType, newSize); - } - } - - return indirectHeap->getSpace(size); + return getHeapWithRequiredSizeAndAlignment(heapType, size, 0)->getSpace(size); } IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment) { @@ -219,6 +207,7 @@ IndirectHeap *CommandContainer::getHeapWithRequiredSizeAndAlignment(HeapType hea } else { if (indirectHeap->getAvailableSpace() < sizeRequested) { size_t newSize = indirectHeap->getUsed() + indirectHeap->getAvailableSpace(); + newSize = std::max(newSize, indirectHeap->getAvailableSpace() + sizeRequested); newSize = alignUp(newSize, MemoryConstants::pageSize); auto oldAlloc = getIndirectHeapAllocation(heapType); this->createAndAssignNewHeap(heapType, newSize); diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index dd2769350a..d9ea068d51 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -75,7 +75,7 @@ struct EncodeDispatchKernel { static void setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t numGrf, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData, const HardwareInfo &hwInfo); - static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset); + static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset, const HardwareInfo &hwInfo); static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels, const size_t *lws, diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 397f19c0d3..8e38eeefdb 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -518,7 +518,9 @@ template void EncodeSurfaceState::encodeImplicitScalingParams(const EncodeSurfaceStateArgs &args) {} template -void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset) { +void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset, const HardwareInfo &hwInfo) { + + using STATE_BASE_ADDRESS = typename Family::STATE_BASE_ADDRESS; if (container.nextIddInBlock == container.getNumIddPerBlock()) { if (ApiSpecificConfig::getBindlessConfiguration()) { @@ -531,6 +533,24 @@ void *EncodeDispatchKernel::getInterfaceDescriptor(CommandContainer &con } container.nextIddInBlock = 0; + if (container.isHeapDirty(HeapType::DYNAMIC_STATE)) { + PipeControlArgs syncArgs; + syncArgs.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); + syncArgs.hdcPipelineFlush = true; + MemorySynchronizationCommands::addSingleBarrier(*container.getCommandStream(), syncArgs); + + STATE_BASE_ADDRESS sba; + EncodeStateBaseAddressArgs encodeStateBaseAddressArgs = { + &container, + sba, + 0, + false, + false, + false}; + EncodeStateBaseAddress::encode(encodeStateBaseAddressArgs); + container.setDirtyStateForAllHeaps(false); + } + EncodeMediaInterfaceDescriptorLoad::encode(container); } diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index a11208966b..c502d91658 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -203,8 +203,7 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis } uint32_t numIDD = 0u; - void *ptr = getInterfaceDescriptor(container, numIDD); - memcpy_s(ptr, sizeof(idd), &idd, sizeof(idd)); + void *iddPtr = getInterfaceDescriptor(container, numIDD, hwInfo); cmd.setIndirectDataStartAddress(static_cast(offsetThreadData)); cmd.setIndirectDataLength(sizeThreadData); @@ -233,6 +232,8 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis auto threadGroupCount = cmd.getThreadGroupIdXDimension() * cmd.getThreadGroupIdYDimension() * cmd.getThreadGroupIdZDimension(); EncodeDispatchKernel::adjustInterfaceDescriptorData(idd, hwInfo, threadGroupCount, kernelDescriptor.kernelAttributes.numGrfRequired); + memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd)); + if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::DebugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) { void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false)); args.additionalCommands->push_back(commandBuffer); diff --git a/shared/test/unit_test/command_container/command_container_tests.cpp b/shared/test/unit_test/command_container/command_container_tests.cpp index c405bb8bb6..01120532f0 100644 --- a/shared/test/unit_test/command_container/command_container_tests.cpp +++ b/shared/test/unit_test/command_container/command_container_tests.cpp @@ -187,6 +187,26 @@ TEST_F(CommandContainerTest, givenEnabledLocalMemoryAndIsaInSystemMemoryWhenCmdC EXPECT_EQ(instructionHeapBaseAddress, cmdContainer.getInstructionHeapBaseAddress()); } +TEST_F(CommandContainerTest, givenForceDefaultHeapSizeWhenCmdContainerIsInitializedThenHeapIsCreatedWithProperSize) { + DebugManagerStateRestore restorer; + DebugManager.flags.ForceDefaultHeapSize.set(32); // in KB + + auto executionEnvironment = new NEO::ExecutionEnvironment(); + const size_t numDevices = 1; + executionEnvironment->prepareRootDeviceEnvironments(numDevices); + executionEnvironment->rootDeviceEnvironments[0]->setHwInfo(defaultHwInfo.get()); + executionEnvironment->rootDeviceEnvironments[0]->initGmm(); + + auto device = std::unique_ptr(Device::create(executionEnvironment, 0u)); + + CommandContainer cmdContainer; + auto status = cmdContainer.initialize(device.get(), nullptr, true); + EXPECT_EQ(CommandContainer::ErrorCode::SUCCESS, status); + + auto indirectHeap = cmdContainer.getIndirectHeap(IndirectHeap::Type::INDIRECT_OBJECT); + EXPECT_EQ(indirectHeap->getAvailableSpace(), 32 * MemoryConstants::kiloByte); +} + TEST_F(CommandContainerTest, givenCommandContainerDuringInitWhenAllocateGfxMemoryFailsThenErrorIsReturned) { CommandContainer cmdContainer; pDevice->executionEnvironment->memoryManager.reset(new FailMemoryManager(0, *pDevice->executionEnvironment)); diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp index 2c4b14f703..d4585d2e26 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp @@ -676,9 +676,10 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenCleanHeapsAndSlmChange EXPECT_EQ(slmSizeBefore + 1, cmdContainer->slmSize); } -HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNextIddInBlockZeorWhenDispatchKernelThenMediaInterfaceDescriptorEncoded) { +HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNextIddInBlockZeroWhenDispatchKernelThenMediaInterfaceDescriptorEncoded) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD; uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); @@ -695,7 +696,39 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNextIddInBlockZeorWhenD GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); + auto itorSBA = find(commands.begin(), commands.end()); auto itorPC = find(commands.begin(), commands.end()); + ASSERT_EQ(itorSBA, commands.end()); // no flush needed + ASSERT_NE(itorPC, commands.end()); +} + +HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, giveNextIddInBlockZeroWhenDispatchKernelAndDynamicStateHeapDirtyThenStateBaseAddressEncodedAndMediaInterfaceDescriptorEncoded) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD; + uint32_t dims[] = {2, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + + cmdContainer->getIndirectHeap(HeapType::DYNAMIC_STATE)->align(EncodeStates::alignInterfaceDescriptorData); + cmdContainer->setIddBlock(cmdContainer->getHeapSpaceAllowGrow(HeapType::DYNAMIC_STATE, sizeof(INTERFACE_DESCRIPTOR_DATA) * cmdContainer->getNumIddPerBlock())); + cmdContainer->nextIddInBlock = cmdContainer->getNumIddPerBlock(); + + // ensure heap has no available space left so that it will be reallocated and set to dirty + auto heap = cmdContainer->getIndirectHeap(HeapType::DYNAMIC_STATE); + heap->getSpace(heap->getAvailableSpace()); + + bool requiresUncachedMocs = false; + EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs); + + EncodeDispatchKernel::encode(*cmdContainer.get(), dispatchArgs, nullptr); + + GenCmdList commands; + CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); + + auto itorSBA = find(commands.begin(), commands.end()); + auto itorPC = find(commands.begin(), commands.end()); + ASSERT_NE(itorSBA, commands.end()); // flush needed ASSERT_NE(itorPC, commands.end()); }