From 73d558058c183376df1be7549de6fea8b5bb051d Mon Sep 17 00:00:00 2001 From: Zbigniew Zdanowicz Date: Tue, 2 Apr 2024 22:53:24 +0000 Subject: [PATCH] feature: add heapless and global stateless scratch address patching Related-To: NEO-10381 Signed-off-by: Zbigniew Zdanowicz --- level_zero/core/source/cmdlist/cmdlist.h | 5 ++ level_zero/core/source/cmdlist/cmdlist_hw.inl | 2 + .../cmdlist/cmdlist_hw_skl_to_tgllp.inl | 1 + .../cmdlist/cmdlist_hw_xehp_and_later.inl | 20 +++++ .../source/cmdlist/cmdlist_launch_params.h | 3 + level_zero/core/source/cmdqueue/cmdqueue_hw.h | 2 + .../core/source/cmdqueue/cmdqueue_hw.inl | 48 +++++++--- .../cmdqueue_xe_hp_core_and_later.inl | 6 ++ .../unit_tests/fixtures/cmdlist_fixture.cpp | 39 ++++++++ .../unit_tests/fixtures/cmdlist_fixture.h | 20 +++++ .../unit_tests/fixtures/cmdlist_fixture.inl | 88 +++++++++++++++++++ .../core/test/unit_tests/mocks/mock_cmdlist.h | 6 ++ .../test/unit_tests/mocks/mock_cmdqueue.h | 2 + .../sources/cmdlist/test_cmdlist_1.cpp | 2 + .../sources/cmdlist/test_cmdlist_4.cpp | 27 ++++++ .../test_cmdlist_append_launch_kernel_1.cpp | 1 + .../test_cmdlist_append_launch_kernel_3.cpp | 1 + .../command_container/command_encoder.h | 1 + .../command_encoder_xehp_and_later.inl | 23 +++-- shared/source/command_stream/CMakeLists.txt | 2 +- .../command_stream_receiver_hw_base.inl | 2 +- shared/test/common/helpers/unit_test_helper.h | 1 + .../unit_test_helper_bdw_and_later.inl | 5 ++ .../unit_test_helper_xehp_and_later.inl | 5 ++ .../libult/ult_command_stream_receiver.h | 2 +- .../command_stream_receiver_tests.cpp | 27 ++++++ .../fixtures/command_container_fixture.cpp | 1 + 27 files changed, 318 insertions(+), 24 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 13c013fddf..249b6802c9 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -360,6 +360,10 @@ struct CommandList : _ze_command_list_handle_t { return stateBaseAddressTracking; } + bool getCmdListScratchAddressPatchingEnabled() const { + return scratchAddressPatchingEnabled; + } + protected: NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize); NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed); @@ -441,6 +445,7 @@ struct CommandList : _ze_command_list_handle_t { bool useOnlyGlobalTimestamps = false; bool heaplessModeEnabled = false; bool heaplessStateInitEnabled = false; + bool scratchAddressPatchingEnabled = false; }; using CommandListAllocatorFn = CommandList *(*)(uint32_t); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index d2afce7757..f138ba0ab0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -3265,6 +3265,8 @@ void CommandListCoreFamily::clearCommandsToPatch() { case CommandToPatch::PauseOnEnqueuePipeControlEnd: UNRECOVERABLE_IF(commandToPatch.pCommand == nullptr); break; + case CommandToPatch::ComputeWalkerInlineDataScratch: + break; default: UNRECOVERABLE_IF(true); } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index acbef836b0..5dd196248b 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -219,6 +219,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K this->dcFlushSupport, // dcFlushEnable this->heaplessModeEnabled, // isHeaplessModeEnabled false, // interruptEvent + !this->scratchAddressPatchingEnabled, // immediateScratchAddressPatching }; NEO::EncodeDispatchKernel::encodeCommon(commandContainer, dispatchKernelArgs); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 7e3af11488..e630714c5f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -127,11 +127,15 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K ", SIMD: ", kernelInfo->getMaxSimdSize()); bool needScratchSpace = false; + bool kernelNeedsScratchSpace = false; for (uint32_t slotId = 0u; slotId < 2; slotId++) { commandListPerThreadScratchSize[slotId] = std::max(commandListPerThreadScratchSize[slotId], kernelDescriptor.kernelAttributes.perThreadScratchSize[slotId]); if (commandListPerThreadScratchSize[slotId] > 0) { needScratchSpace = true; } + if (kernelDescriptor.kernelAttributes.perThreadScratchSize[slotId] > 0) { + kernelNeedsScratchSpace = true; + } } if ((this->cmdListHeapAddressModel == NEO::HeapAddressModel::privateHeaps) && needScratchSpace) { @@ -342,11 +346,27 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K this->dcFlushSupport, // dcFlushEnable this->heaplessModeEnabled, // isHeaplessModeEnabled interruptEvent, // interruptEvent + !this->scratchAddressPatchingEnabled, // immediateScratchAddressPatching }; NEO::EncodeDispatchKernel::encodeCommon(commandContainer, dispatchKernelArgs); launchParams.outWalker = dispatchKernelArgs.outWalkerPtr; + if (this->heaplessModeEnabled && this->scratchAddressPatchingEnabled && kernelNeedsScratchSpace) { + CommandToPatch scratchInlineData; + scratchInlineData.pDestination = dispatchKernelArgs.outWalkerPtr; + scratchInlineData.pCommand = nullptr; + scratchInlineData.type = CommandToPatch::CommandType::ComputeWalkerInlineDataScratch; + scratchInlineData.offset = NEO::EncodeDispatchKernel::getInlineDataOffset(dispatchKernelArgs) + + kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress.offset; + scratchInlineData.patchSize = kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress.pointerSize; + auto ssh = commandContainer.getIndirectHeap(NEO::HeapType::surfaceState); + if (ssh != nullptr) { + scratchInlineData.baseAddress = ssh->getGpuBase(); + } + commandsToPatch.push_back(scratchInlineData); + } + if (!this->isFlushTaskSubmissionEnabled) { this->containsStatelessUncachedResource = dispatchKernelArgs.requiresUncachedMocs; } diff --git a/level_zero/core/source/cmdlist/cmdlist_launch_params.h b/level_zero/core/source/cmdlist/cmdlist_launch_params.h index 130838cd7b..8676b6ce04 100644 --- a/level_zero/core/source/cmdlist/cmdlist_launch_params.h +++ b/level_zero/core/source/cmdlist/cmdlist_launch_params.h @@ -30,6 +30,7 @@ struct CommandToPatch { CbEventTimestampClearStoreDataImm, CbWaitEventSemaphoreWait, CbWaitEventLoadRegisterImm, + ComputeWalkerInlineDataScratch, Invalid }; void *pDestination = nullptr; @@ -37,6 +38,8 @@ struct CommandToPatch { size_t offset = 0; CommandType type = Invalid; size_t inOrderPatchListIndex = 0; + size_t patchSize = 0; + uint64_t baseAddress = 0; }; using CommandToPatchContainer = std::vector; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index c8d423583f..bb9e8e69c5 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -114,6 +114,7 @@ struct CommandQueueHw : public CommandQueueImp { bool rtDispatchRequired = false; bool globalInit = false; bool lockScratchController = false; + bool cmdListScratchAddressPatchingEnabled = false; }; ze_result_t executeCommandListsRegularHeapless(CommandListExecutionContext &ctx, @@ -251,6 +252,7 @@ struct CommandQueueHw : public CommandQueueImp { CommandListRequiredStateChange &cmdListRequired); inline void updateBaseAddressState(CommandList *lastCommandList); inline void updateDebugSurfaceState(CommandListExecutionContext &ctx); + inline void patchCommands(CommandList &commandList, CommandListExecutionContext &ctx); size_t alignedChildStreamPadding{}; }; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 0b21247496..ad6e287d43 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -146,6 +146,10 @@ ze_result_t CommandQueueHw::executeCommandListsRegularHeapless( this->csr->getResidencyAllocations().reserve(ctx.spaceForResidency); + if (ctx.cmdListScratchAddressPatchingEnabled == true) { + this->handleScratchSpaceAndUpdateGSBAStateDirtyFlag(ctx); + } + NEO::LinearStream child(nullptr); if (const auto ret = this->makeAlignedChildStreamAndSetGpuBase(child, linearStreamSizeEstimate); ret != ZE_RESULT_SUCCESS) { return ret; @@ -168,7 +172,7 @@ ze_result_t CommandQueueHw::executeCommandListsRegularHeapless( ctx.childGpuAddressPositionBeforeDynamicPreamble = child.getCurrentGpuAddressPosition(); - this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress()); + this->patchCommands(*commandList, ctx); this->programOneCmdListBatchBufferStart(commandList, child, ctx); this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList); @@ -254,9 +258,10 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( size_t linearStreamSizeEstimate = this->estimateLinearStreamSizeInitial(ctx); - if (this->heaplessModeEnabled == false) { + if (this->heaplessModeEnabled == false || ctx.cmdListScratchAddressPatchingEnabled == true) { this->handleScratchSpaceAndUpdateGSBAStateDirtyFlag(ctx); } + this->setFrontEndStateProperties(ctx); auto neoDevice = this->device->getNEODevice(); @@ -360,7 +365,7 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( } } - this->patchCommands(*commandList, this->csr->getScratchSpaceController()->getScratchPatchAddress()); + this->patchCommands(*commandList, ctx); this->programOneCmdListBatchBufferStart(commandList, child, ctx); this->prefetchMemoryToDeviceAssociatedWithCmdList(commandList); @@ -487,9 +492,8 @@ void CommandQueueHw::programFrontEndAndClearDirtyFlag( if (!shouldFrontEndBeProgrammed) { return; } - auto scratchSpaceController = this->csr->getScratchSpaceController(); - programFrontEnd(scratchSpaceController->getScratchPatchAddress(), - scratchSpaceController->getPerThreadScratchSpaceSizeSlot0(), + programFrontEnd(ctx.scratchSpaceController->getScratchPatchAddress(), + ctx.scratchSpaceController->getPerThreadScratchSpaceSizeSlot0(), cmdStream, csrState); ctx.frontEndStateDirty = false; @@ -740,6 +744,8 @@ void CommandQueueHw::setupCmdListsAndContextParams( } this->partitionCount = std::max(this->partitionCount, commandList->getPartitionCount()); + + ctx.cmdListScratchAddressPatchingEnabled |= commandList->getCmdListScratchAddressPatchingEnabled(); } makeResidentAndMigrate(ctx.isMigrationRequested, commandContainer.getResidencyContainer()); @@ -828,15 +834,23 @@ void CommandQueueHw::handleScratchSpaceAndUpdateGSBAStateDirtyFla if (ctx.lockScratchController) { defaultCsrLock = device->getNEODevice()->getDefaultEngine().commandStreamReceiver->obtainUniqueOwnership(); } + + bool localGsbaDirty = false; + bool localFrontEndDirty = false; handleScratchSpace(this->heapContainer, ctx.scratchSpaceController, ctx.globalStatelessAllocation, - ctx.gsbaStateDirty, ctx.frontEndStateDirty, + localGsbaDirty, localFrontEndDirty, ctx.perThreadScratchSpaceSlot0Size, ctx.perThreadScratchSpaceSlot1Size); - ctx.gsbaStateDirty |= this->csr->getGSBAStateDirty(); - ctx.scratchGsba = ctx.scratchSpaceController->calculateNewGSH(); - ctx.globalInit |= ctx.gsbaStateDirty; + if (this->heaplessModeEnabled == false) { + ctx.gsbaStateDirty |= localGsbaDirty; + ctx.frontEndStateDirty |= localFrontEndDirty; + + ctx.gsbaStateDirty |= this->csr->getGSBAStateDirty(); + ctx.globalInit |= ctx.gsbaStateDirty; + } + ctx.scratchGsba = ctx.scratchSpaceController->calculateNewGSH(); } template @@ -1172,10 +1186,9 @@ void CommandQueueHw::programOneCmdListBatchBufferStartSecondaryBa }) != returnPoints.end(); if (cmdBufferHasRestarts) { while (returnPointIdx < returnPointsSize && allocation == returnPoints[returnPointIdx].currentCmdBuffer) { - auto scratchSpaceController = this->csr->getScratchSpaceController(); ctx.cmdListBeginState.frontEndState.copyPropertiesComputeDispatchAllWalkerEnableDisableEuFusion(returnPoints[returnPointIdx].configSnapshot.frontEndState); - programFrontEnd(scratchSpaceController->getScratchPatchAddress(), - scratchSpaceController->getPerThreadScratchSpaceSizeSlot0(), + programFrontEnd(ctx.scratchSpaceController->getScratchPatchAddress(), + ctx.scratchSpaceController->getPerThreadScratchSpaceSizeSlot0(), commandStream, ctx.cmdListBeginState); NEO::EncodeBatchBufferStartOrEnd::programBatchBufferStart(&commandStream, @@ -1686,4 +1699,13 @@ size_t CommandQueueHw::estimateStateBaseAddressDebugTracking() { return size; } +template +void CommandQueueHw::patchCommands(CommandList &commandList, CommandListExecutionContext &ctx) { + uint64_t scratchAddress = ctx.scratchSpaceController->getScratchPatchAddress(); + if (this->heaplessModeEnabled && this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) { + scratchAddress += ctx.globalStatelessAllocation->getGpuAddress(); + } + patchCommands(commandList, scratchAddress); +} + } // namespace L0 diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index b847dbf20f..e27cc937ee 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -228,6 +228,12 @@ void CommandQueueHw::patchCommands(CommandList &commandList, uint args); break; } + case CommandToPatch::ComputeWalkerInlineDataScratch: { + uint64_t fullScratchAddress = scratchAddress + commandToPatch.baseAddress; + void *scratchAddressPatch = ptrOffset(commandToPatch.pDestination, commandToPatch.offset); + std::memcpy(scratchAddressPatch, &fullScratchAddress, commandToPatch.patchSize); + break; + } default: UNRECOVERABLE_IF(true); } diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp index 1bda85f35b..9208aa0296 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.cpp @@ -8,9 +8,11 @@ #include "level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h" #include "shared/source/built_ins/sip.h" +#include "shared/source/command_container/cmdcontainer.h" #include "shared/source/command_container/implicit_scaling.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/ray_tracing_helper.h" +#include "shared/source/indirect_heap/indirect_heap.h" #include "shared/source/memory_manager/internal_allocation_storage.h" #include "shared/source/memory_manager/memory_manager.h" #include "shared/source/os_interface/os_interface.h" @@ -579,5 +581,42 @@ void CommandQueueThreadArbitrationPolicyFixture::tearDown() { L0::globalDriver = nullptr; } +void CommandListScratchPatchFixtureInit::setUpParams(int32_t globalStatelessMode, int32_t heaplessStateInitEnabled) { + fixtureGlobalStatelessMode = globalStatelessMode; + debugManager.flags.SelectCmdListHeapAddressModel.set(globalStatelessMode); + + ModuleMutableCommandListFixture::setUp(); + + commandList->scratchAddressPatchingEnabled = true; + commandList->heaplessModeEnabled = true; + commandList->heaplessStateInitEnabled = !!heaplessStateInitEnabled; + + commandListImmediate->heaplessModeEnabled = true; + commandListImmediate->heaplessStateInitEnabled = !!heaplessStateInitEnabled; + + commandQueue->heaplessModeEnabled = true; + commandQueue->heaplessStateInitEnabled = !!heaplessStateInitEnabled; + + mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x40; + mockKernelImmData->kernelDescriptor->payloadMappings.implicitArgs.scratchPointerAddress.pointerSize = 0x8; + mockKernelImmData->kernelDescriptor->payloadMappings.implicitArgs.scratchPointerAddress.offset = scratchInlineOffset; +} + +void CommandListScratchPatchFixtureInit::tearDown() { + ModuleMutableCommandListFixture::tearDown(); +} + +uint64_t CommandListScratchPatchFixtureInit::getSurfStateGpuBase(bool useImmediate) { + if (fixtureGlobalStatelessMode == 1) { + return device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getGlobalStatelessHeapAllocation()->getGpuAddress(); + } else { + if (useImmediate) { + return device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getIndirectHeap(NEO::surfaceState, 0).getGpuBase(); + } else { + return commandList->commandContainer.getIndirectHeap(NEO::surfaceState)->getGpuBase(); + } + } +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h index 16d7eb9f72..57daaea749 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.h @@ -362,5 +362,25 @@ struct CommandQueueThreadArbitrationPolicyFixture { L0::Device *device = nullptr; }; +struct CommandListScratchPatchFixtureInit : public ModuleMutableCommandListFixture { + void setUpParams(int32_t globalStatelessMode, int32_t heaplessStateInitEnabled); + void tearDown(); + + uint64_t getSurfStateGpuBase(bool useImmediate); + + template + void testScratchInline(bool useImmediate); + + int32_t fixtureGlobalStatelessMode = 0; + uint32_t scratchInlineOffset = 8; +}; + +template +struct CommandListScratchPatchFixture : public CommandListScratchPatchFixtureInit { + void setUp() { + CommandListScratchPatchFixtureInit::setUpParams(globalStatelessMode, heaplessStateInitEnabled); + } +}; + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl index 0dc8f5be80..b534cfb467 100644 --- a/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl +++ b/level_zero/core/test/unit_tests/fixtures/cmdlist_fixture.inl @@ -6,6 +6,9 @@ */ #include "shared/source/command_container/cmdcontainer.h" +#include "shared/source/command_container/command_encoder.h" +#include "shared/source/command_stream/command_stream_receiver.h" +#include "shared/source/command_stream/scratch_space_controller.h" #include "shared/source/command_stream/thread_arbitration_policy.h" #include "shared/source/helpers/register_offsets.h" #include "shared/source/indirect_heap/indirect_heap.h" @@ -1405,5 +1408,90 @@ void ImmediateCmdListSharedHeapsFlushTaskFixtureInit::testBody(NonKernelOperatio validateDispatchFlags(true, ultCsr.recordedImmediateDispatchFlags, ultCsr.recordedSsh); } +template +void CommandListScratchPatchFixtureInit::testScratchInline(bool useImmediate) { + auto csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver; + auto scratchController = csr->getScratchSpaceController(); + + auto ultCsr = static_cast *>(commandQueue->csr); + ultCsr->storeMakeResidentAllocations = true; + + NEO::EncodeDispatchKernelArgs dispatchKernelArgs = {}; + dispatchKernelArgs.isHeaplessModeEnabled = true; + + size_t inlineOffset = NEO::EncodeDispatchKernel::getInlineDataOffset(dispatchKernelArgs); + + uint64_t surfaceHeapGpuBase = getSurfStateGpuBase(useImmediate); + + auto scratchCmdList = static_cast(commandList.get()); + auto cmdListStream = commandList->commandContainer.getCommandStream(); + if (useImmediate) { + scratchCmdList = static_cast(commandListImmediate.get()); + cmdListStream = commandListImmediate->commandContainer.getCommandStream(); + } + + const ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + + auto result = ZE_RESULT_SUCCESS; + size_t usedBefore = cmdListStream->getUsed(); + result = scratchCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + size_t usedAfter = cmdListStream->getUsed(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( + cmdList, + ptrOffset(cmdListStream->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + auto walkerIterator = NEO::UnitTestHelper::findWalkerCmd(cmdList.begin(), cmdList.end(), true); + ASSERT_NE(cmdList.end(), walkerIterator); + void *walkerPtrWithScratch = *walkerIterator; + + mockKernelImmData->kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x0; + + usedBefore = cmdListStream->getUsed(); + result = scratchCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + usedAfter = cmdListStream->getUsed(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + cmdList.clear(); + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( + cmdList, + ptrOffset(cmdListStream->getCpuBase(), usedBefore), + usedAfter - usedBefore)); + + walkerIterator = NEO::UnitTestHelper::findWalkerCmd(cmdList.begin(), cmdList.end(), true); + ASSERT_NE(cmdList.end(), walkerIterator); + void *walkerPtrWithoutScratch = *walkerIterator; + + if (!useImmediate) { + result = commandList->close(); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + auto commandListHandle = commandList->toHandle(); + result = commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + } + + auto scratchAddress = scratchController->getScratchPatchAddress(); + auto fullScratchAddress = surfaceHeapGpuBase + scratchAddress; + + uint64_t scratchInlineValue = 0; + + void *scratchInlinePtr = ptrOffset(walkerPtrWithScratch, (inlineOffset + scratchInlineOffset)); + std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue)); + EXPECT_EQ(fullScratchAddress, scratchInlineValue); + + scratchInlinePtr = ptrOffset(walkerPtrWithoutScratch, (inlineOffset + scratchInlineOffset)); + std::memcpy(&scratchInlineValue, scratchInlinePtr, sizeof(scratchInlineValue)); + EXPECT_EQ(0u, scratchInlineValue); + + auto scratch0Allocation = scratchController->getScratchSpaceSlot0Allocation(); + bool scratchInResidency = ultCsr->isMadeResident(scratch0Allocation); + EXPECT_TRUE(scratchInResidency); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 4091531316..1f3928b16d 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -80,6 +80,8 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::getDcFlushRequired; using BaseClass::getHostPtrAlloc; using BaseClass::getInOrderIncrementValue; + using BaseClass::heaplessModeEnabled; + using BaseClass::heaplessStateInitEnabled; using BaseClass::hostPtrMap; using BaseClass::immediateCmdListHeapSharing; using BaseClass::indirectAllocationsAllowed; @@ -102,6 +104,7 @@ struct WhiteBox<::L0::CommandListCoreFamily> using BaseClass::pipelineSelectStateTracking; using BaseClass::requiredStreamState; using BaseClass::requiresQueueUncachedMocs; + using BaseClass::scratchAddressPatchingEnabled; using BaseClass::setupTimestampEventForMultiTile; using BaseClass::signalAllEventPackets; using BaseClass::stateBaseAddressTracking; @@ -273,6 +276,8 @@ struct WhiteBox<::L0::CommandListImp> : public ::L0::CommandListImp { using BaseClass::finalStreamState; using BaseClass::frontEndStateTracking; using BaseClass::getDcFlushRequired; + using BaseClass::heaplessModeEnabled; + using BaseClass::heaplessStateInitEnabled; using BaseClass::immediateCmdListHeapSharing; using BaseClass::initialize; using BaseClass::isFlushTaskSubmissionEnabled; @@ -283,6 +288,7 @@ struct WhiteBox<::L0::CommandListImp> : public ::L0::CommandListImp { using BaseClass::pipelineSelectStateTracking; using BaseClass::requiredStreamState; using BaseClass::requiresQueueUncachedMocs; + using BaseClass::scratchAddressPatchingEnabled; using BaseClass::signalAllEventPackets; using BaseClass::stateBaseAddressTracking; using BaseClass::stateComputeModeTracking; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h index 6e2c524fe1..a2f4e2c654 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h @@ -39,6 +39,8 @@ struct WhiteBox<::L0::CommandQueue> : public ::L0::CommandQueueImp { using CommandQueue::dispatchCmdListBatchBufferAsPrimary; using CommandQueue::doubleSbaWa; using CommandQueue::frontEndStateTracking; + using CommandQueue::heaplessModeEnabled; + using CommandQueue::heaplessStateInitEnabled; using CommandQueue::internalQueueForImmediateCommandList; using CommandQueue::internalUsage; using CommandQueue::partitionCount; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index d251906b9a..c2ba420ac5 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -3000,6 +3000,8 @@ TEST_F(CommandListCreate, givenCreatedCommandListWhenGettingTrackingFlagsThenDef auto expectedDispatchCmdListBatchBufferAsPrimary = L0GfxCoreHelper::dispatchCmdListBatchBufferAsPrimary(rootDeviceEnvironment, true); EXPECT_EQ(expectedDispatchCmdListBatchBufferAsPrimary, commandList->getCmdListBatchBufferFlag()); + + EXPECT_FALSE(commandList->scratchAddressPatchingEnabled); } TEST(BuiltinTypeHelperTest, givenNonStatelessAndNonHeaplessWhenAdjustBuiltinTypeIsCalledThenCorrectBuiltinTypeIsReturned) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp index 5167ca74ff..e9ca3ee087 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_4.cpp @@ -1552,5 +1552,32 @@ HWTEST_F(CommandListCreate, givenDeviceWhenCreatingCommandListForNotInternalUsag EXPECT_FALSE(whiteboxCommandList->internalUsage); whiteboxCommandList->destroy(); } + +using CommandListScratchPatchPrivateHeapsTest = Test>; +using CommandListScratchPatchGlobalStatelessHeapsTest = Test>; + +using CommandListScratchPatchPrivateHeapsStateInitTest = Test>; +using CommandListScratchPatchGlobalStatelessHeapsStateInitTest = Test>; + +HWTEST2_F(CommandListScratchPatchPrivateHeapsTest, + givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) { + testScratchInline(false); +} + +HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsTest, + givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) { + testScratchInline(false); +} + +HWTEST2_F(CommandListScratchPatchPrivateHeapsStateInitTest, + givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) { + testScratchInline(false); +} + +HWTEST2_F(CommandListScratchPatchGlobalStatelessHeapsStateInitTest, + givenHeaplessWithScratchPatchEnabledOnRegularCmdListWhenAppendingAndExecutingKernelWithScratchThenExpectCorrectAddressPatched, IsAtLeastXeHpcCore) { + testScratchInline(false); +} + } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp index 8b36dd8a9b..2986de3605 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp @@ -214,6 +214,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA commandList->getDcFlushRequired(true), // dcFlushEnable false, // isHeaplessModeEnabled false, // interruptEvent + false, // immediateScratchAddressPatching }; NEO::EncodeDispatchKernel::template encode(commandContainer, dispatchKernelArgs); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 75bc316f94..f1044109bc 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -711,6 +711,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenA commandList->getDcFlushRequired(true), // dcFlushEnable false, // isHeaplessModeEnabled false, // interruptEvent + false, // immediateScratchAddressPatching }; EXPECT_THROW(NEO::EncodeDispatchKernel::template encode(commandContainer, dispatchKernelArgs), std::exception); } diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index 4d1b693217..7559f777a9 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -73,6 +73,7 @@ struct EncodeDispatchKernelArgs { bool dcFlushEnable = false; bool isHeaplessModeEnabled = false; bool interruptEvent = false; + bool immediateScratchAddressPatching = false; bool requiresSystemMemoryFence() const { return (isHostScopeSignalEvent && isKernelUsingSystemAllocation); diff --git a/shared/source/command_container/command_encoder_xehp_and_later.inl b/shared/source/command_container/command_encoder_xehp_and_later.inl index 1facb685c9..91006d6a2b 100644 --- a/shared/source/command_container/command_encoder_xehp_and_later.inl +++ b/shared/source/command_container/command_encoder_xehp_and_later.inl @@ -332,15 +332,22 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeDis auto address = heap->getHeapGpuBase() + offsetThreadData; std::memcpy(inlineDataPointer + indirectDataPointerAddress.offset, &address, indirectDataPointerAddress.pointerSize); - auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0]; - auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1]; - auto csr = args.device->getDefaultEngine().commandStreamReceiver; - auto ssh = container.getIndirectHeap(HeapType::surfaceState); + if (args.immediateScratchAddressPatching) { + auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0]; + auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1]; + auto csr = args.device->getDefaultEngine().commandStreamReceiver; + NEO::IndirectHeap *ssh = nullptr; + if (csr->getGlobalStatelessHeapAllocation() != nullptr) { + ssh = csr->getGlobalStatelessHeap(); + } else { + ssh = args.surfaceStateHeap ? args.surfaceStateHeap : container.getIndirectHeap(HeapType::surfaceState); + } - uint64_t scratchAddress = 0u; - EncodeDispatchKernel::template setScratchAddress(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, *csr); - auto scratchPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress; - std::memcpy(inlineDataPointer + scratchPointerAddress.offset, &scratchAddress, scratchPointerAddress.pointerSize); + uint64_t scratchAddress = 0u; + EncodeDispatchKernel::template setScratchAddress(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, *csr); + auto scratchPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.scratchPointerAddress; + std::memcpy(inlineDataPointer + scratchPointerAddress.offset, &scratchAddress, scratchPointerAddress.pointerSize); + } } else { walkerCmd.setIndirectDataStartAddress(static_cast(offsetThreadData)); walkerCmd.setIndirectDataLength(sizeThreadData); diff --git a/shared/source/command_stream/CMakeLists.txt b/shared/source/command_stream/CMakeLists.txt index 23bed9fb96..1286a19478 100644 --- a/shared/source/command_stream/CMakeLists.txt +++ b/shared/source/command_stream/CMakeLists.txt @@ -87,7 +87,7 @@ if(SUPPORT_DG2_AND_LATER) ) endif() -if(NOT SUPPORT_HEAPLESS) +if(NOT SUPPORTED_HEAPLESS) list(APPEND NEO_CORE_COMMAND_STREAM ${CMAKE_CURRENT_SOURCE_DIR}/command_stream_receiver_hw_heap_addressing.inl ) diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 16e9671288..fdf3c427ea 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -303,7 +303,7 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( flushData.stateComputeModeFullConfigurationNeeded = getStateComputeModeDirty(); flushData.stateBaseAddressFullConfigurationNeeded = getGSBAStateDirty(); - if (dispatchFlags.sshCpuBase != nullptr && (this->requiredScratchSlot0Size > 0 || this->requiredScratchSlot1Size > 0)) { + if (!this->heaplessModeEnabled && dispatchFlags.sshCpuBase != nullptr && (this->requiredScratchSlot0Size > 0 || this->requiredScratchSlot1Size > 0)) { bool checkFeStateDirty = false; bool checkSbaStateDirty = false; scratchSpaceController->setRequiredScratchSpace(dispatchFlags.sshCpuBase, diff --git a/shared/test/common/helpers/unit_test_helper.h b/shared/test/common/helpers/unit_test_helper.h index a987ac1afd..c9e25eaab8 100644 --- a/shared/test/common/helpers/unit_test_helper.h +++ b/shared/test/common/helpers/unit_test_helper.h @@ -100,6 +100,7 @@ struct UnitTestHelper { static bool findStateCacheFlushPipeControl(LinearStream &csrStream); static void verifyDummyBlitWa(const RootDeviceEnvironment *rootDeviceEnvironment, GenCmdList::iterator &cmdIterator); + static GenCmdList::iterator findWalkerCmd(GenCmdList::iterator begin, GenCmdList::iterator end, bool heapless); }; } // namespace NEO diff --git a/shared/test/common/helpers/unit_test_helper_bdw_and_later.inl b/shared/test/common/helpers/unit_test_helper_bdw_and_later.inl index 50e9d80590..d904db9ec7 100644 --- a/shared/test/common/helpers/unit_test_helper_bdw_and_later.inl +++ b/shared/test/common/helpers/unit_test_helper_bdw_and_later.inl @@ -154,4 +154,9 @@ template void UnitTestHelper::verifyDummyBlitWa(const RootDeviceEnvironment *rootDeviceEnvironment, GenCmdList::iterator &cmdIterator) { } +template +GenCmdList::iterator UnitTestHelper::findWalkerCmd(GenCmdList::iterator begin, GenCmdList::iterator end, bool heapless) { + return find(begin, end); +} + } // namespace NEO diff --git a/shared/test/common/helpers/unit_test_helper_xehp_and_later.inl b/shared/test/common/helpers/unit_test_helper_xehp_and_later.inl index b4297f694a..6194669c3d 100644 --- a/shared/test/common/helpers/unit_test_helper_xehp_and_later.inl +++ b/shared/test/common/helpers/unit_test_helper_xehp_and_later.inl @@ -138,4 +138,9 @@ void UnitTestHelper::verifyDummyBlitWa(const RootDeviceEnvironment *r } } +template +GenCmdList::iterator UnitTestHelper::findWalkerCmd(GenCmdList::iterator begin, GenCmdList::iterator end, bool heapless) { + return find(begin, end); +} + } // namespace NEO diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index 1de70ded39..de9c3b5e6a 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -88,7 +88,6 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::sshState; using BaseClass::staticWorkPartitioningEnabled; using BaseClass::streamProperties; - using BaseClass::wasSubmittedToSingleSubdevice; using BaseClass::CommandStreamReceiver::activePartitions; using BaseClass::CommandStreamReceiver::activePartitionsConfig; @@ -115,6 +114,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ using BaseClass::CommandStreamReceiver::globalFenceAllocation; using BaseClass::CommandStreamReceiver::gpuHangCheckPeriod; using BaseClass::CommandStreamReceiver::gsbaFor32BitProgrammed; + using BaseClass::CommandStreamReceiver::heaplessModeEnabled; using BaseClass::CommandStreamReceiver::immWritePostSyncWriteOffset; using BaseClass::CommandStreamReceiver::initDirectSubmission; using BaseClass::CommandStreamReceiver::internalAllocationStorage; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 7cd41a5d37..e47a3088a4 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -5095,3 +5095,30 @@ HWTEST_F(CommandStreamReceiverHwHeaplessTest, whenHeaplessCommandStreamReceiverF EXPECT_ANY_THROW(csr->flushImmediateTaskStateless(commandStream, 0, csr->recordedImmediateDispatchFlags, *pDevice)); EXPECT_ANY_THROW(csr->handleImmediateFlushStatelessAllocationsResidency(0, commandStream)); } + +HWTEST2_F(CommandStreamReceiverHwTest, + givenImmediateFlushTaskInHeaplessModeWhenNextDispatchRequiresScratchSpaceThenNoScratchIsAllocated, + IsAtLeastXeHpCore) { + using CFE_STATE = typename FamilyType::CFE_STATE; + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + commandStreamReceiver.storeMakeResidentAllocations = true; + commandStreamReceiver.heaplessModeEnabled = true; + + commandStreamReceiver.flushImmediateTask(commandStream, commandStream.getUsed(), immediateFlushTaskFlags, *pDevice); + + commandStreamReceiver.setRequiredScratchSizes(0x100, 0); + + size_t usedSize = commandStreamReceiver.commandStream.getUsed(); + commandStreamReceiver.flushImmediateTask(commandStream, + commandStream.getUsed(), + immediateFlushTaskFlags, + *pDevice); + + HardwareParse hwParserCsr; + hwParserCsr.parseCommands(commandStreamReceiver.commandStream, usedSize); + auto frontEndCmd = hwParserCsr.getCommand(); + ASSERT_EQ(nullptr, frontEndCmd); + + EXPECT_EQ(nullptr, commandStreamReceiver.getScratchSpaceController()->getScratchSpaceSlot0Allocation()); +} diff --git a/shared/test/unit_test/fixtures/command_container_fixture.cpp b/shared/test/unit_test/fixtures/command_container_fixture.cpp index 74949db69f..0a8c224507 100644 --- a/shared/test/unit_test/fixtures/command_container_fixture.cpp +++ b/shared/test/unit_test/fixtures/command_container_fixture.cpp @@ -68,6 +68,7 @@ EncodeDispatchKernelArgs CommandEncodeStatesFixture::createDefaultDispatchKernel false, // dcFlushEnable false, // isHeaplessModeEnabled false, // interruptEvent + false, // immediateScratchAddressPatching }; return args;