diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 2dac2e3b99..dcf201e209 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -168,6 +168,14 @@ struct CommandList : _ze_command_list_handle_t { commandListPerThreadScratchSize = size; } + uint32_t getCommandListPerThreadPrivateScratchSize() const { + return commandListPerThreadPrivateScratchSize; + } + + void setCommandListPerThreadPrivateScratchSize(uint32_t size) { + commandListPerThreadPrivateScratchSize = size; + } + uint32_t getCommandListSLMEnable() const { return commandListSLMEnabled; } @@ -245,6 +253,7 @@ struct CommandList : _ze_command_list_handle_t { NEO::PreemptionMode commandListPreemptionMode = NEO::PreemptionMode::Initial; uint32_t cmdListType = CommandListType::TYPE_REGULAR; uint32_t commandListPerThreadScratchSize = 0u; + uint32_t commandListPerThreadPrivateScratchSize = 0u; uint32_t threadArbitrationPolicy = NEO::ThreadArbitrationPolicy::RoundRobin; uint32_t partitionCount = 1; bool isFlushTaskSubmissionEnabled = false; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 4e14d39b12..4782a4bd6e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -143,6 +143,7 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(z const auto functionImmutableData = kernel->getImmutableData(); auto &kernelDescriptor = kernel->getKernelDescriptor(); commandListPerThreadScratchSize = std::max(commandListPerThreadScratchSize, kernelDescriptor.kernelAttributes.perThreadScratchSize[0]); + commandListPerThreadPrivateScratchSize = std::max(commandListPerThreadPrivateScratchSize, kernelDescriptor.kernelAttributes.perThreadScratchSize[1]); auto functionPreemptionMode = obtainFunctionPreemptionMode(kernel); commandListPreemptionMode = std::min(commandListPreemptionMode, functionPreemptionMode); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.h b/level_zero/core/source/cmdqueue/cmdqueue_hw.h index 8233c826e2..dd7a2a5ef2 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.h @@ -48,7 +48,8 @@ struct CommandQueueHw : public CommandQueueImp { MOCKABLE_VIRTUAL void handleScratchSpace(NEO::HeapContainer &heapContainer, NEO::ScratchSpaceController *scratchController, bool &gsbaState, bool &frontEndState, - uint32_t perThreadScratchSpaceSize); + uint32_t perThreadScratchSpaceSize, + uint32_t perThreadPrivateScratchSize); bool getPreemptionCmdProgramming() override; void patchCommands(CommandList &commandList, uint64_t scratchAddress); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index df807a7481..7d79a84013 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -157,6 +157,7 @@ ze_result_t CommandQueueHw::executeCommandLists( size_t totalCmdBuffers = 0; uint32_t perThreadScratchSpaceSize = 0; + uint32_t perThreadPrivateScratchSize = 0; NEO::PageFaultManager *pageFaultManager = nullptr; if (performMigration) { pageFaultManager = device->getDriverHandle()->getMemoryManager()->getPageFaultManager(); @@ -188,11 +189,11 @@ ze_result_t CommandQueueHw::executeCommandLists( statePreemption = commandListPreemption; } - if (perThreadScratchSpaceSize < commandList->getCommandListPerThreadScratchSize()) { - perThreadScratchSpaceSize = commandList->getCommandListPerThreadScratchSize(); - } + perThreadScratchSpaceSize = std::max(perThreadScratchSpaceSize, commandList->getCommandListPerThreadScratchSize()); - if (commandList->getCommandListPerThreadScratchSize() != 0) { + perThreadPrivateScratchSize = std::max(perThreadPrivateScratchSize, commandList->getCommandListPerThreadPrivateScratchSize()); + + if (commandList->getCommandListPerThreadScratchSize() != 0 || commandList->getCommandListPerThreadPrivateScratchSize() != 0) { if (commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE) != nullptr) { heapContainer.push_back(commandList->commandContainer.getIndirectHeap(NEO::HeapType::SURFACE_STATE)->getGraphicsAllocation()); } @@ -237,7 +238,7 @@ ze_result_t CommandQueueHw::executeCommandLists( handleScratchSpace(heapContainer, scratchSpaceController, gsbaStateDirty, frontEndStateDirty, - perThreadScratchSpaceSize); + perThreadScratchSpaceSize, perThreadPrivateScratchSize); auto &streamProperties = csr->getStreamProperties(); const auto &hwInfoConfig = *NEO::HwInfoConfig::get(hwInfo.platform.eProductFamily); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl index 12570f9027..6c453c48a9 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl @@ -107,7 +107,7 @@ template void CommandQueueHw::handleScratchSpace(NEO::HeapContainer &heapContainer, NEO::ScratchSpaceController *scratchController, bool &gsbaState, bool &frontEndState, - uint32_t perThreadScratchSpaceSize) { + uint32_t perThreadScratchSpaceSize, uint32_t perThreadPrivateScratchSize) { if (perThreadScratchSpaceSize > 0) { scratchController->setRequiredScratchSpace(nullptr, 0u, perThreadScratchSpaceSize, 0u, csr->peekTaskCount(), diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index 05898d6ecc..a7b3c94b10 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -111,19 +111,27 @@ template void CommandQueueHw::handleScratchSpace(NEO::HeapContainer &sshHeaps, NEO::ScratchSpaceController *scratchController, bool &gsbaState, bool &frontEndState, - uint32_t perThreadScratchSpaceSize) { - if (perThreadScratchSpaceSize > 0) { + uint32_t perThreadScratchSpaceSize, uint32_t perThreadPrivateScratchSize) { + if (perThreadScratchSpaceSize > 0 || perThreadPrivateScratchSize > 0) { if (sshHeaps.size() > 0) { uint32_t offsetIndex = maxPtssIndex * csr->getOsContext().getEngineType() + 1u; - scratchController->programHeaps(sshHeaps, offsetIndex, perThreadScratchSpaceSize, 0u, csr->peekTaskCount(), + scratchController->programHeaps(sshHeaps, offsetIndex, perThreadScratchSpaceSize, perThreadPrivateScratchSize, csr->peekTaskCount(), csr->getOsContext(), gsbaState, frontEndState); } if (NEO::ApiSpecificConfig::getBindlessConfiguration()) { - scratchController->programBindlessSurfaceStateForScratch(device->getNEODevice()->getBindlessHeapsHelper(), perThreadScratchSpaceSize, 0u, csr->peekTaskCount(), + scratchController->programBindlessSurfaceStateForScratch(device->getNEODevice()->getBindlessHeapsHelper(), perThreadScratchSpaceSize, perThreadPrivateScratchSize, csr->peekTaskCount(), csr->getOsContext(), gsbaState, frontEndState, csr); } auto scratchAllocation = scratchController->getScratchSpaceAllocation(); - csr->makeResident(*scratchAllocation); + if (scratchAllocation != nullptr) { + csr->makeResident(*scratchAllocation); + } + + auto privateScratchAllocation = scratchController->getPrivateScratchSpaceAllocation(); + + if (privateScratchAllocation != nullptr) { + csr->makeResident(*privateScratchAllocation); + } } } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp index 90efc9bda4..1c34abb856 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp @@ -1037,6 +1037,40 @@ HWTEST_F(CmdlistAppendLaunchKernelTests, givenKernelWithoutImplicitArgsWhenAppen EXPECT_EQ(0u, sizeForImplicitArgPatching); } +HWTEST2_F(CmdlistAppendLaunchKernelTests, givenKernelWitchScratchAndPrivateWhenAppendLaunchKernelThenCmdListHasCorrectPrivateAndScratchSizesSet, IsAtLeastXeHpCore) { + std::unique_ptr mockKernelImmData = std::make_unique(0u); + auto kernelDescriptor = mockKernelImmData->kernelDescriptor; + kernelDescriptor->kernelAttributes.flags.requiresImplicitArgs = false; + kernelDescriptor->kernelAttributes.perThreadScratchSize[0] = 0x200; + kernelDescriptor->kernelAttributes.perThreadScratchSize[1] = 0x100; + createModuleFromBinary(0u, false, mockKernelImmData.get()); + + auto kernel = std::make_unique(module.get()); + + ze_kernel_desc_t kernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernel->initialize(&kernelDesc); + + EXPECT_FALSE(kernel->getKernelDescriptor().kernelAttributes.flags.requiresImplicitArgs); + EXPECT_EQ(nullptr, kernel->getImplicitArgs()); + + kernel->setGroupSize(4, 5, 6); + kernel->setGroupCount(3, 2, 1); + kernel->setGlobalOffsetExp(1, 2, 3); + kernel->patchGlobalOffset(); + + ze_result_t result{}; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, result)); + + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + ze_group_count_t groupCount = {3, 2, 1}; + result = commandList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + EXPECT_EQ(commandList->getCommandListPerThreadPrivateScratchSize(), static_cast(0x100)); + EXPECT_EQ(commandList->getCommandListPerThreadScratchSize(), static_cast(0x200)); +} + HWTEST_F(CmdlistAppendLaunchKernelTests, whenEncodingWorkDimForIndirectDispatchThenSizeIsProperlyEstimated) { Mock<::L0::Kernel> kernel; diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp index ef99b3d011..0325aebf55 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_1.cpp @@ -947,7 +947,8 @@ class MockCommandQueue : public L0::CommandQueueHw { void handleScratchSpace(NEO::HeapContainer &heapContainer, NEO::ScratchSpaceController *scratchController, bool &gsbaState, bool &frontEndState, - uint32_t perThreadScratchSpaceSize) override { + uint32_t perThreadScratchSpaceSize, + uint32_t perThreadPrivateScratchSize) override { this->mockHeapContainer = heapContainer; } @@ -984,6 +985,34 @@ HWTEST2_F(CommandQueueDestroy, givenCommandQueueAndCommandListWithSshAndScratchW alignedFree(alloc); } +using CommandQueueExecuteTest = Test; + +HWTEST2_F(CommandQueueDestroy, givenCommandQueueAndCommandListWithSshAndPrivateScratchWhenExecuteThenSshWasUsed, IsAtLeastXeHpCore) { + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + auto commandQueue = new MockCommandQueue(device, csr, &desc); + commandQueue->initialize(false, false); + auto commandList = new CommandListCoreFamily(); + commandList->initialize(device, NEO::EngineGroupType::Compute, 0u); + commandList->setCommandListPerThreadPrivateScratchSize(100u); + auto commandListHandle = commandList->toHandle(); + + void *alloc = alignedMalloc(0x100, 0x100); + NEO::GraphicsAllocation graphicsAllocation1(0, NEO::GraphicsAllocation::AllocationType::BUFFER, alloc, 0u, 0u, 1u, MemoryPool::System4KBPages, 1u); + NEO::GraphicsAllocation graphicsAllocation2(0, NEO::GraphicsAllocation::AllocationType::BUFFER, alloc, 0u, 0u, 1u, MemoryPool::System4KBPages, 1u); + + commandList->commandContainer.sshAllocations.push_back(&graphicsAllocation1); + commandList->commandContainer.sshAllocations.push_back(&graphicsAllocation2); + + commandQueue->executeCommandLists(1, &commandListHandle, nullptr, false); + + EXPECT_EQ(commandQueue->mockHeapContainer.size(), 3u); + commandQueue->destroy(); + commandList->destroy(); + alignedFree(alloc); +} + HWTEST2_F(CommandQueueDestroy, givenCommandQueueAndCommandListWithWhenBindlessEnabledThenHeapContainerIsEmpty, IsAtLeastSkl) { DebugManagerStateRestore dbgRestorer; DebugManager.flags.UseBindlessMode.set(1); @@ -1495,6 +1524,72 @@ HWTEST2_F(ExecuteCommandListTests, givenTwoCommandQueuesHavingTwoB2BCommandLists commandQueue1->destroy(); } +HWTEST2_F(ExecuteCommandListTests, givenTwoCommandQueuesHavingTwoB2BCommandListsAndWithPrivateScratchUniquePerCmdListThenCFEIsProgrammedOncePerSubmission, IsAtLeastXeHpCore) { + using CFE_STATE = typename FamilyType::CFE_STATE; + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + ze_result_t returnValue; + auto commandQueue = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + auto commandList0 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue))); + auto commandList1 = std::unique_ptr(whitebox_cast(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, 0u, returnValue))); + commandList0->setCommandListPerThreadPrivateScratchSize(0u); + commandList1->setCommandListPerThreadPrivateScratchSize(512u); + auto commandListHandle0 = commandList0->toHandle(); + auto commandListHandle1 = commandList1->toHandle(); + + commandQueue->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(0u, csr->getScratchSpaceController()->getPerThreadPrivateScratchSize()); + commandQueue->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(512u, csr->getScratchSpaceController()->getPerThreadPrivateScratchSize()); + + auto usedSpaceAfter = commandQueue->commandStream->getUsed(); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandQueue->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + auto mediaVfeStates = findAll(cmdList.begin(), cmdList.end()); + + ASSERT_EQ(2u, mediaVfeStates.size()); + + commandList0->reset(); + commandList0->setCommandListPerThreadPrivateScratchSize(1024u); + commandList1->reset(); + commandList1->setCommandListPerThreadPrivateScratchSize(2048u); + + auto commandQueue1 = whitebox_cast(CommandQueue::create(productFamily, + device, + csr, + &desc, + false, + false, + returnValue)); + commandQueue1->executeCommandLists(1, &commandListHandle0, nullptr, false); + EXPECT_EQ(1024u, csr->getScratchSpaceController()->getPerThreadPrivateScratchSize()); + commandQueue1->executeCommandLists(1, &commandListHandle1, nullptr, false); + EXPECT_EQ(2048u, csr->getScratchSpaceController()->getPerThreadPrivateScratchSize()); + + usedSpaceAfter = commandQueue1->commandStream->getUsed(); + + GenCmdList cmdList1; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList1, ptrOffset(commandQueue1->commandStream->getCpuBase(), 0), usedSpaceAfter)); + + mediaVfeStates = findAll(cmdList1.begin(), cmdList1.end()); + + ASSERT_EQ(2u, mediaVfeStates.size()); + + commandQueue->destroy(); + commandQueue1->destroy(); +} + TEST_F(CommandQueueCreate, givenOverrideCmdQueueSyncModeToDefaultWhenCommandQueueIsCreatedWithSynchronousModeThenDefaultModeIsSelected) { DebugManagerStateRestore restore; NEO::DebugManager.flags.OverrideCmdQueueSynchronousMode.set(0); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp index 12c36a1227..fab7c93e51 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp @@ -5,8 +5,10 @@ * */ +#include "shared/source/command_stream/scratch_space_controller_xehp_and_later.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" +#include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_memory_manager.h" #include "shared/test/common/mocks/mock_memory_operations_handler.h" #include "shared/test/common/mocks/ult_device_factory.h" @@ -14,9 +16,12 @@ #include "level_zero/core/test/unit_tests/fixtures/aub_csr_fixture.h" #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" +#include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h" #include "level_zero/core/test/unit_tests/mocks/mock_kernel.h" +#include "test_traits_common.h" + namespace L0 { namespace ult { @@ -597,6 +602,235 @@ HWTEST2_F(DeviceWithDualStorage, givenCmdListWithAppendedKernelAndUsmTransferAnd ASSERT_EQ(ZE_RESULT_SUCCESS, res); commandQueue->destroy(); } +using CommandQueueScratchTests = Test; + +using Platforms = IsAtLeastProduct; + +HWTEST2_F(CommandQueueScratchTests, givenCommandQueueWhenHandleScratchSpaceThenProperScratchSlotIsSetAndScratchAllocationReturned, Platforms) { + class MockScratchSpaceControllerXeHPAndLater : public NEO::ScratchSpaceControllerXeHPAndLater { + public: + uint32_t scratchSlot = 0u; + bool programHeapsCalled = false; + NEO::GraphicsAllocation *scratchAllocation = nullptr; + MockScratchSpaceControllerXeHPAndLater(uint32_t rootDeviceIndex, + NEO::ExecutionEnvironment &environment, + InternalAllocationStorage &allocationStorage) : NEO::ScratchSpaceControllerXeHPAndLater(rootDeviceIndex, environment, allocationStorage) {} + + void programHeaps(HeapContainer &heapContainer, + uint32_t scratchSlot, + uint32_t requiredPerThreadScratchSize, + uint32_t requiredPerThreadPrivateScratchSize, + uint32_t currentTaskCount, + OsContext &osContext, + bool &stateBaseAddressDirty, + bool &vfeStateDirty) override { + this->scratchSlot = scratchSlot; + programHeapsCalled = true; + } + + NEO::GraphicsAllocation *getScratchSpaceAllocation() override { + return scratchAllocation; + } + + protected: + }; + + MockCommandStreamReceiver csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr.initializeTagAllocation(); + csr.setupContext(*neoDevice->getDefaultEngine().osContext); + + NEO::ExecutionEnvironment *execEnv = static_cast(device->getExecEnvironment()); + std::unique_ptr scratchController = std::make_unique(device->getRootDeviceIndex(), + *execEnv, + *csr.getInternalAllocationStorage()); + + const ze_command_queue_desc_t desc = {}; + + std::unique_ptr commandQueue = std::make_unique>(device, &csr, &desc); + auto commandQueueHw = static_cast *>(commandQueue.get()); + + NEO::ResidencyContainer residencyContainer; + NEO::HeapContainer heapContainer; + + void *surfaceHeap = alignedMalloc(0x1000, 0x1000); + NEO::GraphicsAllocation graphicsAllocationHeap(0, NEO::GraphicsAllocation::AllocationType::BUFFER, surfaceHeap, 0u, 0u, 1u, MemoryPool::System4KBPages, 1u); + heapContainer.push_back(&graphicsAllocationHeap); + bool gsbaStateDirty = false; + bool frontEndStateDirty = false; + + NEO::GraphicsAllocation graphicsAllocation(1u, NEO::GraphicsAllocation::AllocationType::BUFFER, nullptr, 0u, 0u, 0u, MemoryPool::System4KBPages, 0u); + + auto scratch = static_cast(scratchController.get()); + scratch->scratchAllocation = &graphicsAllocation; + commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), gsbaStateDirty, frontEndStateDirty, 0x1000, 0u); + + EXPECT_TRUE(scratch->programHeapsCalled); + EXPECT_GT(csr.makeResidentCalledTimes, 0u); + + alignedFree(surfaceHeap); +} + +HWTEST2_F(CommandQueueScratchTests, givenCommandQueueWhenHandleScratchSpaceAndHeapContainerIsZeroSizeThenNoFunctionIsCalled, Platforms) { + class MockScratchSpaceControllerXeHPAndLater : public NEO::ScratchSpaceControllerXeHPAndLater { + public: + using NEO::ScratchSpaceControllerXeHPAndLater::scratchAllocation; + bool programHeapsCalled = false; + MockScratchSpaceControllerXeHPAndLater(uint32_t rootDeviceIndex, + NEO::ExecutionEnvironment &environment, + InternalAllocationStorage &allocationStorage) : NEO::ScratchSpaceControllerXeHPAndLater(rootDeviceIndex, environment, allocationStorage) {} + + void programHeaps(HeapContainer &heapContainer, + uint32_t scratchSlot, + uint32_t requiredPerThreadScratchSize, + uint32_t requiredPerThreadPrivateScratchSize, + uint32_t currentTaskCount, + OsContext &osContext, + bool &stateBaseAddressDirty, + bool &vfeStateDirty) override { + programHeapsCalled = true; + } + + protected: + }; + + MockCommandStreamReceiver csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr.initializeTagAllocation(); + csr.setupContext(*neoDevice->getDefaultEngine().osContext); + + NEO::ExecutionEnvironment *execEnv = static_cast(device->getExecEnvironment()); + std::unique_ptr scratchController = std::make_unique(device->getRootDeviceIndex(), + *execEnv, + *csr.getInternalAllocationStorage()); + + const ze_command_queue_desc_t desc = {}; + + std::unique_ptr commandQueue = std::make_unique>(device, &csr, &desc); + auto commandQueueHw = static_cast *>(commandQueue.get()); + + NEO::ResidencyContainer residencyContainer; + NEO::HeapContainer heapContainer; + bool gsbaStateDirty = false; + bool frontEndStateDirty = false; + + NEO::GraphicsAllocation graphicsAllocation(1u, NEO::GraphicsAllocation::AllocationType::BUFFER, nullptr, 0u, 0u, 0u, MemoryPool::System4KBPages, 0u); + + auto scratch = static_cast(scratchController.get()); + scratch->scratchAllocation = &graphicsAllocation; + commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), gsbaStateDirty, frontEndStateDirty, 0x1000, 0u); + + EXPECT_FALSE(scratch->programHeapsCalled); + scratch->scratchAllocation = nullptr; +} + +HWTEST2_F(CommandQueueScratchTests, givenCommandQueueWhenBindlessEnabledThenHandleScratchSpaceCallsProgramBindlessSurfaceStateForScratch, Platforms) { + DebugManagerStateRestore restorer; + DebugManager.flags.UseBindlessMode.set(1); + class MockScratchSpaceControllerXeHPAndLater : public NEO::ScratchSpaceControllerXeHPAndLater { + public: + bool programHeapsCalled = false; + NEO::MockGraphicsAllocation alloc; + MockScratchSpaceControllerXeHPAndLater(uint32_t rootDeviceIndex, + NEO::ExecutionEnvironment &environment, + InternalAllocationStorage &allocationStorage) : NEO::ScratchSpaceControllerXeHPAndLater(rootDeviceIndex, environment, allocationStorage) {} + + void programBindlessSurfaceStateForScratch(BindlessHeapsHelper *heapsHelper, + uint32_t requiredPerThreadScratchSize, + uint32_t requiredPerThreadPrivateScratchSize, + uint32_t currentTaskCount, + OsContext &osContext, + bool &stateBaseAddressDirty, + bool &vfeStateDirty, + NEO::CommandStreamReceiver *csr) override { + programHeapsCalled = true; + } + + NEO::GraphicsAllocation *getScratchSpaceAllocation() override { + return &alloc; + } + + protected: + }; + MockCsrHw2 csr(*neoDevice->getExecutionEnvironment(), 0, neoDevice->getDeviceBitfield()); + csr.initializeTagAllocation(); + csr.setupContext(*neoDevice->getDefaultEngine().osContext); + + NEO::ExecutionEnvironment *execEnv = static_cast(device->getExecEnvironment()); + std::unique_ptr scratchController = std::make_unique(device->getRootDeviceIndex(), + *execEnv, + *csr.getInternalAllocationStorage()); + const ze_command_queue_desc_t desc = {}; + + std::unique_ptr commandQueue = std::make_unique>(device, &csr, &desc); + auto commandQueueHw = static_cast *>(commandQueue.get()); + + bool gsbaStateDirty = false; + bool frontEndStateDirty = false; + NEO::ResidencyContainer residency; + NEO::HeapContainer heapContainer; + + // scratch part + commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), gsbaStateDirty, frontEndStateDirty, 0x1000, 0u); + + EXPECT_TRUE(static_cast(scratchController.get())->programHeapsCalled); + + // private part + static_cast(scratchController.get())->programHeapsCalled = false; + + commandQueueHw->handleScratchSpace(heapContainer, scratchController.get(), gsbaStateDirty, frontEndStateDirty, 0x0, 0x1000); + + EXPECT_TRUE(static_cast(scratchController.get())->programHeapsCalled); +} + +HWTEST2_F(CommandQueueScratchTests, whenPatchCommandsIsCalledThenCommandsAreCorrectlyPatched, IsAtLeastXeHpCore) { + using CFE_STATE = typename FamilyType::CFE_STATE; + + ze_command_queue_desc_t desc = {}; + NEO::CommandStreamReceiver *csr = nullptr; + device->getCsrForOrdinalAndIndex(&csr, 0u, 0u); + auto commandQueue = std::make_unique>(device, csr, &desc); + auto commandList = std::make_unique>>(); + + EXPECT_NO_THROW(commandQueue->patchCommands(*commandList, 0)); + commandList->commandsToPatch.push_back({}); + EXPECT_ANY_THROW(commandQueue->patchCommands(*commandList, 0)); + commandList->commandsToPatch.clear(); + + CFE_STATE destinationCfeStates[4]; + int32_t initialScratchAddress = 0x123400; + for (size_t i = 0; i < 4; i++) { + auto sourceCfeState = new CFE_STATE; + *sourceCfeState = FamilyType::cmdInitCfeState; + if constexpr (TestTraits::numberOfWalkersInCfeStateSupported) { + sourceCfeState->setNumberOfWalkers(2); + } + sourceCfeState->setMaximumNumberOfThreads(16); + sourceCfeState->setScratchSpaceBuffer(initialScratchAddress); + + destinationCfeStates[i] = FamilyType::cmdInitCfeState; + if constexpr (TestTraits::numberOfWalkersInCfeStateSupported) { + EXPECT_NE(destinationCfeStates[i].getNumberOfWalkers(), sourceCfeState->getNumberOfWalkers()); + } + EXPECT_NE(destinationCfeStates[i].getMaximumNumberOfThreads(), sourceCfeState->getMaximumNumberOfThreads()); + + CommandList::CommandToPatch commandToPatch; + commandToPatch.pDestination = &destinationCfeStates[i]; + commandToPatch.pCommand = sourceCfeState; + commandToPatch.type = CommandList::CommandToPatch::CommandType::FrontEndState; + commandList->commandsToPatch.push_back(commandToPatch); + } + + uint64_t patchedScratchAddress = 0xABCD00; + commandQueue->patchCommands(*commandList, patchedScratchAddress); + for (size_t i = 0; i < 4; i++) { + EXPECT_EQ(patchedScratchAddress, destinationCfeStates[i].getScratchSpaceBuffer()); + auto &sourceCfeState = *reinterpret_cast(commandList->commandsToPatch[i].pCommand); + if constexpr (TestTraits::numberOfWalkersInCfeStateSupported) { + EXPECT_EQ(destinationCfeStates[i].getNumberOfWalkers(), sourceCfeState.getNumberOfWalkers()); + } + EXPECT_EQ(destinationCfeStates[i].getMaximumNumberOfThreads(), sourceCfeState.getMaximumNumberOfThreads()); + EXPECT_EQ(destinationCfeStates[i].getScratchSpaceBuffer(), sourceCfeState.getScratchSpaceBuffer()); + } +} } // namespace ult } // namespace L0 diff --git a/shared/source/command_stream/scratch_space_controller.h b/shared/source/command_stream/scratch_space_controller.h index 4c8e0c0938..c684dbd1ea 100644 --- a/shared/source/command_stream/scratch_space_controller.h +++ b/shared/source/command_stream/scratch_space_controller.h @@ -53,6 +53,9 @@ class ScratchSpaceController { inline uint32_t getPerThreadScratchSpaceSize() { return static_cast(scratchSizeBytes / computeUnitsUsedForScratch); } + inline uint32_t getPerThreadPrivateScratchSize() { + return static_cast(privateScratchSizeBytes / computeUnitsUsedForScratch); + } virtual void reserveHeap(IndirectHeap::Type heapType, IndirectHeap *&indirectHeap) = 0; virtual void programHeaps(HeapContainer &heapContainer,