From bc619fcbeca5ff634e081ae8e2e2f492137bed96 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Sat, 19 Nov 2022 18:25:04 +0000 Subject: [PATCH] Queue stall mode for RelaxedOrdering Related-To: NEO-7458 Signed-off-by: Dunajski, Bartosz --- .../source/cmdlist/cmdlist_hw_immediate.h | 4 +- .../source/cmdlist/cmdlist_hw_immediate.inl | 41 ++-- level_zero/core/source/cmdqueue/cmdqueue.cpp | 2 +- .../core/test/unit_tests/mocks/mock_cmdlist.h | 2 +- .../sources/cmdlist/test_cmdlist_1.cpp | 96 ++++++++ .../sources/cmdlist/test_cmdlist_6.cpp | 18 +- opencl/source/command_queue/enqueue_common.h | 6 +- opencl/source/helpers/task_information.cpp | 9 +- .../command_queue/blit_enqueue_1_tests.cpp | 16 +- .../linux/drm_command_stream_tests_2.cpp | 15 +- .../command_stream_receiver_hw_base.inl | 6 +- .../source/command_stream/csr_definitions.h | 69 +++--- .../command_stream/submissions_aggregator.cpp | 4 +- .../command_stream/submissions_aggregator.h | 4 +- .../direct_submission/direct_submission_hw.h | 4 + .../direct_submission_hw.inl | 29 +++ .../test/common/helpers/batch_buffer_helper.h | 11 +- .../common/helpers/dispatch_flags_helper.h | 61 +++--- .../aub_command_stream_receiver_2_tests.cpp | 18 +- .../command_stream/aub_file_stream_tests.cpp | 20 +- .../command_stream_receiver_tests.cpp | 10 + .../command_stream/compute_mode_tests.h | 2 +- .../tbx_command_stream_tests.cpp | 4 +- .../direct_submission_tests_2.cpp | 206 +++++++++++++++++- .../linux/drm_command_stream_tests_1.cpp | 46 ++-- ...and_stream_xehp_and_later_prelim_tests.cpp | 9 +- ...m_memory_manager_localmem_prelim_tests.cpp | 3 +- .../windows/device_command_stream_tests.cpp | 34 +-- 28 files changed, 549 insertions(+), 200 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h index 6b79dceffc..6d84d74503 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h +++ 
b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.h @@ -124,12 +124,12 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily::updateDispatchFlagsWithRequi } template -ze_result_t CommandListCoreFamilyImmediate::executeCommandListImmediateWithFlushTask(bool performMigration) { +ze_result_t CommandListCoreFamilyImmediate::executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds) { NEO::DispatchFlags dispatchFlags( {}, // csrDependencies nullptr, // barrierTimestampPacketNodes @@ -99,7 +99,8 @@ ze_result_t CommandListCoreFamilyImmediate::executeCommandListImm false, // useGlobalAtomics this->device->getNEODevice()->getNumGenericSubDevices() > 1, // areMultipleSubDevicesInContext false, // memoryMigrationRequired - false // textureCacheFlush + false, // textureCacheFlush + hasStallingCmds // hasStallingCmds ); this->updateDispatchFlagsWithRequiredStreamState(dispatchFlags); @@ -260,7 +261,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernel( auto ret = CommandListCoreFamily::appendLaunchKernel(kernelHandle, threadGroupDimensions, hSignalEvent, numWaitEvents, phWaitEvents, launchParams); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -273,7 +274,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchKernelInd } auto ret = CommandListCoreFamily::appendLaunchKernelIndirect(kernelHandle, pDispatchArgumentsBuffer, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -289,7 +290,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendBarrier( ret = CommandListCoreFamily::appendBarrier(hSignalEvent, numWaitEvents, phWaitEvents); this->dependenciesPresent = true; - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, true, hSignalEvent); } template @@ -323,7 +324,7 @@ ze_result_t 
CommandListCoreFamilyImmediate::appendMemoryCopy( ret = CommandListCoreFamily::appendMemoryCopy(dstptr, srcptr, size, hSignalEvent, numWaitEvents, phWaitEvents); } - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -366,7 +367,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryCopyRegio hSignalEvent, numWaitEvents, phWaitEvents); } - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -381,7 +382,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryFill(void } auto ret = CommandListCoreFamily::appendMemoryFill(ptr, pattern, patternSize, size, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -393,7 +394,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendSignalEvent(ze_ checkAvailableSpace(); } ret = CommandListCoreFamily::appendSignalEvent(hSignalEvent); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, true, hSignalEvent); } template @@ -405,7 +406,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendEventReset(ze_e checkAvailableSpace(); } ret = CommandListCoreFamily::appendEventReset(hSignalEvent); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, true, hSignalEvent); } template @@ -431,7 +432,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendPageFaultCopy(N } else { ret = CommandListCoreFamily::appendPageFaultCopy(dstAllocation, srcAllocation, size, flushHost); } - return flushImmediate(ret, false, nullptr); + return flushImmediate(ret, false, false, nullptr); } template @@ -441,7 +442,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendWaitOnEvents(ui } auto ret = CommandListCoreFamily::appendWaitOnEvents(numEvents, phWaitEvents); this->dependenciesPresent = true; - return flushImmediate(ret, true, nullptr); + 
return flushImmediate(ret, true, true, nullptr); } template @@ -454,7 +455,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendWriteGlobalTime } auto ret = CommandListCoreFamily::appendWriteGlobalTimestamp(dstptr, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, true, hSignalEvent); } template @@ -490,7 +491,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyRegion } auto ret = CommandListCoreFamily::appendImageCopyRegion(hDstImage, hSrcImage, pDstRegion, pSrcRegion, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -508,7 +509,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe auto ret = CommandListCoreFamily::appendImageCopyFromMemory(hDstImage, srcPtr, pDstRegion, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -526,7 +527,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo auto ret = CommandListCoreFamily::appendImageCopyToMemory(dstPtr, hSrcImage, pSrcRegion, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template @@ -540,7 +541,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendMemoryRangesBar checkAvailableSpace(); } auto ret = CommandListCoreFamily::appendMemoryRangesBarrier(numRanges, pRangeSizes, pRanges, hSignalEvent, numWaitEvents, phWaitEvents); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, true, hSignalEvent); } template @@ -553,14 +554,14 @@ ze_result_t CommandListCoreFamilyImmediate::appendLaunchCooperati checkAvailableSpace(); } auto ret = CommandListCoreFamily::appendLaunchCooperativeKernel(kernelHandle, launchKernelArgs, hSignalEvent, 
numWaitEvents, waitEventHandles); - return flushImmediate(ret, true, hSignalEvent); + return flushImmediate(ret, true, false, hSignalEvent); } template -ze_result_t CommandListCoreFamilyImmediate::flushImmediate(ze_result_t inputRet, bool performMigration, ze_event_handle_t hSignalEvent) { +ze_result_t CommandListCoreFamilyImmediate::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds, ze_event_handle_t hSignalEvent) { if (inputRet == ZE_RESULT_SUCCESS) { if (this->isFlushTaskSubmissionEnabled) { - inputRet = executeCommandListImmediateWithFlushTask(performMigration); + inputRet = executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds); } else { inputRet = executeCommandListImmediate(performMigration); } diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index 4bd331ca06..5b528bce9b 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -99,7 +99,7 @@ NEO::SubmissionStatus CommandQueueImp::submitBatchBuffer(size_t offset, NEO::Res NEO::BatchBuffer batchBuffer(commandStream.getGraphicsAllocation(), offset, 0, 0, nullptr, false, false, NEO::QueueThrottle::HIGH, NEO::QueueSliceCount::defaultSliceCount, - commandStream.getUsed(), &commandStream, endingCmdPtr, isCooperative); + commandStream.getUsed(), &commandStream, endingCmdPtr, isCooperative, false); commandStream.getGraphicsAllocation()->updateTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId()); commandStream.getGraphicsAllocation()->updateResidencyTaskCount(csr->peekTaskCount() + 1, csr->getOsContext().getContextId()); diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index a48007667b..5fb9355741 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -514,7 +514,7 @@ class 
MockCommandListImmediateHw : public WhiteBox<::L0::CommandListCoreFamilyIm return executeCommandListImmediateReturnValue; } - ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration) override { + ze_result_t executeCommandListImmediateWithFlushTask(bool performMigration, bool hasStallingCmds) override { ++executeCommandListImmediateWithFlushTaskCalledCount; return executeCommandListImmediateWithFlushTaskReturnValue; } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index 62addfb6dc..bc11f52f9b 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -9,6 +9,7 @@ #include "shared/source/gmm_helper/gmm_helper.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_command_stream_receiver.h" #include "shared/test/common/mocks/mock_cpu_page_fault_manager.h" #include "shared/test/common/mocks/mock_memory_manager.h" @@ -20,6 +21,8 @@ #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h" #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h" +#include "level_zero/core/test/unit_tests/mocks/mock_image.h" +#include "level_zero/core/test/unit_tests/mocks/mock_kernel.h" namespace L0 { namespace ult { @@ -972,6 +975,99 @@ TEST_F(CommandListCreate, whenCreatingImmCmdListWithSyncModeAndAppendBarrierThen commandList->appendBarrier(nullptr, 0, nullptr); } +HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingThenPassStallingCmdsInfo, IsAtLeastXeHpcCore) { + ze_command_queue_desc_t desc = {}; + desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + ze_result_t returnValue; + std::unique_ptr 
commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue)); + ASSERT_NE(nullptr, commandList); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_event_handle_t event = nullptr; + + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event)); + std::unique_ptr eventObject(L0::Event::fromHandle(event)); + + Mock<::L0::Kernel> kernel; + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + + uint8_t srcPtr[64] = {}; + uint8_t dstPtr[64] = {}; + const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U}; + + driverHandle->importExternalPointer(dstPtr, MemoryConstants::pageSize); + + auto ultCsr = static_cast *>(commandList->csr); + ultCsr->recordFlusheBatchBuffer = true; + + auto verifyFlags = [&ultCsr](ze_result_t result, bool dispatchFlag, bool bbFlag) { + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(ultCsr->recordedDispatchFlags.hasStallingCmds, dispatchFlag); + EXPECT_EQ(ultCsr->latestFlushedBatchBuffer.hasStallingCmds, bbFlag); + }; + // non-pipelined state + verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr, launchParams), false, true); + + // non-pipelined state already programmed + verifyFlags(commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr, launchParams), false, false); + + verifyFlags(commandList->appendLaunchKernelIndirect(kernel.toHandle(), &groupCount, nullptr, 0, nullptr), false, false); + + verifyFlags(commandList->appendBarrier(nullptr, 0, nullptr), true, true); + + 
verifyFlags(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, nullptr, 0, nullptr), false, false); + + verifyFlags(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, nullptr, 0, nullptr), false, false); + + verifyFlags(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, nullptr, 0, nullptr), false, false); + + verifyFlags(commandList->appendEventReset(event), true, true); + + verifyFlags(commandList->appendSignalEvent(event), true, true); + + verifyFlags(commandList->appendPageFaultCopy(kernel.getIsaAllocation(), kernel.getIsaAllocation(), 1, false), false, false); + + verifyFlags(commandList->appendWaitOnEvents(1, &event), true, true); + + verifyFlags(commandList->appendWriteGlobalTimestamp(reinterpret_cast(dstPtr), nullptr, 0, nullptr), true, true); + + if constexpr (FamilyType::supportsSampler) { + auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::CopyImageRegion); + auto mockBuiltinKernel = static_cast *>(kernel); + mockBuiltinKernel->setArgRedescribedImageCallBase = false; + + auto image = std::make_unique>>(); + ze_image_region_t imgRegion = {1, 1, 1, 1, 1, 1}; + ze_image_desc_t zeDesc = {}; + zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + image->initialize(device, &zeDesc); + + verifyFlags(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, nullptr, 0, nullptr), false, false); + + verifyFlags(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, nullptr, 0, nullptr), false, false); + + verifyFlags(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, nullptr, 0, nullptr), false, false); + } + + size_t rangeSizes = 1; + const void **ranges = reinterpret_cast(&dstPtr[0]); + verifyFlags(commandList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, nullptr, 0, nullptr), true, true); + + verifyFlags(commandList->appendLaunchCooperativeKernel(kernel.toHandle(), &groupCount, nullptr, 0, nullptr), false, false); + + 
driverHandle->releaseImportedPointer(dstPtr); +} + TEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmCmdListWithSyncModeAndAppendBarrierThenAppendBarrierReturnsDeviceLost) { DebugManagerStateRestore restorer; DebugManager.flags.EnableFlushTaskSubmission.set(1); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp index ab6b6b8ed7..408d306354 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_6.cpp @@ -73,7 +73,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus commandListImmediate.requiredStreamState.stateComputeMode.isCoherencyRequired.value = 1; commandListImmediate.requiredStreamState.stateComputeMode.largeGrfMode.value = 1; commandListImmediate.requiredStreamState.stateComputeMode.threadArbitrationPolicy.value = NEO::ThreadArbitrationPolicy::RoundRobin; - commandListImmediate.executeCommandListImmediateWithFlushTask(false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false); NEO::StateComputeModePropertiesSupport scmPropertiesSupport = {}; hwInfoConfig.fillScmPropertiesSupportStructure(scmPropertiesSupport); @@ -102,7 +102,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus commandListImmediate.requiredStreamState.stateComputeMode.isCoherencyRequired.value = 0; commandListImmediate.requiredStreamState.stateComputeMode.largeGrfMode.value = 0; commandListImmediate.requiredStreamState.stateComputeMode.threadArbitrationPolicy.value = NEO::ThreadArbitrationPolicy::AgeBased; - commandListImmediate.executeCommandListImmediateWithFlushTask(false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false); expectedLargeGrfMode = scmPropertiesSupport.largeGrfMode ? 0 : -1; expectedIsCoherencyRequired = scmPropertiesSupport.coherencyRequired ? 
0 : -1; @@ -128,7 +128,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus auto &commandListImmediate = static_cast &>(*commandList); commandListImmediate.containsAnyKernel = true; - commandListImmediate.executeCommandListImmediateWithFlushTask(false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false); EXPECT_FALSE(commandListImmediate.containsAnyKernel); } @@ -139,7 +139,7 @@ HWTEST2_F(CommandListExecuteImmediate, whenExecutingCommandListImmediateWithFlus commandList.reset(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue)); auto &commandListImmediate = static_cast &>(*commandList); - EXPECT_EQ(ZE_RESULT_SUCCESS, commandListImmediate.executeCommandListImmediateWithFlushTask(false)); + EXPECT_EQ(ZE_RESULT_SUCCESS, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false)); } HWTEST2_F(CommandListExecuteImmediate, givenOutOfHostMemoryErrorOnFlushWhenExecutingCommandListImmediateWithFlushTaskThenProperErrorIsReturned, IsAtLeastSkl) { @@ -151,7 +151,7 @@ HWTEST2_F(CommandListExecuteImmediate, givenOutOfHostMemoryErrorOnFlushWhenExecu auto &commandStreamReceiver = neoDevice->getUltCommandStreamReceiver(); commandStreamReceiver.flushReturnValue = SubmissionStatus::OUT_OF_HOST_MEMORY; - EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false)); + EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_HOST_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false)); } HWTEST2_F(CommandListExecuteImmediate, givenOutOfDeviceMemoryErrorOnFlushWhenExecutingCommandListImmediateWithFlushTaskThenProperErrorIsReturned, IsAtLeastSkl) { @@ -163,7 +163,7 @@ HWTEST2_F(CommandListExecuteImmediate, givenOutOfDeviceMemoryErrorOnFlushWhenExe auto &commandStreamReceiver = neoDevice->getUltCommandStreamReceiver(); commandStreamReceiver.flushReturnValue = SubmissionStatus::OUT_OF_MEMORY; 
- EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false)); + EXPECT_EQ(ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY, commandListImmediate.executeCommandListImmediateWithFlushTask(false, false)); } using CommandListTest = Test; @@ -351,7 +351,7 @@ HWTEST2_F(CommandListTest, givenImmediateCommandListWhenFlushImmediateThenOverri MockCommandStreamReceiver mockCommandStreamReceiver(*neoDevice->executionEnvironment, neoDevice->getRootDeviceIndex(), neoDevice->getDeviceBitfield()); cmdList.csr = event->csr; event->csr = &mockCommandStreamReceiver; - cmdList.flushImmediate(ZE_RESULT_SUCCESS, false, event->toHandle()); + cmdList.flushImmediate(ZE_RESULT_SUCCESS, false, false, event->toHandle()); EXPECT_EQ(event->csr, cmdList.csr); } @@ -772,7 +772,7 @@ HWTEST2_F(CommandListTest, givenCmdListWithIndirectAccessWhenExecutingCommandLis auto oldCommandQueue = commandList->cmdQImmediate; commandList->cmdQImmediate = &mockCommandQueue; commandListImmediate.indirectAllocationsAllowed = true; - commandListImmediate.executeCommandListImmediateWithFlushTask(false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false); EXPECT_EQ(mockCommandQueue.handleIndirectAllocationResidencyCalledTimes, 1u); commandList->cmdQImmediate = oldCommandQueue; } @@ -791,7 +791,7 @@ HWTEST2_F(CommandListTest, givenCmdListWithNoIndirectAccessWhenExecutingCommandL auto oldCommandQueue = commandList->cmdQImmediate; commandList->cmdQImmediate = &mockCommandQueue; commandListImmediate.indirectAllocationsAllowed = false; - commandListImmediate.executeCommandListImmediateWithFlushTask(false); + commandListImmediate.executeCommandListImmediateWithFlushTask(false, false); EXPECT_EQ(mockCommandQueue.handleIndirectAllocationResidencyCalledTimes, 0u); commandList->cmdQImmediate = oldCommandQueue; } diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index e6c0eb981b..ee7a3313b4 100644 --- 
a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -803,7 +803,8 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( useGlobalAtomics, // useGlobalAtomics kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext kernel->requiresMemoryMigration(), // memoryMigrationRequired - isTextureCacheFlushNeeded(commandType)); // textureCacheFlush + isTextureCacheFlushNeeded(commandType), // textureCacheFlush + false); // hasStallingCmds dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = systolicPipelineSelectMode; @@ -1048,7 +1049,8 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( false, // useGlobalAtomics context->containsMultipleSubDevices(rootDeviceIndex), // areMultipleSubDevicesInContext false, // memoryMigrationRequired - false); // textureCacheFlush + false, // textureCacheFlush + false); // hasStallingCmds const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 574df8ac42..282c24366d 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -79,7 +79,8 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) { false, // useGlobalAtomics false, // areMultipleSubDevicesInContext false, // memoryMigrationRequired - false); // textureCacheFlush + false, // textureCacheFlush + false); // hasStallingCmds DEBUG_BREAK_IF(taskLevel >= CompletionStamp::notReady); @@ -208,7 +209,8 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate kernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, // useGlobalAtomics kernel->areMultipleSubDevicesInContext(), // areMultipleSubDevicesInContext kernel->requiresMemoryMigration(), // 
memoryMigrationRequired - commandQueue.isTextureCacheFlushNeeded(this->commandType)); // textureCacheFlush + commandQueue.isTextureCacheFlushNeeded(this->commandType), // textureCacheFlush + false); // hasStallingCmds if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); @@ -382,7 +384,8 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate false, // useGlobalAtomics commandQueue.getContext().containsMultipleSubDevices(rootDeviceIndex), // areMultipleSubDevicesInContext false, // memoryMigrationRequired - false); // textureCacheFlush + false, // textureCacheFlush + false); // hasStallingCmds if (commandQueue.getContext().getRootDeviceIndices().size() > 1) { eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp index e88da86e22..04331b9b31 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp @@ -227,6 +227,16 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, whenFlushTagUpdateThenMiFlushDwIsFlu EXPECT_NE(cmdFound, cmdListBcs.end()); } +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, whenFlushTagUpdateThenSetStallingCmdsFlag) { + auto ultCsr = static_cast *>(bcsCsr); + + ultCsr->recordFlusheBatchBuffer = true; + + EXPECT_EQ(SubmissionStatus::SUCCESS, bcsCsr->flushTagUpdate()); + + EXPECT_TRUE(ultCsr->latestFlushedBatchBuffer.hasStallingCmds); +} + HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeBcsOutput) { using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; @@ -1412,7 +1422,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, 
givenMarkerThatFollowsCopyOperatio auto offset = mockCmdQueue->getCS(0).getUsed(); - //marker needs to program semaphore + // marker needs to program semaphore commandQueue->enqueueMarkerWithWaitList(0, nullptr, &outEvent1); auto cmdListQueue = getCmdList(mockCmdQueue->getCS(0), offset); @@ -1475,7 +1485,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenMarkerThatFollowsCopyOperatio auto ultGpgpuCsr = static_cast *>(gpgpuCsr); auto ultBcsCsr = static_cast *>(bcsCsr); - //make sure we wait for both + // make sure we wait for both clWaitForEvents(1, &outEvent1); EXPECT_EQ(ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount, ultBcsCsr->taskCount); EXPECT_EQ(ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount, ultGpgpuCsr->taskCount); @@ -1503,7 +1513,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, givenMarkerThatFollowsCopyOperatio auto ultGpgpuCsr = static_cast *>(gpgpuCsr); auto ultBcsCsr = static_cast *>(bcsCsr); - //make sure we wait for both + // make sure we wait for both clWaitForEvents(1, &outEvent2); EXPECT_EQ(ultBcsCsr->latestWaitForCompletionWithTimeoutTaskCount, ultBcsCsr->taskCount); EXPECT_EQ(ultGpgpuCsr->latestWaitForCompletionWithTimeoutTaskCount, ultGpgpuCsr->taskCount); diff --git a/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_2.cpp b/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_2.cpp index 655a8c6f31..03acd5577c 100644 --- a/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_2.cpp +++ b/opencl/test/unit_test/os_interface/linux/drm_command_stream_tests_2.cpp @@ -558,7 +558,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, GivenFlushMultipleTimesThenSucc cs.replaceGraphicsAllocation(commandBuffer); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer2{cs.getGraphicsAllocation(), 8, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + 
BatchBuffer batchBuffer2 = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer2.startOffset = 8; csr->flush(batchBuffer2, csr->getResidencyAllocations()); auto allocation = mm->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); @@ -577,7 +578,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, GivenFlushMultipleTimesThenSucc cs.replaceGraphicsAllocation(commandBuffer2); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer3{cs.getGraphicsAllocation(), 16, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer3 = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer3.startOffset = 16; csr->flush(batchBuffer3, csr->getResidencyAllocations()); csr->makeSurfacePackNonResident(csr->getResidencyAllocations(), true); mm->freeGraphicsMemory(allocation); @@ -590,7 +592,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, GivenFlushMultipleTimesThenSucc cs.replaceGraphicsAllocation(commandBuffer2); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer4{cs.getGraphicsAllocation(), 24, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer4 = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer4.startOffset = 24; csr->flush(batchBuffer4, csr->getResidencyAllocations()); } @@ -644,7 +647,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, GivenNotAlignedWhenFlushingThen CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, 
QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; csr->flush(batchBuffer, csr->getResidencyAllocations()); } @@ -661,7 +665,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamEnhancedTest, GivenCheckDrmFreeWhenFlushingTh csr->makeResident(*allocation); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; csr->flush(batchBuffer, csr->getResidencyAllocations()); csr->makeNonResident(*allocation); mm->freeGraphicsMemory(allocation); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 019f6cf308..e4e7429d3a 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -610,7 +610,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto &streamToSubmit = submitCommandStreamFromCsr ? 
commandStreamCSR : commandStreamTask; BatchBuffer batchBuffer{streamToSubmit.getGraphicsAllocation(), startOffset, chainedBatchBufferStartOffset, taskStartAddress, chainedBatchBuffer, dispatchFlags.requiresCoherency, dispatchFlags.lowPriority, dispatchFlags.throttle, dispatchFlags.sliceCount, - streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice}; + streamToSubmit.getUsed(), &streamToSubmit, bbEndLocation, dispatchFlags.useSingleSubdevice, (submitCSR || dispatchFlags.hasStallingCmds)}; streamToSubmit.getGraphicsAllocation()->updateTaskCount(this->taskCount + 1, this->osContext->getContextId()); streamToSubmit.getGraphicsAllocation()->updateResidencyTaskCount(this->taskCount + 1, this->osContext->getContextId()); @@ -1173,7 +1173,7 @@ uint32_t CommandStreamReceiverHw::flushBcsTask(const BlitPropertiesCo uint64_t taskStartAddress = commandStream.getGpuBase() + commandStreamStart; BatchBuffer batchBuffer{commandStream.getGraphicsAllocation(), commandStreamStart, 0, taskStartAddress, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, - commandStream.getUsed(), &commandStream, endingCmdPtr, false}; + commandStream.getUsed(), &commandStream, endingCmdPtr, false, false}; commandStream.getGraphicsAllocation()->updateTaskCount(newTaskCount, this->osContext->getContextId()); commandStream.getGraphicsAllocation()->updateResidencyTaskCount(newTaskCount, this->osContext->getContextId()); @@ -1285,7 +1285,7 @@ SubmissionStatus CommandStreamReceiverHw::flushSmallTask(LinearStream BatchBuffer batchBuffer{commandStreamTask.getGraphicsAllocation(), commandStreamStartTask, 0, taskStartAddress, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, - commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false}; + commandStreamTask.getUsed(), &commandStreamTask, endingCmdPtr, false, true}; this->latestSentTaskCount = taskCount + 1; auto submissionStatus = flushHandler(batchBuffer, 
getResidencyAllocations()); diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index 9264fd2b5d..887e658d1c 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -24,10 +24,10 @@ namespace NEO { struct FlushStampTrackingObj; namespace CSRequirements { -//cleanup section usually contains 1-2 pipeControls BB end and place for BB start -//that makes 16 * 2 + 4 + 8 = 40 bytes -//then command buffer is aligned to cacheline that can take up to 63 bytes -//to be sure everything fits minimal size is at 2 x cacheline. +// cleanup section usually contains 1-2 pipeControls BB end and place for BB start +// that makes 16 * 2 + 4 + 8 = 40 bytes +// then command buffer is aligned to cacheline that can take up to 63 bytes +// to be sure everything fits minimal size is at 2 x cacheline. constexpr auto minCommandQueueCommandStreamSize = 2 * MemoryConstants::cacheLineSize; constexpr auto csOverfetchSize = MemoryConstants::pageSize; @@ -56,35 +56,37 @@ struct DispatchFlags { KernelExecutionType kernelExecutionTypeP, MemoryCompressionState memoryCompressionStateP, uint64_t sliceCountP, bool blockingP, bool dcFlushP, bool useSLMP, bool guardCommandBufferWithPipeControlP, bool gsba32BitRequiredP, bool requiresCoherencyP, bool lowPriorityP, bool implicitFlushP, bool outOfOrderExecutionAllowedP, bool epilogueRequiredP, - bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush) : csrDependencies(csrDependenciesP), - barrierTimestampPacketNodes(barrierTimestampPacketNodesP), - pipelineSelectArgs(pipelineSelectArgsP), - flushStampReference(flushStampReferenceP), - throttle(throttleP), - preemptionMode(preemptionModeP), - numGrfRequired(numGrfRequiredP), - l3CacheSettings(l3CacheSettingsP), - threadArbitrationPolicy(threadArbitrationPolicyP), - 
additionalKernelExecInfo(additionalKernelExecInfoP), - kernelExecutionType(kernelExecutionTypeP), - memoryCompressionState(memoryCompressionStateP), - sliceCount(sliceCountP), - blocking(blockingP), - dcFlush(dcFlushP), - useSLM(useSLMP), - guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), - gsba32BitRequired(gsba32BitRequiredP), - requiresCoherency(requiresCoherencyP), - lowPriority(lowPriorityP), - implicitFlush(implicitFlushP), - outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), - epilogueRequired(epilogueRequiredP), - usePerDssBackedBuffer(usePerDSSbackedBufferP), - useSingleSubdevice(useSingleSubdeviceP), - useGlobalAtomics(useGlobalAtomicsP), - areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), - memoryMigrationRequired(memoryMigrationRequiredP), - textureCacheFlush(textureCacheFlush){}; + bool usePerDSSbackedBufferP, bool useSingleSubdeviceP, bool useGlobalAtomicsP, bool areMultipleSubDevicesInContextP, bool memoryMigrationRequiredP, bool textureCacheFlush, + bool hasStallingCmds) : csrDependencies(csrDependenciesP), + barrierTimestampPacketNodes(barrierTimestampPacketNodesP), + pipelineSelectArgs(pipelineSelectArgsP), + flushStampReference(flushStampReferenceP), + throttle(throttleP), + preemptionMode(preemptionModeP), + numGrfRequired(numGrfRequiredP), + l3CacheSettings(l3CacheSettingsP), + threadArbitrationPolicy(threadArbitrationPolicyP), + additionalKernelExecInfo(additionalKernelExecInfoP), + kernelExecutionType(kernelExecutionTypeP), + memoryCompressionState(memoryCompressionStateP), + sliceCount(sliceCountP), + blocking(blockingP), + dcFlush(dcFlushP), + useSLM(useSLMP), + guardCommandBufferWithPipeControl(guardCommandBufferWithPipeControlP), + gsba32BitRequired(gsba32BitRequiredP), + requiresCoherency(requiresCoherencyP), + lowPriority(lowPriorityP), + implicitFlush(implicitFlushP), + outOfOrderExecutionAllowed(outOfOrderExecutionAllowedP), + epilogueRequired(epilogueRequiredP), + 
usePerDssBackedBuffer(usePerDSSbackedBufferP), + useSingleSubdevice(useSingleSubdeviceP), + useGlobalAtomics(useGlobalAtomicsP), + areMultipleSubDevicesInContext(areMultipleSubDevicesInContextP), + memoryMigrationRequired(memoryMigrationRequiredP), + textureCacheFlush(textureCacheFlush), + hasStallingCmds(hasStallingCmds){}; CsrDependencies csrDependencies; TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; @@ -116,6 +118,7 @@ struct DispatchFlags { bool areMultipleSubDevicesInContext = false; bool memoryMigrationRequired = false; bool textureCacheFlush = false; + bool hasStallingCmds = false; bool disableEUFusion = false; }; diff --git a/shared/source/command_stream/submissions_aggregator.cpp b/shared/source/command_stream/submissions_aggregator.cpp index e5d03de430..088f925574 100644 --- a/shared/source/command_stream/submissions_aggregator.cpp +++ b/shared/source/command_stream/submissions_aggregator.cpp @@ -102,12 +102,12 @@ NEO::BatchBuffer::BatchBuffer(GraphicsAllocation *commandBufferAllocation, size_ size_t chainedBatchBufferStartOffset, uint64_t taskStartAddress, GraphicsAllocation *chainedBatchBuffer, bool requiresCoherency, bool lowPriority, QueueThrottle throttle, uint64_t sliceCount, - size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice) + size_t usedSize, LinearStream *stream, void *endCmdPtr, bool useSingleSubdevice, bool hasStallingCmds) : commandBufferAllocation(commandBufferAllocation), startOffset(startOffset), chainedBatchBufferStartOffset(chainedBatchBufferStartOffset), taskStartAddress(taskStartAddress), chainedBatchBuffer(chainedBatchBuffer), requiresCoherency(requiresCoherency), low_priority(lowPriority), throttle(throttle), sliceCount(sliceCount), - usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice) {} + usedSize(usedSize), stream(stream), endCmdPtr(endCmdPtr), useSingleSubdevice(useSingleSubdevice), hasStallingCmds(hasStallingCmds) {} 
NEO::CommandBuffer::CommandBuffer(Device &device) : device(device) { flushStamp.reset(new FlushStampTracker(false)); diff --git a/shared/source/command_stream/submissions_aggregator.h b/shared/source/command_stream/submissions_aggregator.h index 610336a0b3..154f29c9a6 100644 --- a/shared/source/command_stream/submissions_aggregator.h +++ b/shared/source/command_stream/submissions_aggregator.h @@ -31,7 +31,8 @@ struct BatchBuffer { size_t usedSize, LinearStream *stream, void *endCmdPtr, - bool useSingleSubdevice); + bool useSingleSubdevice, + bool hasStallingCmds); BatchBuffer() {} GraphicsAllocation *commandBufferAllocation = nullptr; size_t startOffset = 0u; @@ -50,6 +51,7 @@ struct BatchBuffer { void *endCmdPtr = nullptr; bool useSingleSubdevice = false; + bool hasStallingCmds = false; bool ringBufferRestartRequest = false; }; diff --git a/shared/source/direct_submission/direct_submission_hw.h b/shared/source/direct_submission/direct_submission_hw.h index bfac379227..11f649830a 100644 --- a/shared/source/direct_submission/direct_submission_hw.h +++ b/shared/source/direct_submission/direct_submission_hw.h @@ -127,6 +127,9 @@ class DirectSubmissionHw { void dispatchSwitchRingBufferSection(uint64_t nextBufferGpuAddress); size_t getSizeSwitchRingBufferSection(); + void dispatchRelaxedOrderingQueueStall(); + size_t getSizeDispatchRelaxedOrderingQueueStall(); + void dispatchTaskStoreSection(uint64_t taskStartSectionVa); MOCKABLE_VIRTUAL void preinitializeTaskStoreSection(); @@ -221,5 +224,6 @@ class DirectSubmissionHw { bool dcFlushRequired = false; bool relaxedOrderingEnabled = false; bool relaxedOrderingInitialized = false; + bool firstSubmissionAfterRingStart = true; }; } // namespace NEO diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index 402af3bee5..603fbca6e7 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ 
b/shared/source/direct_submission/direct_submission_hw.inl @@ -433,6 +433,8 @@ bool DirectSubmissionHw::startRingBuffer() { ringStart = submit(gpuStartVa, startSize); + firstSubmissionAfterRingStart = true; + return ringStart; } @@ -442,6 +444,10 @@ bool DirectSubmissionHw::stopRingBuffer() { return true; } + if (this->relaxedOrderingEnabled && !firstSubmissionAfterRingStart) { + dispatchRelaxedOrderingQueueStall(); + } + void *flushPtr = ringCommandStream.getSpace(0); Dispatcher::dispatchCacheFlush(ringCommandStream, *hwInfo, gpuVaForMiFlush); if (disableMonitorFence) { @@ -546,6 +552,9 @@ inline size_t DirectSubmissionHw::getSizeEnd() { if (disableMonitorFence) { size += Dispatcher::getSizeMonitorFence(*hwInfo); } + if (this->relaxedOrderingEnabled) { + size += getSizeDispatchRelaxedOrderingQueueStall(); + } return size; } @@ -648,6 +657,17 @@ void *DirectSubmissionHw::dispatchWorkloadSection(BatchBu return currentPosition; } +template +void DirectSubmissionHw::dispatchRelaxedOrderingQueueStall() { + LriHelper::program(&ringCommandStream, CS_GPR_R5, 1, true); + dispatchSemaphoreSection(currentQueueWorkCount, false); +} + +template +size_t DirectSubmissionHw::getSizeDispatchRelaxedOrderingQueueStall() { + return getSizeSemaphoreSection(false) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM); +} + template void DirectSubmissionHw::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) { LriHelper::program(&cmdStream, CS_GPR_R4, static_cast(returnPtr & 0xFFFF'FFFFULL), true); @@ -736,6 +756,9 @@ bool DirectSubmissionHw::dispatchCommandBuffer(BatchBuffe size_t cycleSize = getSizeSwitchRingBufferSection(); size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd(); if (this->relaxedOrderingEnabled) { + if (batchBuffer.hasStallingCmds && !firstSubmissionAfterRingStart) { + requiredMinimalSize += getSizeDispatchRelaxedOrderingQueueStall(); + } requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection() + 
RelaxedOrderingHelper::getSizeReturnPtrRegs(); } @@ -745,6 +768,10 @@ bool DirectSubmissionHw::dispatchCommandBuffer(BatchBuffe switchRingBuffers(); } + if (this->relaxedOrderingEnabled && batchBuffer.hasStallingCmds && !firstSubmissionAfterRingStart) { + dispatchRelaxedOrderingQueueStall(); + } + handleNewResourcesSubmission(); void *currentPosition = dispatchWorkloadSection(batchBuffer); @@ -771,6 +798,8 @@ bool DirectSubmissionHw::dispatchCommandBuffer(BatchBuffe uint64_t flushValue = updateTagValue(); flushStamp.setStamp(flushValue); + firstSubmissionAfterRingStart = false; + return ringStart; } diff --git a/shared/test/common/helpers/batch_buffer_helper.h b/shared/test/common/helpers/batch_buffer_helper.h index abf58aa4a3..292c4c3a82 100644 --- a/shared/test/common/helpers/batch_buffer_helper.h +++ b/shared/test/common/helpers/batch_buffer_helper.h @@ -11,11 +11,11 @@ using namespace NEO; struct BatchBufferHelper { - static BatchBuffer createDefaultBatchBuffer(GraphicsAllocation *commandBufferAllocation, LinearStream *stream, size_t usedSize) { + static BatchBuffer createDefaultBatchBuffer(GraphicsAllocation *commandBufferAllocation, LinearStream *stream, size_t usedSize, size_t chainedBatchBufferStartOffset) { return BatchBuffer( commandBufferAllocation, // commandBufferAllocation 0, // startOffset - 0, // chainedBatchBufferStartOffset + chainedBatchBufferStartOffset, // chainedBatchBufferStartOffset 0, // taskStartAddress nullptr, // chainedBatchBuffer false, // requiresCoherency @@ -25,7 +25,12 @@ struct BatchBufferHelper { usedSize, // usedSize stream, // stream nullptr, // endCmdPtr - false // useSingleSubdevice + false, // useSingleSubdevice + false // hasStallingCmds ); } + + static BatchBuffer createDefaultBatchBuffer(GraphicsAllocation *commandBufferAllocation, LinearStream *stream, size_t usedSize) { + return createDefaultBatchBuffer(commandBufferAllocation, stream, usedSize, 0); + } }; diff --git a/shared/test/common/helpers/dispatch_flags_helper.h 
b/shared/test/common/helpers/dispatch_flags_helper.h index e7725f24cc..280e1e1c37 100644 --- a/shared/test/common/helpers/dispatch_flags_helper.h +++ b/shared/test/common/helpers/dispatch_flags_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2021 Intel Corporation + * Copyright (C) 2019-2022 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -14,35 +14,36 @@ using namespace NEO; struct DispatchFlagsHelper { static DispatchFlags createDefaultDispatchFlags() { return DispatchFlags( - {}, //csrDependencies - nullptr, //barrierTimestampPacketNodes - {}, //pipelineSelectArgs - nullptr, //flushStampReference - QueueThrottle::MEDIUM, //throttle - PreemptionMode::Disabled, //preemptionMode - GrfConfig::DefaultGrfNumber, //numGrfRequired - L3CachingSettings::l3CacheOn, //l3CacheSettings - ThreadArbitrationPolicy::NotPresent, //threadArbitrationPolicy - AdditionalKernelExecInfo::NotApplicable, //additionalKernelExecInfo - KernelExecutionType::NotApplicable, //kernelExecutionType - MemoryCompressionState::NotApplicable, //memoryCompressionState - QueueSliceCount::defaultSliceCount, //sliceCount - false, //blocking - false, //dcFlush - false, //useSLM - false, //guardCommandBufferWithPipeControl - false, //gsba32BitRequired - false, //requiresCoherency - false, //lowPriority - false, //implicitFlush - false, //outOfOrderExecutionAllowed - false, //epilogueRequired - false, //usePerDssBackedBuffer - false, //useSingleSubdevice - false, //useGlobalAtomics - false, //areMultipleSubDevicesInContext - false, //memoryMigrationRequired - false //textureCacheFlush + {}, // csrDependencies + nullptr, // barrierTimestampPacketNodes + {}, // pipelineSelectArgs + nullptr, // flushStampReference + QueueThrottle::MEDIUM, // throttle + PreemptionMode::Disabled, // preemptionMode + GrfConfig::DefaultGrfNumber, // numGrfRequired + L3CachingSettings::l3CacheOn, // l3CacheSettings + ThreadArbitrationPolicy::NotPresent, // threadArbitrationPolicy + AdditionalKernelExecInfo::NotApplicable, 
// additionalKernelExecInfo + KernelExecutionType::NotApplicable, // kernelExecutionType + MemoryCompressionState::NotApplicable, // memoryCompressionState + QueueSliceCount::defaultSliceCount, // sliceCount + false, // blocking + false, // dcFlush + false, // useSLM + false, // guardCommandBufferWithPipeControl + false, // gsba32BitRequired + false, // requiresCoherency + false, // lowPriority + false, // implicitFlush + false, // outOfOrderExecutionAllowed + false, // epilogueRequired + false, // usePerDssBackedBuffer + false, // useSingleSubdevice + false, // useGlobalAtomics + false, // areMultipleSubDevicesInContext + false, // memoryMigrationRequired + false, // textureCacheFlush + false // hasStallingCmds ); } }; diff --git a/shared/test/unit_test/command_stream/aub_command_stream_receiver_2_tests.cpp b/shared/test/unit_test/command_stream/aub_command_stream_receiver_2_tests.cpp index b4477a14bf..774d30f95d 100644 --- a/shared/test/unit_test/command_stream/aub_command_stream_receiver_2_tests.cpp +++ b/shared/test/unit_test/command_stream/aub_command_stream_receiver_2_tests.cpp @@ -59,8 +59,8 @@ HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenForcedB ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, chainedBatchBuffer, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; - + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); + batchBuffer.chainedBatchBuffer = chainedBatchBuffer; size_t sizeBatchBuffer = 0xffffu; std::unique_ptr> flatBatchBuffer( @@ -84,7 +84,7 @@ HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenForcedB ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, 
QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); size_t sizeBatchBuffer = 0xffffu; @@ -111,7 +111,7 @@ HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenForcedB ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, chainedBatchBuffer, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); size_t sizeBatchBuffer = 0xffffu; @@ -133,7 +133,7 @@ HWTEST_F(FlatBatchBufferHelperAubTests, givenAubCommandStreamReceiverWhenRegiste auto aubCsr = aubExecutionEnvironment->template getCsr>(); LinearStream cs(aubExecutionEnvironment->commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); aubCsr->getFlatBatchBufferHelper().registerCommandChunk(batchBuffer, sizeof(MI_BATCH_BUFFER_START)); ASSERT_EQ(1u, aubCsr->getFlatBatchBufferHelper().getCommandChunkList().size()); @@ -227,7 +227,7 @@ HWTEST_F(FlatBatchBufferHelperAubTests, givenAubCommandStreamReceiverWhenForcedB ASSERT_EQ(3u, aubCsr->getFlatBatchBufferHelper().getPatchInfoCollection().size()); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); size_t sizeBatchBuffer = 0u; @@ -350,7 +350,7 
@@ HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenForcedF auto chainedBatchBuffer = aubExecutionEnvironment->executionEnvironment->memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize}); ASSERT_NE(nullptr, chainedBatchBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, chainedBatchBuffer, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); aubCsr->makeResident(*chainedBatchBuffer); @@ -385,7 +385,7 @@ HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenForcedF aubCsr->overwriteFlatBatchBufferHelper(mockHelper); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); aubCsr->flush(batchBuffer, allocationsForResidency); @@ -408,7 +408,7 @@ HWTEST_F(AubCommandStreamReceiverTests, givenAubCommandStreamReceiverWhenForcedF aubCsr->overwriteFlatBatchBufferHelper(mockHelper); ResidencyContainer allocationsForResidency; - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed(), 128); aubCsr->flush(batchBuffer, allocationsForResidency); diff --git a/shared/test/unit_test/command_stream/aub_file_stream_tests.cpp b/shared/test/unit_test/command_stream/aub_file_stream_tests.cpp index 0420efd1e2..7f5cad16e8 100644 --- 
a/shared/test/unit_test/command_stream/aub_file_stream_tests.cpp +++ b/shared/test/unit_test/command_stream/aub_file_stream_tests.cpp @@ -592,7 +592,7 @@ HWTEST_F(AubFileStreamTests, givenAubCommandStreamReceiverWhenFlushIsCalledThenI aubCsr.initializeTagAllocation(); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 1, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 1, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false, false}; ResidencyContainer allocationsForResidency; aubCsr.flush(batchBuffer, allocationsForResidency); @@ -761,9 +761,6 @@ HWTEST_F(AddPatchInfoCommentsAubTests, givenAddPatchInfoCommentsCalledWhenNoPatc auto aubExecutionEnvironment = getEnvironment>(false, true, true); auto aubCsr = aubExecutionEnvironment->template getCsr>(); - LinearStream cs(aubExecutionEnvironment->commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; - aubCsr->stream = mockAubFileStream.get(); bool result = aubCsr->addPatchInfoComments(); @@ -781,9 +778,6 @@ HWTEST_F(AddPatchInfoCommentsAubTests, givenAddPatchInfoCommentsCalledWhenFirstA auto aubExecutionEnvironment = getEnvironment>(false, true, true); auto aubCsr = aubExecutionEnvironment->template getCsr>(); - LinearStream cs(aubExecutionEnvironment->commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; - aubCsr->stream = mockAubFileStream.get(); mockAubFileStream->addCommentResult = false; @@ -797,9 +791,6 @@ HWTEST_F(AddPatchInfoCommentsAubTests, givenAddPatchInfoCommentsCalledWhenSecond auto 
aubExecutionEnvironment = getEnvironment>(false, true, true); auto aubCsr = aubExecutionEnvironment->template getCsr>(); - LinearStream cs(aubExecutionEnvironment->commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; - aubCsr->stream = mockAubFileStream.get(); mockAubFileStream->addCommentResults = {true, false}; @@ -814,9 +805,6 @@ HWTEST_F(AddPatchInfoCommentsAubTests, givenAddPatchInfoCommentsCalledWhenPatchI auto aubExecutionEnvironment = getEnvironment>(false, true, true); auto aubCsr = aubExecutionEnvironment->template getCsr>(); - LinearStream cs(aubExecutionEnvironment->commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; - aubCsr->stream = mockAubFileStream.get(); PatchInfoData patchInfoData[2] = {{0xAAAAAAAA, 128u, PatchInfoAllocationType::Default, 0xBBBBBBBB, 256u, PatchInfoAllocationType::Default}, @@ -879,9 +867,6 @@ HWTEST_F(AddPatchInfoCommentsAubTests, givenAddPatchInfoCommentsCalledWhenSource auto aubExecutionEnvironment = getEnvironment>(false, true, true); auto aubCsr = aubExecutionEnvironment->template getCsr>(); - LinearStream cs(aubExecutionEnvironment->commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; - aubCsr->stream = mockAubFileStream.get(); PatchInfoData patchInfoData = {0x0, 0u, PatchInfoAllocationType::Default, 0xBBBBBBBB, 0u, PatchInfoAllocationType::Default}; @@ -926,9 +911,6 @@ HWTEST_F(AddPatchInfoCommentsAubTests, givenAddPatchInfoCommentsCalledWhenTarget auto aubExecutionEnvironment = getEnvironment>(false, true, true); auto aubCsr = aubExecutionEnvironment->template getCsr>(); - LinearStream 
cs(aubExecutionEnvironment->commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 128u, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; - aubCsr->stream = mockAubFileStream.get(); PatchInfoData patchInfoData = {0xAAAAAAAA, 0u, PatchInfoAllocationType::Default, 0x0, 0u, PatchInfoAllocationType::Default}; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 486ab4ddbd..3a5f9bea3e 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -2425,4 +2425,14 @@ HWTEST_F(CommandStreamReceiverHwTest, givenOutOfMemoryFailureOnFlushWhenInitiali commandStreamReceiver.flushReturnValue = SubmissionStatus::OUT_OF_HOST_MEMORY; EXPECT_EQ(SubmissionStatus::OUT_OF_HOST_MEMORY, commandStreamReceiver.initializeDeviceWithFirstSubmission()); +} + +HWTEST_F(CommandStreamReceiverHwTest, whenFlushTagUpdateThenSetStallingCmdsFlag) { + auto &ultCsr = pDevice->getUltCommandStreamReceiver(); + + ultCsr.recordFlusheBatchBuffer = true; + + EXPECT_EQ(SubmissionStatus::SUCCESS, ultCsr.flushTagUpdate()); + + EXPECT_TRUE(ultCsr.latestFlushedBatchBuffer.hasStallingCmds); } \ No newline at end of file diff --git a/shared/test/unit_test/command_stream/compute_mode_tests.h b/shared/test/unit_test/command_stream/compute_mode_tests.h index 2cadf67291..a8d16450b0 100644 --- a/shared/test/unit_test/command_stream/compute_mode_tests.h +++ b/shared/test/unit_test/command_stream/compute_mode_tests.h @@ -95,6 +95,6 @@ struct ComputeModeRequirements : public ::testing::Test { CommandStreamReceiver *csr = nullptr; std::unique_ptr device; - DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, 
ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; + DispatchFlags flags{{}, nullptr, {}, nullptr, QueueThrottle::MEDIUM, PreemptionMode::Disabled, GrfConfig::DefaultGrfNumber, L3CachingSettings::l3CacheOn, ThreadArbitrationPolicy::NotPresent, AdditionalKernelExecInfo::NotApplicable, KernelExecutionType::NotApplicable, MemoryCompressionState::NotApplicable, QueueSliceCount::defaultSliceCount, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false}; GraphicsAllocation *alloc = nullptr; }; diff --git a/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp b/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp index 74af81de1f..26341c000d 100644 --- a/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp +++ b/shared/test/unit_test/command_stream/tbx_command_stream_tests.cpp @@ -467,7 +467,7 @@ HWTEST_F(TbxCommandStreamTests, givenTbxCommandStreamReceiverWhenFlushIsCalledTh auto commandBuffer = pDevice->executionEnvironment->memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize, pDevice->getDeviceBitfield()}); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 1, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 1, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false, false}; MockGraphicsAllocation allocation(reinterpret_cast(0x1000), 0x1000); ResidencyContainer allocationsForResidency = {&allocation}; @@ -493,7 
+493,7 @@ HWTEST_F(TbxCommandStreamTests, givenTbxCommandStreamReceiverInBatchedModeWhenFl auto commandBuffer = pDevice->executionEnvironment->memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{pDevice->getRootDeviceIndex(), MemoryConstants::pageSize, pDevice->getDeviceBitfield()}); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 1, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 1, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false, false}; ResidencyContainer allocationsForResidency; tbxCsr.flush(batchBuffer, allocationsForResidency); diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 2e18de14ac..6643dd95e7 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -913,7 +913,7 @@ struct DirectSubmissionRelaxedOrderingTests : public DirectSubmissionDispatchBuf } template - bool verifySchedulerProgramming(LinearStream &cs, uint64_t deferredTaskListVa, uint64_t semaphoreGpuVa, uint32_t semaphoreValue, size_t offset); + bool verifySchedulerProgramming(LinearStream &cs, uint64_t deferredTaskListVa, uint64_t semaphoreGpuVa, uint32_t semaphoreValue, size_t offset, size_t &endOffset); template bool verifyMiPredicate(void *miPredicateCmd, MiPredicateType predicateType); @@ -1187,7 +1187,7 @@ bool DirectSubmissionRelaxedOrderingTests::verifyConditionalDataRegBbStart(void } template -bool DirectSubmissionRelaxedOrderingTests::verifySchedulerProgramming(LinearStream &cs, uint64_t deferredTaskListVa, uint64_t semaphoreGpuVa, uint32_t semaphoreValue, size_t offset) { +bool 
DirectSubmissionRelaxedOrderingTests::verifySchedulerProgramming(LinearStream &cs, uint64_t deferredTaskListVa, uint64_t semaphoreGpuVa, uint32_t semaphoreValue, size_t offset, size_t &endOffset) { using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_SET_PREDICATE = typename FamilyType::MI_SET_PREDICATE; @@ -1501,6 +1501,8 @@ bool DirectSubmissionRelaxedOrderingTests::verifySchedulerProgramming(LinearStre if (!verifyLri(lriCmd, CS_GPR_R5, 0)) { continue; } + lriCmd++; + endOffset = ptrDiff(lriCmd, cs.getCpuBase()); success = true; break; @@ -1704,9 +1706,6 @@ HWTEST_F(DirectSubmissionRelaxedOrderingTests, givenNotEnoughSpaceForTaskStoreSe } HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenDispatchingWorkThenDispatchScheduler, IsAtLeastXeHpcCore) { - using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; - using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; - using MI_MATH = typename FamilyType::MI_MATH; using Dispatcher = RenderDispatcher; MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); @@ -1717,18 +1716,209 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenDispatchingWorkThenDispatchS uint64_t deferredTasksListVa = directSubmission.deferredTasksListAllocation->getGpuAddress(); uint64_t semaphoreGpuVa = directSubmission.semaphoreGpuVa; - EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, 0)); + size_t endOffset = 0; + + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, 0, endOffset)); FlushStampTracker flushStamp(true); directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); - EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, 
deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, offset)); + EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, offset, endOffset)); offset = directSubmission.ringCommandStream.getUsed(); directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); - EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, offset)); + EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, offset, endOffset)); +} + +HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithStallingCmdsWhenDispatchingThenProgramSchedulerWithR5, IsAtLeastXeHpcCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + using Dispatcher = RenderDispatcher; + + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + directSubmission.initialize(true, false); + size_t offset = directSubmission.ringCommandStream.getUsed(); + + uint64_t deferredTasksListVa = directSubmission.deferredTasksListAllocation->getGpuAddress(); + uint64_t semaphoreGpuVa = directSubmission.semaphoreGpuVa; + + size_t endOffset = 0; + + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, 0, endOffset)); + + FlushStampTracker flushStamp(true); + batchBuffer.hasStallingCmds = false; + directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); + + EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, offset, endOffset)); + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, 
directSubmission.currentQueueWorkCount, endOffset, endOffset)); + + offset = directSubmission.ringCommandStream.getUsed(); + + batchBuffer.hasStallingCmds = true; + directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); + + HardwareParse hwParse; + hwParse.parseCommands(directSubmission.ringCommandStream, offset); + hwParse.findHardwareCommands(); + + bool success = false; + MI_LOAD_REGISTER_IMM *lriCmd = nullptr; + + for (auto &it : hwParse.cmdList) { + lriCmd = genCmdCast(it); + if (lriCmd) { + if (CS_GPR_R5 == lriCmd->getRegisterOffset() && lriCmd->getDataDword() == 1) { + success = true; + break; + } + } + } + + ASSERT_TRUE(success); + offset = ptrDiff(++lriCmd, directSubmission.ringCommandStream.getCpuBase()); + EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount - 1, offset, endOffset)); + + EXPECT_TRUE(endOffset > offset); + + EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, endOffset, endOffset)); +} + +HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenFirstBbWithStallingCmdsWhenDispatchingThenDontProgramSchedulerWithR5, IsAtLeastXeHpcCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + using Dispatcher = RenderDispatcher; + + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + directSubmission.initialize(true, false); + size_t offset = directSubmission.ringCommandStream.getUsed(); + + uint64_t deferredTasksListVa = directSubmission.deferredTasksListAllocation->getGpuAddress(); + uint64_t semaphoreGpuVa = directSubmission.semaphoreGpuVa; + + size_t endOffset = 0; + + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, 0, endOffset)); + + FlushStampTracker 
flushStamp(true); + batchBuffer.hasStallingCmds = true; + directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); + + HardwareParse hwParse; + hwParse.parseCommands(directSubmission.ringCommandStream, offset); + hwParse.findHardwareCommands(); + + bool success = false; + MI_LOAD_REGISTER_IMM *lriCmd = nullptr; + + for (auto &it : hwParse.cmdList) { + lriCmd = genCmdCast(it); + if (lriCmd) { + if (CS_GPR_R5 == lriCmd->getRegisterOffset() && lriCmd->getDataDword() == 1) { + success = true; + break; + } + } + } + + EXPECT_FALSE(success); +} + +HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenStoppingRingThenProgramSchedulerWithR5, IsAtLeastXeHpcCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + using Dispatcher = RenderDispatcher; + + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + directSubmission.initialize(true, false); + size_t offset = directSubmission.ringCommandStream.getUsed(); + + uint64_t deferredTasksListVa = directSubmission.deferredTasksListAllocation->getGpuAddress(); + uint64_t semaphoreGpuVa = directSubmission.semaphoreGpuVa; + + size_t endOffset = 0; + + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, 0, endOffset)); + + FlushStampTracker flushStamp(true); + batchBuffer.hasStallingCmds = false; + directSubmission.dispatchCommandBuffer(batchBuffer, flushStamp); + + EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, offset, endOffset)); + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, endOffset, endOffset)); + + offset = directSubmission.ringCommandStream.getUsed(); + + directSubmission.stopRingBuffer(); + + HardwareParse hwParse; + 
hwParse.parseCommands(directSubmission.ringCommandStream, offset); + hwParse.findHardwareCommands(); + + bool success = false; + MI_LOAD_REGISTER_IMM *lriCmd = nullptr; + + for (auto &it : hwParse.cmdList) { + lriCmd = genCmdCast(it); + if (lriCmd) { + if (CS_GPR_R5 == lriCmd->getRegisterOffset() && lriCmd->getDataDword() == 1) { + success = true; + break; + } + } + } + + ASSERT_TRUE(success); + offset = ptrDiff(lriCmd, directSubmission.ringCommandStream.getCpuBase()); + EXPECT_TRUE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, offset, endOffset)); + + EXPECT_TRUE(endOffset > offset); + + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, endOffset, endOffset)); +} + +HWTEST2_F(DirectSubmissionRelaxedOrderingTests, WhenStoppingRingWithoutSubmissionThenDontProgramSchedulerWithR5, IsAtLeastXeHpcCore) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + + using Dispatcher = RenderDispatcher; + + MockDirectSubmissionHw directSubmission(*pDevice->getDefaultEngine().commandStreamReceiver); + + directSubmission.initialize(true, false); + size_t offset = directSubmission.ringCommandStream.getUsed(); + + uint64_t deferredTasksListVa = directSubmission.deferredTasksListAllocation->getGpuAddress(); + uint64_t semaphoreGpuVa = directSubmission.semaphoreGpuVa; + + size_t endOffset = 0; + + EXPECT_FALSE(verifySchedulerProgramming(directSubmission.ringCommandStream, deferredTasksListVa, semaphoreGpuVa, directSubmission.currentQueueWorkCount, 0, endOffset)); + + directSubmission.stopRingBuffer(); + + HardwareParse hwParse; + hwParse.parseCommands(directSubmission.ringCommandStream, offset); + hwParse.findHardwareCommands(); + + bool success = false; + MI_LOAD_REGISTER_IMM *lriCmd = nullptr; + + for (auto &it : hwParse.cmdList) { + lriCmd = genCmdCast(it); + if 
(lriCmd) { + if (CS_GPR_R5 == lriCmd->getRegisterOffset() && lriCmd->getDataDword() == 1) { + success = true; + break; + } + } + } + + EXPECT_FALSE(success); } HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenProgrammingEndingCmdsThenSetReturnRegisters, IsAtLeastXeHpcCore) { diff --git a/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp b/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp index 521dd71b2f..65ef67f795 100644 --- a/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_command_stream_tests_1.cpp @@ -212,7 +212,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenLowPriorityContextWhenFlushingThen CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, nullptr, false, true, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); csr->flush(batchBuffer, csr->getResidencyAllocations()); EXPECT_NE(cs.getCpuBase(), nullptr); @@ -228,7 +228,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenLowPriorityContextWhenFlushingThen } HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenInvalidAddressWhenFlushingThenSucceeds) { - //allocate command buffer manually + // allocate command buffer manually char *commandBuffer = new (std::nothrow) char[1024]; ASSERT_NE(nullptr, commandBuffer); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) @@ -300,7 +300,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenNotAlignedWhenFlushingThenSucceeds auto &cs = csr->getCS(); auto commandBuffer = static_cast(cs.getGraphicsAllocation()); - //make sure command buffer with offset is not page aligned + // make sure command buffer with offset is not page aligned ASSERT_NE(0u, 
(reinterpret_cast(commandBuffer->getUnderlyingBuffer()) + 4) & (this->alignment - 1)); ASSERT_EQ(4u, (reinterpret_cast(commandBuffer->getUnderlyingBuffer()) + 4) & 0x7F); @@ -309,7 +309,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenNotAlignedWhenFlushingThenSucceeds CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; csr->flush(batchBuffer, csr->getResidencyAllocations()); EXPECT_EQ(1, mock->ioctlCount.gemUserptr); @@ -344,7 +345,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenCheckDrmFreeWhenFlushingThenSuccee auto &cs = csr->getCS(); auto commandBuffer = static_cast(cs.getGraphicsAllocation()); - //make sure command buffer with offset is not page aligned + // make sure command buffer with offset is not page aligned ASSERT_NE(0u, (reinterpret_cast(commandBuffer->getUnderlyingBuffer()) + 4) & (this->alignment - 1)); ASSERT_EQ(4u, (reinterpret_cast(commandBuffer->getUnderlyingBuffer()) + 4) & 0x7F); @@ -356,7 +357,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenCheckDrmFreeWhenFlushingThenSuccee csr->makeResident(allocation); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; csr->flush(batchBuffer, csr->getResidencyAllocations()); EXPECT_EQ(1, mock->ioctlCount.gemUserptr); @@ -384,7 +386,7 @@ 
HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenCheckDrmFreeCloseFailedWhenFlushin auto &cs = csr->getCS(); auto commandBuffer = static_cast(cs.getGraphicsAllocation()); - //make sure command buffer with offset is not page aligned + // make sure command buffer with offset is not page aligned ASSERT_NE(0u, (reinterpret_cast(commandBuffer->getUnderlyingBuffer()) + 4) & (this->alignment - 1)); ASSERT_EQ(4u, (reinterpret_cast(commandBuffer->getUnderlyingBuffer()) + 4) & 0x7F); @@ -398,7 +400,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamTest, GivenCheckDrmFreeCloseFailedWhenFlushin csr->makeResident(allocation); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; csr->flush(batchBuffer, csr->getResidencyAllocations()); EXPECT_EQ(1, mock->ioctlCount.gemUserptr); @@ -486,12 +489,12 @@ HWTEST_TEMPLATED_F(DrmCommandStreamBatchingTests, givenCsrWhenDispatchPolicyIsSe csr->flushTask(cs, 0u, &cs, &cs, &cs, 0u, dispatchFlags, *device); - //make sure command buffer is recorded + // make sure command buffer is recorded auto &cmdBuffers = mockedSubmissionsAggregator->peekCommandBuffers(); EXPECT_FALSE(cmdBuffers.peekIsEmpty()); EXPECT_NE(nullptr, cmdBuffers.peekHead()); - //preemption allocation + // preemption allocation size_t csrSurfaceCount = (device->getPreemptionMode() == PreemptionMode::MidThread) ? 2 : 0; csrSurfaceCount += testedCsr->globalFenceAllocation ? 1 : 0; csrSurfaceCount += testedCsr->clearColorAllocation ? 
1 : 0; @@ -500,7 +503,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamBatchingTests, givenCsrWhenDispatchPolicyIsSe auto recordedCmdBuffer = cmdBuffers.peekHead(); EXPECT_EQ(3u + csrSurfaceCount, recordedCmdBuffer->surfaces.size()); - //try to find all allocations + // try to find all allocations auto elementInVector = std::find(recordedCmdBuffer->surfaces.begin(), recordedCmdBuffer->surfaces.end(), dummyAllocation); EXPECT_NE(elementInVector, recordedCmdBuffer->surfaces.end()); @@ -544,7 +547,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamBatchingTests, givenRecordedCommandBufferWhen csr->setTagAllocation(static_cast(allocations->getGraphicsAllocation(csr->getRootDeviceIndex()))); auto &submittedCommandBuffer = csr->getCS(1024); - //use some bytes + // use some bytes submittedCommandBuffer.getSpace(4); DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); @@ -566,13 +569,13 @@ HWTEST_TEMPLATED_F(DrmCommandStreamBatchingTests, givenRecordedCommandBufferWhen auto commandBufferGraphicsAllocation = submittedCommandBuffer.getGraphicsAllocation(); EXPECT_TRUE(commandBufferGraphicsAllocation->isResident(csr->getOsContext().getContextId())); - //preemption allocation + // preemption allocation size_t csrSurfaceCount = (device->getPreemptionMode() == PreemptionMode::MidThread) ? 2 : 0; csrSurfaceCount += testedCsr->globalFenceAllocation ? 1 : 0; csrSurfaceCount += testedCsr->clearColorAllocation ? 1 : 0; csrSurfaceCount += testedCsr->getKernelArgsBufferAllocation() ? 
1 : 0; - //validate that submited command buffer has what we want + // validate that submited command buffer has what we want EXPECT_EQ(3u + csrSurfaceCount, this->mock->execBuffer.getBufferCount()); EXPECT_EQ(4u, this->mock->execBuffer.getBatchStartOffset()); EXPECT_EQ(submittedCommandBuffer.getUsed(), this->mock->execBuffer.getBatchLen()); @@ -813,7 +816,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamDirectSubmissionTest, givenDirectSubmissionFa auto &cs = csr->getCS(); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; uint8_t bbStart[64]; batchBuffer.endCmdPtr = &bbStart[0]; @@ -832,7 +836,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamBlitterDirectSubmissionTest, givenBlitterDire auto &cs = csr->getCS(); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; uint8_t bbStart[64]; batchBuffer.endCmdPtr = &bbStart[0]; @@ -855,7 +860,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamDirectSubmissionTest, givenEnabledDirectSubmi auto &cs = csr->getCS(); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = 
BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; uint8_t bbStart[64]; batchBuffer.endCmdPtr = &bbStart[0]; @@ -874,7 +880,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamDirectSubmissionTest, givenEnabledDirectSubmi auto &cs = csr->getCS(); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; uint8_t bbStart[64]; batchBuffer.endCmdPtr = &bbStart[0]; @@ -892,7 +899,8 @@ HWTEST_TEMPLATED_F(DrmCommandStreamBlitterDirectSubmissionTest, givenEnabledDire auto &cs = csr->getCS(); CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 4, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.startOffset = 4; uint8_t bbStart[64]; batchBuffer.endCmdPtr = &bbStart[0]; diff --git a/shared/test/unit_test/os_interface/linux/drm_command_stream_xehp_and_later_prelim_tests.cpp b/shared/test/unit_test/os_interface/linux/drm_command_stream_xehp_and_later_prelim_tests.cpp index 1be40107bc..52eaebc000 100644 --- a/shared/test/unit_test/os_interface/linux/drm_command_stream_xehp_and_later_prelim_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_command_stream_xehp_and_later_prelim_tests.cpp @@ -351,11 +351,7 @@ HWTEST_TEMPLATED_F(DrmCommandStreamForceTileTest, givenForceExecutionTileThenCor auto &cs = csr->getCS(); CommandStreamReceiverHw::addBatchBufferEnd(cs, 
nullptr); EncodeNoop::alignToCacheLine(cs); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), - 0, 0, 0, nullptr, false, false, - QueueThrottle::MEDIUM, - QueueSliceCount::defaultSliceCount, - cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); csr->flush(batchBuffer, csr->getResidencyAllocations()); } @@ -609,7 +605,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, DrmImplicitScalingCommandStreamTest, givenUseSingle csr->CommandStreamReceiver::makeResident(*allocation); auto &cs = csr->getCS(); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, true}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.useSingleSubdevice = true; csr->flush(batchBuffer, csr->getResidencyAllocations()); diff --git a/shared/test/unit_test/os_interface/linux/drm_memory_manager_localmem_prelim_tests.cpp b/shared/test/unit_test/os_interface/linux/drm_memory_manager_localmem_prelim_tests.cpp index 88b212ee7a..0a30e742c3 100644 --- a/shared/test/unit_test/os_interface/linux/drm_memory_manager_localmem_prelim_tests.cpp +++ b/shared/test/unit_test/os_interface/linux/drm_memory_manager_localmem_prelim_tests.cpp @@ -13,6 +13,7 @@ #include "shared/source/memory_manager/memory_banks.h" #include "shared/source/memory_manager/unified_memory_manager.h" #include "shared/source/os_interface/linux/allocator_helper.h" +#include "shared/test/common/helpers/batch_buffer_helper.h" #include "shared/test/common/libult/linux/drm_mock_helper.h" #include "shared/test/common/libult/linux/drm_mock_prelim_context.h" #include "shared/test/common/libult/linux/drm_query_mock.h" @@ -2270,7 +2271,7 @@ struct DrmCommandStreamEnhancedPrelimTest : public DrmCommandStreamEnhancedTempl LinearStream cs(commandBuffer); 
CommandStreamReceiverHw::addBatchBufferEnd(cs, nullptr); EncodeNoop::alignToCacheLine(cs); - this->batchBuffer = BatchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + this->batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); this->allocation = this->mm->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); this->csr->makeResident(*this->allocation); } diff --git a/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp b/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp index 6713b2bd01..cd50bfa5e9 100644 --- a/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp +++ b/shared/test/unit_test/os_interface/windows/device_command_stream_tests.cpp @@ -288,7 +288,7 @@ TEST_F(WddmCommandStreamTest, GivenOffsetWhenFlushingThenFlushIsSubmittedCorrect ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), offset, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer{cs.getGraphicsAllocation(), offset, 0, 0, nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false, false}; csr->flush(batchBuffer, csr->getResidencyAllocations()); EXPECT_EQ(1u, wddm->submitResult.called); EXPECT_TRUE(wddm->submitResult.success); @@ -428,7 +428,8 @@ TEST_F(WddmCommandStreamTest, givenWdmmWhenSubmitIsCalledWhenEUCountWouldBeOddTh wddm->getGtSysInfo()->EUCount = 9; wddm->getGtSysInfo()->SubSliceCount = 3; - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, nullptr, false, false, QueueThrottle::LOW, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + 
BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.throttle = QueueThrottle::LOW; csr->flush(batchBuffer, csr->getResidencyAllocations()); auto commandHeader = wddm->submitResult.commandHeaderSubmitted; @@ -446,7 +447,8 @@ TEST_F(WddmCommandStreamTest, givenWdmmWhenSubmitIsCalledAndThrottleIsToLowThenS ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, nullptr, false, false, QueueThrottle::LOW, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.throttle = QueueThrottle::LOW; csr->flush(batchBuffer, csr->getResidencyAllocations()); auto commandHeader = wddm->submitResult.commandHeaderSubmitted; @@ -482,7 +484,8 @@ TEST_F(WddmCommandStreamTest, givenWdmmWhenSubmitIsCalledAndThrottleIsToHighThen ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, nullptr, false, false, QueueThrottle::HIGH, QueueSliceCount::defaultSliceCount, cs.getUsed(), &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.throttle = QueueThrottle::HIGH; csr->flush(batchBuffer, csr->getResidencyAllocations()); auto commandHeader = wddm->submitResult.commandHeaderSubmitted; @@ -881,7 +884,7 @@ HWTEST_TEMPLATED_F(WddmCommandStreamMockGdiTest, WhenMakingResidentThenResidency HWTEST_TEMPLATED_F(WddmCommandStreamMockGdiTest, givenRecordedCommandBufferWhenItIsSubmittedThenFlushTaskIsProperlyCalled) { auto mockCsr = static_cast *>(csr); - //preemption allocation + sip allocation + // preemption allocation + sip allocation size_t csrSurfaceCount = 0; if (device->getPreemptionMode() == PreemptionMode::MidThread) { 
csrSurfaceCount = 2; @@ -1120,9 +1123,7 @@ HWTEST_TEMPLATED_F(WddmCommandStreamMockGdiTest, givenDirectSubmissionFailsThenF GraphicsAllocation *commandBuffer = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, - nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), - &cs, commandBuffer->getUnderlyingBuffer(), false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); mockCsr->directSubmission = std::make_unique(*device->getDefaultEngine().commandStreamReceiver); auto res = csr->flush(batchBuffer, csr->getResidencyAllocations()); @@ -1155,9 +1156,9 @@ HWTEST_TEMPLATED_F(WddmCommandStreamMockGdiTest, givenDirectSubmissionEnabledOnR GraphicsAllocation *commandBuffer = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, - nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), - &cs, commandBuffer->getUnderlyingBuffer(), false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.endCmdPtr = commandBuffer->getUnderlyingBuffer(); + csr->flush(batchBuffer, csr->getResidencyAllocations()); auto directSubmission = reinterpret_cast(mockCsr->directSubmission.get()); EXPECT_TRUE(directSubmission->ringStart); @@ -1196,9 +1197,9 @@ HWTEST_TEMPLATED_F(WddmCommandStreamMockGdiTest, givenDirectSubmissionEnabledOnB GraphicsAllocation *commandBuffer = 
memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, - nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), - &cs, commandBuffer->getUnderlyingBuffer(), false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + batchBuffer.endCmdPtr = commandBuffer->getUnderlyingBuffer(); + csr->flush(batchBuffer, csr->getResidencyAllocations()); auto directSubmission = reinterpret_cast(mockCsr->blitterDirectSubmission.get()); EXPECT_TRUE(directSubmission->ringStart); @@ -1223,9 +1224,8 @@ TEST_F(WddmCommandStreamTest, givenResidencyLoggingAvailableWhenFlushingCommandB GraphicsAllocation *commandBuffer = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), MemoryConstants::pageSize}); ASSERT_NE(nullptr, commandBuffer); LinearStream cs(commandBuffer); - BatchBuffer batchBuffer{cs.getGraphicsAllocation(), 0, 0, 0, - nullptr, false, false, QueueThrottle::MEDIUM, QueueSliceCount::defaultSliceCount, cs.getUsed(), - &cs, nullptr, false}; + BatchBuffer batchBuffer = BatchBufferHelper::createDefaultBatchBuffer(cs.getGraphicsAllocation(), &cs, cs.getUsed()); + DebugManagerStateRestore restorer; DebugManager.flags.WddmResidencyLogger.set(1);