From 2c5cbec0338527af5e437ba191b4dd14ba85f4d5 Mon Sep 17 00:00:00 2001 From: Tomasz Biernacik Date: Thu, 10 Jul 2025 10:59:42 +0000 Subject: [PATCH] feature: control post sync completion check Related-To: NEO-14844 Signed-off-by: Tomasz Biernacik --- level_zero/core/source/cmdlist/cmdlist.h | 12 + level_zero/core/source/cmdlist/cmdlist_hw.inl | 28 +++ .../source/cmdlist/cmdlist_hw_immediate.inl | 39 ++-- .../cmdlist/cmdlist_hw_skl_to_tgllp.inl | 1 + .../cmdlist/cmdlist_hw_xehp_and_later.inl | 1 + level_zero/core/source/cmdqueue/cmdqueue.h | 7 + .../core/source/cmdqueue/cmdqueue_hw.inl | 3 + .../sources/cmdlist/test_cmdlist_1.cpp | 219 ++++++++++++++++++ opencl/source/command_queue/command_queue.cpp | 1 + opencl/source/command_queue/command_queue.h | 16 ++ opencl/source/command_queue/enqueue_common.h | 3 + .../command_queue/hardware_interface_base.inl | 5 + .../command_queue/dispatch_walker_tests.cpp | 62 +++-- .../command_stream/command_stream_receiver.h | 1 + .../command_stream_receiver_hw_base.inl | 16 ++ ...mand_stream_receiver_hw_xehp_and_later.inl | 2 + .../source/command_stream/csr_definitions.h | 2 + shared/source/helpers/pipe_control_args.h | 3 +- shared/source/os_interface/product_helper.h | 1 + shared/source/os_interface/product_helper.inl | 5 + .../source/os_interface/product_helper_hw.h | 1 + .../libult/ult_command_stream_receiver.h | 1 + .../command_stream_receiver_tests.cpp | 46 ++++ .../os_interface/product_helper_tests.cpp | 4 + 24 files changed, 447 insertions(+), 32 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 50865c7cf7..9126e7e71b 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -456,6 +456,16 @@ struct CommandList : _ze_command_list_handle_t { return this->captureTarget->capture(apiArgs...); } + inline bool getIsWalkerWithProfilingEnqueued() { + return this->isWalkerWithProfilingEnqueued; + } + + inline bool 
getAndClearIsWalkerWithProfilingEnqueued() { + bool retVal = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; + return retVal; + } + protected: NEO::GraphicsAllocation *getAllocationFromHostPtrMap(const void *buffer, uint64_t bufferSize, bool copyOffload); NEO::GraphicsAllocation *getHostPtrAlloc(const void *buffer, uint64_t bufferSize, bool hostCopyAllowed, bool copyOffload); @@ -553,6 +563,8 @@ struct CommandList : _ze_command_list_handle_t { bool l3FlushAfterPostSyncRequired = false; bool textureCacheFlushPending = false; bool closedCmdList = false; + bool isWalkerWithProfilingEnqueued = false; + bool shouldRegisterEnqueuedWalkerWithProfiling = false; Graph *captureTarget = nullptr; }; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 6e465ee165..4471880269 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -157,6 +157,7 @@ ze_result_t CommandListCoreFamily::reset() { taskCountUpdateFenceRequired = false; textureCacheFlushPending = false; closedCmdList = false; + isWalkerWithProfilingEnqueued = false; this->inOrderPatchCmds.clear(); @@ -273,6 +274,7 @@ ze_result_t CommandListCoreFamily::initialize(Device *device, NEO this->compactL3FlushEventPacket = L0GfxCoreHelper::useCompactL3FlushEventPacket(hwInfo, this->l3FlushAfterPostSyncRequired); this->useAdditionalBlitProperties = productHelper.useAdditionalBlitProperties(); this->isPostImageWriteFlushRequired = releaseHelper ? 
releaseHelper->isPostImageWriteFlushRequired() : false; + this->shouldRegisterEnqueuedWalkerWithProfiling = this->device->getNEODevice()->getProductHelper().shouldRegisterEnqueuedWalkerWithProfiling(); if (NEO::debugManager.flags.OverrideThreadArbitrationPolicy.get() != -1) { this->defaultPipelinedThreadArbitrationPolicy = NEO::debugManager.flags.OverrideThreadArbitrationPolicy.get(); @@ -448,6 +450,10 @@ ze_result_t CommandListCoreFamily::appendLaunchKernel(ze_kernel_h if (!launchParams.isKernelSplitOperation) { event->resetKernelCountAndPacketUsedCount(); } + + if (event->isEventTimestampFlagSet() && this->shouldRegisterEnqueuedWalkerWithProfiling) { + this->isWalkerWithProfilingEnqueued = true; + } } if (!handleCounterBasedEventOperations(event, launchParams.omitAddingEventResidency)) { @@ -501,6 +507,10 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelIndirect(ze_ event->setKernelWithPrintfDeviceMutex(kernel->getDevicePrintfKernelMutex()); } launchParams.isHostSignalScopeEvent = event->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST); + + if (event->isEventTimestampFlagSet() && this->shouldRegisterEnqueuedWalkerWithProfiling) { + this->isWalkerWithProfilingEnqueued = true; + } } if (!handleCounterBasedEventOperations(event, false)) { @@ -547,6 +557,10 @@ ze_result_t CommandListCoreFamily::appendLaunchMultipleKernelsInd if (hEvent) { event = Event::fromHandle(hEvent); launchParams.isHostSignalScopeEvent = event->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST); + + if (this->shouldRegisterEnqueuedWalkerWithProfiling && event->isEventTimestampFlagSet()) { + this->isWalkerWithProfilingEnqueued = true; + } } if (!handleCounterBasedEventOperations(event, false)) { @@ -1476,6 +1490,12 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyKernelWithGA(v auto lock = device->getBuiltinFunctionsLib()->obtainUniqueOwnership(); + if (signalEvent) { + if (signalEvent->isEventTimestampFlagSet() && this->shouldRegisterEnqueuedWalkerWithProfiling) { + this->isWalkerWithProfilingEnqueued 
= true; + } + } + Kernel *builtinKernel = nullptr; builtinKernel = device->getBuiltinFunctionsLib()->getFunction(builtin); @@ -2351,6 +2371,9 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, signalEvent = Event::fromHandle(hSignalEvent); launchParams.isHostSignalScopeEvent = signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST); dcFlush = getDcFlushRequired(signalEvent->isSignalScope()); + if (signalEvent->isEventTimestampFlagSet() && this->shouldRegisterEnqueuedWalkerWithProfiling) { + this->isWalkerWithProfilingEnqueued = true; + } } if (isCopyOnly(memoryCopyParams.copyOffloadAllowed)) { @@ -3267,6 +3290,7 @@ void CommandListCoreFamily::appendSignalInOrderDependencyCounter( args.dcFlushEnable = true; args.workloadPartitionOffset = partitionCount > 1; args.textureCacheInvalidationEnable = textureFlushRequired; + args.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( *cmdStream, @@ -3484,6 +3508,7 @@ ze_result_t CommandListCoreFamily::appendWriteGlobalTimestamp( } else { NEO::PipeControlArgs args; args.blockSettingPostSyncProperties = true; + args.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( *commandContainer.getCommandStream(), @@ -4291,6 +4316,7 @@ ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc NEO::PipeControlArgs args; args.dcFlushEnable = getDcFlushRequired(!!descriptor->writeScope); args.dcFlushEnable &= dstAllocationStruct.needsFlush; + args.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( *commandContainer.getCommandStream(), @@ -4428,6 +4454,7 @@ void CommandListCoreFamily::dispatchPostSyncCommands(const CmdLis NEO::PipeControlArgs pipeControlArgs; pipeControlArgs.dcFlushEnable = getDcFlushRequired(signalScope); 
pipeControlArgs.workloadPartitionOffset = eventOperations.workPartitionOperation; + pipeControlArgs.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); const auto &productHelper = this->device->getNEODevice()->getRootDeviceEnvironment().template getHelper(); if (productHelper.isDirectSubmissionConstantCacheInvalidationNeeded(this->device->getHwInfo())) { @@ -4853,6 +4880,7 @@ void CommandListCoreFamily::programEventL3Flush(Event *event) { NEO::PipeControlArgs args; args.dcFlushEnable = true; args.workloadPartitionOffset = partitionCount > 1; + args.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( cmdListStream, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index e96d0df528..48233b23b0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -263,15 +263,17 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushImmedia NEO::LinearStream *optionalEpilogueCmdStream = getOptionalEpilogueCmdStream(&cmdStreamTask, appendOperation); NEO::ImmediateDispatchFlags dispatchFlags{ - &this->requiredStreamState, // requiredState - sshCpuPointer, // sshCpuBase - optionalEpilogueCmdStream, // optionalEpilogueCmdStream - appendOperation, // dispatchOperation - this->isSyncModeQueue, // blockingAppend - requireTaskCountUpdate, // requireTaskCountUpdate - hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies - hasStallingCmds // hasStallingCmds + &this->requiredStreamState, // requiredState + sshCpuPointer, // sshCpuBase + optionalEpilogueCmdStream, // optionalEpilogueCmdStream + appendOperation, // dispatchOperation + this->isSyncModeQueue, // blockingAppend + requireTaskCountUpdate, // requireTaskCountUpdate + hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies + 
hasStallingCmds, // hasStallingCmds + this->isWalkerWithProfilingEnqueued // isWalkerWithProfilingEnqueued }; + this->isWalkerWithProfilingEnqueued = false; CommandListImp::storeReferenceTsToMappedEvents(true); return getCsr(false)->flushImmediateTask(cmdStreamTask, @@ -294,15 +296,17 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushImmedia NEO::LinearStream *optionalEpilogueCmdStream = getOptionalEpilogueCmdStream(&cmdStreamTask, appendOperation); NEO::ImmediateDispatchFlags dispatchFlags{ - nullptr, // requiredState - sshCpuPointer, // sshCpuBase - optionalEpilogueCmdStream, // optionalEpilogueCmdStream - appendOperation, // dispatchOperation - this->isSyncModeQueue, // blockingAppend - requireTaskCountUpdate, // requireTaskCountUpdate - hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies - hasStallingCmds // hasStallingCmds + nullptr, // requiredState + sshCpuPointer, // sshCpuBase + optionalEpilogueCmdStream, // optionalEpilogueCmdStream + appendOperation, // dispatchOperation + this->isSyncModeQueue, // blockingAppend + requireTaskCountUpdate, // requireTaskCountUpdate + hasRelaxedOrderingDependencies, // hasRelaxedOrderingDependencies + hasStallingCmds, // hasStallingCmds + this->isWalkerWithProfilingEnqueued // isWalkerWithProfilingEnqueued }; + this->isWalkerWithProfilingEnqueued = false; CommandListImp::storeReferenceTsToMappedEvents(true); return getCsr(false)->flushImmediateTaskStateless(cmdStreamTask, @@ -348,6 +352,7 @@ NEO::CompletionStamp CommandListCoreFamilyImmediate::flushRegular false // isDcFlushRequiredOnStallingCommandsOnNextFlush ); + dispatchFlags.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); dispatchFlags.optionalEpilogueCmdStream = getOptionalEpilogueCmdStream(&cmdStreamTask, appendOperation); auto ioh = (this->commandContainer.getIndirectHeap(NEO::IndirectHeap::Type::indirectObject)); @@ -1806,6 +1811,8 @@ ze_result_t CommandListCoreFamilyImmediate::appendCommandLists(ui return 
ret; } + this->isWalkerWithProfilingEnqueued |= this->cmdQImmediate->getAndClearIsWalkerWithProfilingEnqueued(); + CommandListCoreFamily::appendSignalEventPostWalker(signalEvent, nullptr, nullptr, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl index 8193543579..a811931bfe 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_skl_to_tgllp.inl @@ -329,6 +329,7 @@ void CommandListCoreFamily::appendComputeBarrierCommand() { template inline NEO::PipeControlArgs CommandListCoreFamily::createBarrierFlags() { NEO::PipeControlArgs args; + args.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); return args; } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 56268aa34e..0a85fcda4d 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -632,6 +632,7 @@ NEO::PipeControlArgs CommandListCoreFamily::createBarrierFlags() args.hdcPipelineFlush = true; args.unTypedDataPortCacheFlush = true; args.textureCacheInvalidationEnable = this->consumeTextureCacheFlushPending(); + args.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); return args; } diff --git a/level_zero/core/source/cmdqueue/cmdqueue.h b/level_zero/core/source/cmdqueue/cmdqueue.h index 7ac9f3ec00..1b8b24494d 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.h +++ b/level_zero/core/source/cmdqueue/cmdqueue.h @@ -84,6 +84,12 @@ struct CommandQueue : _ze_command_queue_handle_t { TaskCountType getTaskCount() const { return taskCount; } void setTaskCount(TaskCountType newTaskCount) { taskCount = newTaskCount; } + inline bool getAndClearIsWalkerWithProfilingEnqueued() { + bool retVal = this->isWalkerWithProfilingEnqueued; + 
this->isWalkerWithProfilingEnqueued = false; + return retVal; + } + protected: bool frontEndTrackingEnabled() const; @@ -104,6 +110,7 @@ struct CommandQueue : _ze_command_queue_handle_t { bool dispatchCmdListBatchBufferAsPrimary = false; bool heaplessModeEnabled = false; bool heaplessStateInitEnabled = false; + bool isWalkerWithProfilingEnqueued = false; }; using CommandQueueAllocatorFn = CommandQueue *(*)(Device *device, NEO::CommandStreamReceiver *csr, diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index b2de6d6b80..b962a985cc 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -827,6 +827,8 @@ ze_result_t CommandQueueHw::setupCmdListsAndContextParams( ctx.spaceForResidency += estimateCommandListResidencySize(commandList); } + + this->isWalkerWithProfilingEnqueued = commandList->getIsWalkerWithProfilingEnqueued(); } this->getCsr()->getResidencyAllocations().reserve(ctx.spaceForResidency); @@ -1387,6 +1389,7 @@ void CommandQueueHw::dispatchTaskCountPostSyncRegular( args.dcFlushEnable = this->csr->getDcFlushSupport(); args.workloadPartitionOffset = this->partitionCount > 1; args.notifyEnable = this->csr->isUsedNotifyEnableForPostSync(); + args.isWalkerWithProfilingEnqueued = this->getAndClearIsWalkerWithProfilingEnqueued(); NEO::MemorySynchronizationCommands::addBarrierWithPostSyncOperation( cmdStream, NEO::PostSyncMode::immediateData, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index 06d058d7e6..ac08c1f0de 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -2164,6 +2164,225 @@ HWTEST2_F(CommandListCreateTests, givenInOrderExecutionWhenDispatchingRelaxedOrd NEO::CompareOperation::less, true, FamilyType::isQwordInOrderCounter, 
false)); } +HWTEST2_F(CommandListCreateTests, givenDirectSubmissionAndImmCmdListWhenDispatchingWalkerWithProfilingThenSetCsrFlagIsWalkerWithProfilingEnqueued, IsAtLeastXeCore) { + ze_command_queue_desc_t desc = {}; + desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; + ze_result_t returnValue; + std::unique_ptr commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, returnValue)); + ASSERT_NE(nullptr, commandList); + auto whiteBoxCmdList = static_cast(commandList.get()); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_event_handle_t event = nullptr; + + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event)); + std::unique_ptr eventObject(L0::Event::fromHandle(event)); + + std::unique_ptr mockModule = std::make_unique(device, nullptr, ModuleType::builtin); + Mock<::L0::KernelImp> kernel; + kernel.module = mockModule.get(); + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + + uint8_t srcPtr[64] = {}; + uint8_t dstPtr[64] = {}; + const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U}; + + driverHandle->importExternalPointer(dstPtr, MemoryConstants::pageSize); + + auto ultCsr = static_cast *>(whiteBoxCmdList->getCsr(false)); + + auto verifyFlag = [&ultCsr](ze_result_t result, bool dispatchFlag) { + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(ultCsr->isWalkerWithProfilingEnqueued, dispatchFlag); + ultCsr->isWalkerWithProfilingEnqueued = false; + }; + + auto expectFlagEnabled = true && this->device->getNEODevice()->getProductHelper().shouldRegisterEnqueuedWalkerWithProfiling(); + // 
non-pipelined state + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectFlagEnabled); + + // non-pipelined state already programmed + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectFlagEnabled); + + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams), false); + + verifyFlag(commandList->appendLaunchKernelIndirect(kernel.toHandle(), groupCount, event, 0, nullptr, false), expectFlagEnabled); + + verifyFlag(commandList->appendBarrier(event, 0, nullptr, false), false); + + CmdListMemoryCopyParams copyParams = {}; + verifyFlag(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendEventReset(event), false); + + verifyFlag(commandList->appendSignalEvent(event, false), false); + + verifyFlag(commandList->appendPageFaultCopy(kernel.getIsaAllocation(), kernel.getIsaAllocation(), 1, false), false); + + verifyFlag(commandList->appendWaitOnEvents(1, &event, nullptr, false, true, false, false, false, false), false); + + verifyFlag(commandList->appendWriteGlobalTimestamp(reinterpret_cast(dstPtr), event, 0, nullptr), false); + + if constexpr (FamilyType::supportsSampler) { + auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::copyImageRegion); + auto mockBuiltinKernel = static_cast *>(kernel); + mockBuiltinKernel->setArgRedescribedImageCallBase = false; + + auto image = std::make_unique>>(); + ze_image_region_t imgRegion = {1, 1, 1, 1, 1, 1}; + ze_image_desc_t zeDesc = {}; + zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + 
image->initialize(device, &zeDesc); + auto bytesPerPixel = static_cast(image->getImageInfo().surfaceFormat->imageElementSizeInBytes); + + CmdListMemoryCopyParams copyParams = {}; + + verifyFlag(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyFromMemoryExt(image->toHandle(), dstPtr, &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyToMemoryExt(dstPtr, image->toHandle(), &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectFlagEnabled); + } + + size_t rangeSizes = 1; + const void **ranges = reinterpret_cast(&dstPtr[0]); + verifyFlag(commandList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, event, 0, nullptr), false); + + CmdListKernelLaunchParams cooperativeParams = {}; + cooperativeParams.isCooperative = true; + + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectFlagEnabled); + + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectFlagEnabled); + + driverHandle->releaseImportedPointer(dstPtr); +} + +HWTEST2_F(CommandListCreateTests, givenCmdListWhenDispatchingWalkerWithProfilingThenSetCmdListFlagIsWalkerWithProfilingEnqueued, IsAtLeastXeCore) { + ze_result_t returnValue; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::renderCompute, 0u, returnValue, false)); + ASSERT_NE(nullptr, commandList); + auto whiteBoxCmdList = 
static_cast(commandList.get()); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + + ze_event_desc_t eventDesc = {}; + eventDesc.wait = ZE_EVENT_SCOPE_FLAG_HOST; + + ze_event_handle_t event = nullptr; + + std::unique_ptr eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue)); + EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue); + + ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event)); + std::unique_ptr eventObject(L0::Event::fromHandle(event)); + + std::unique_ptr mockModule = std::make_unique(device, nullptr, ModuleType::builtin); + Mock<::L0::KernelImp> kernel; + kernel.module = mockModule.get(); + ze_group_count_t groupCount{1, 1, 1}; + CmdListKernelLaunchParams launchParams = {}; + + uint8_t srcPtr[64] = {}; + uint8_t dstPtr[64] = {}; + const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U}; + + driverHandle->importExternalPointer(dstPtr, MemoryConstants::pageSize); + + auto verifyFlag = [&whiteBoxCmdList](ze_result_t result, bool dispatchFlag) { + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + EXPECT_EQ(whiteBoxCmdList->getAndClearIsWalkerWithProfilingEnqueued(), dispatchFlag); + }; + + auto expectFlagEnabled = true && device->getNEODevice()->getProductHelper().shouldRegisterEnqueuedWalkerWithProfiling(); + // non-pipelined state + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectFlagEnabled); + + // non-pipelined state already programmed + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectFlagEnabled); + + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams), false); + + verifyFlag(commandList->appendLaunchKernelIndirect(kernel.toHandle(), groupCount, event, 0, nullptr, false), expectFlagEnabled); + + 
verifyFlag(commandList->appendBarrier(event, 0, nullptr, false), false); + + CmdListMemoryCopyParams copyParams = {}; + verifyFlag(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendEventReset(event), false); + + verifyFlag(commandList->appendSignalEvent(event, false), false); + + verifyFlag(commandList->appendPageFaultCopy(kernel.getIsaAllocation(), kernel.getIsaAllocation(), 1, false), false); + + verifyFlag(commandList->appendWaitOnEvents(1, &event, nullptr, false, true, false, false, false, false), false); + + verifyFlag(commandList->appendWriteGlobalTimestamp(reinterpret_cast(dstPtr), event, 0, nullptr), false); + + if constexpr (FamilyType::supportsSampler) { + auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::copyImageRegion); + auto mockBuiltinKernel = static_cast *>(kernel); + mockBuiltinKernel->setArgRedescribedImageCallBase = false; + + auto image = std::make_unique>>(); + ze_image_region_t imgRegion = {1, 1, 1, 1, 1, 1}; + ze_image_desc_t zeDesc = {}; + zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + image->initialize(device, &zeDesc); + auto bytesPerPixel = static_cast(image->getImageInfo().surfaceFormat->imageElementSizeInBytes); + + CmdListMemoryCopyParams copyParams = {}; + + verifyFlag(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, event, 0, nullptr, 
copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyFromMemoryExt(image->toHandle(), dstPtr, &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectFlagEnabled); + + verifyFlag(commandList->appendImageCopyToMemoryExt(dstPtr, image->toHandle(), &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectFlagEnabled); + } + + size_t rangeSizes = 1; + const void **ranges = reinterpret_cast(&dstPtr[0]); + verifyFlag(commandList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, event, 0, nullptr), false); + + CmdListKernelLaunchParams cooperativeParams = {}; + cooperativeParams.isCooperative = true; + + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectFlagEnabled); + + verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectFlagEnabled); + + driverHandle->releaseImportedPointer(dstPtr); +} + TEST_F(CommandListCreateTests, GivenGpuHangWhenCreatingImmCmdListWithSyncModeAndAppendBarrierThenAppendBarrierReturnsDeviceLost) { ze_command_queue_desc_t desc = {}; desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index f89d5544a6..d2296dd7dc 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -137,6 +137,7 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr this->heaplessStateInitEnabled = compilerProductHelper.isHeaplessStateInitEnabled(this->heaplessModeEnabled); this->isForceStateless = compilerProductHelper.isForceToStatelessRequired(); this->l3FlushAfterPostSyncEnabled = productHelper.isL3FlushAfterPostSyncRequired(this->heaplessModeEnabled); + this->shouldRegisterEnqueuedWalkerWithProfiling = productHelper.shouldRegisterEnqueuedWalkerWithProfiling(); } } diff --git 
a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 2507b8f71c..d6842c34ca 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -415,6 +415,20 @@ class CommandQueue : public BaseObject<_cl_command_queue> { return this->isCacheFlushOnNextBcsWriteRequired && this->isImageWriteOperation(cmdType); } + bool getShouldRegisterEnqueuedWalkerWithProfiling() { + return this->shouldRegisterEnqueuedWalkerWithProfiling; + } + + void registerWalkerWithProfilingEnqueued() { + this->isWalkerWithProfilingEnqueued = true; + } + + bool getAndClearIsWalkerWithProfilingEnqueued() { + bool retVal = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; + return retVal; + } + protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); cl_int enqueueWriteMemObjForUnmap(MemObj *memObj, void *mappedPtr, EventsRequest &eventsRequest); @@ -529,6 +543,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool isForceStateless = false; bool l3FlushedAfterCpuRead = true; bool l3FlushAfterPostSyncEnabled = false; + bool isWalkerWithProfilingEnqueued = false; + bool shouldRegisterEnqueuedWalkerWithProfiling = false; }; static_assert(NEO::NonCopyableAndNonMovable); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index dd015eb90f..10bdad6a4e 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -918,6 +918,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( isDcFlushRequiredOnStallingCommandsOnNextFlush() // isDcFlushRequiredOnStallingCommandsOnNextFlush ); + dispatchFlags.isWalkerWithProfilingEnqueued = getAndClearIsWalkerWithProfilingEnqueued(); dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; 
dispatchFlags.pipelineSelectArgs.systolicPipelineSelectMode = systolicPipelineSelectMode; uint32_t lws[3] = {static_cast(multiDispatchInfo.begin()->getLocalWorkgroupSize().x), static_cast(multiDispatchInfo.begin()->getLocalWorkgroupSize().y), static_cast(multiDispatchInfo.begin()->getLocalWorkgroupSize().z)}; @@ -1178,6 +1179,8 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( isDcFlushRequiredOnStallingCommandsOnNextFlush() // isDcFlushRequiredOnStallingCommandsOnNextFlush ); + dispatchFlags.isWalkerWithProfilingEnqueued = getAndClearIsWalkerWithProfilingEnqueued(); + const bool isHandlingBarrier = isStallingCommandsOnNextFlushRequired(); if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index c8540dbf86..4aff39c990 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -155,12 +155,17 @@ void HardwareInterface::dispatchWalker( dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment()); } + if (commandQueue.getShouldRegisterEnqueuedWalkerWithProfiling() && commandQueue.isProfilingEnabled() && walkerArgs.event) { + commandQueue.registerWalkerWithProfilingEnqueued(); + } + if (PauseOnGpuProperties::gpuScratchRegWriteAllowed(debugManager.flags.GpuScratchRegWriteAfterWalker.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount())) { uint32_t registerOffset = debugManager.flags.GpuScratchRegWriteRegisterOffset.get(); uint32_t registerData = debugManager.flags.GpuScratchRegWriteRegisterData.get(); PipeControlArgs args; args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, commandQueue.getDevice().getRootDeviceEnvironment()); + args.isWalkerWithProfilingEnqueued = 
commandQueue.getAndClearIsWalkerWithProfilingEnqueued(); MemorySynchronizationCommands::addBarrierWithPostSyncOperation( *commandStream, PostSyncMode::noWrite, diff --git a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp index c3ce1c801e..c77a6fa2ae 100644 --- a/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp +++ b/opencl/test/unit_test/command_queue/dispatch_walker_tests.cpp @@ -31,6 +31,7 @@ #include "opencl/test/unit_test/command_queue/hardware_interface_helper.h" #include "opencl/test/unit_test/fixtures/cl_device_fixture.h" #include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_event.h" #include "opencl/test/unit_test/mocks/mock_kernel.h" #include "opencl/test/unit_test/mocks/mock_mdi.h" #include "opencl/test/unit_test/mocks/mock_program.h" @@ -1385,30 +1386,61 @@ HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSp } } -HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsAndLocalWorkSizeIsSetThenIohRequiresMoreSpace) { - debugManager.flags.EnableHwGenerationLocalIds.set(0); +HWTEST_F(DispatchWalkerTest, givenProfilingEnabledWhenProgrammingWalkerThenSetIsWalkerWithProfilingEnqueued) { size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; - size_t workGroupSize[3] = {683, 1, 1}; + size_t workGroupSize[3] = {2, 5, 10}; cl_uint dimensions = 1; + auto blockedCommandsData = createBlockedCommandsData(*pCmdQ); + kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u; - UnitTestHelper::adjustKernelDescriptorForImplicitArgs(kernelInfo.kernelDescriptor); - MockKernel kernelWithImplicitArgs(program.get(), kernelInfo, *pClDevice); - ASSERT_EQ(CL_SUCCESS, kernelWithImplicitArgs.initialize()); + kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = false; + MockKernel kernelWithoutImplicitArgs(program.get(), kernelInfo, *pClDevice); + ASSERT_EQ(CL_SUCCESS, 
kernelWithoutImplicitArgs.initialize()); - DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets); - dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1}); - dispatchInfoWithImplicitArgs.setTotalNumberOfWorkgroups({1, 1, 1}); + DispatchInfo dispatchInfoWithoutImplicitArgs(pClDevice, const_cast(&kernelWithoutImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets); + dispatchInfoWithoutImplicitArgs.setNumberOfWorkgroups({1, 1, 1}); + dispatchInfoWithoutImplicitArgs.setTotalNumberOfWorkgroups({1, 1, 1}); + MultiDispatchInfo multiDispatchInfoWithoutImplicitArgs(&kernelWithoutImplicitArgs); + multiDispatchInfoWithoutImplicitArgs.push(dispatchInfoWithoutImplicitArgs); + HardwareInterfaceWalkerArgs walkerArgsWithoutImplicitArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); + walkerArgsWithoutImplicitArgs.blockedCommandsData = blockedCommandsData.get(); + auto *event = new MockEvent(nullptr, 0, 0, 0); - auto iohSizeWithImplicitArgsWithoutLWS = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, pClDevice->getRootDeviceEnvironment()); + { + walkerArgsWithoutImplicitArgs.event = event; + HardwareInterface::template dispatchWalker( + *pCmdQ, + multiDispatchInfoWithoutImplicitArgs, + CsrDependencies(), + walkerArgsWithoutImplicitArgs); - dispatchInfoWithImplicitArgs.setLWS({683, 1, 1}); + EXPECT_FALSE(pCmdQ->getAndClearIsWalkerWithProfilingEnqueued()); + } - auto lws = dispatchInfoWithImplicitArgs.getLocalWorkgroupSize(); - kernelWithImplicitArgs.setLocalWorkSizeValues(static_cast(lws.x), static_cast(lws.y), static_cast(lws.z)); + reinterpret_cast(pCmdQ)->setProfilingEnabled(); + { + walkerArgsWithoutImplicitArgs.event = nullptr; + HardwareInterface::template dispatchWalker( + *pCmdQ, + multiDispatchInfoWithoutImplicitArgs, + CsrDependencies(), + walkerArgsWithoutImplicitArgs); - auto iohSizeWithImplicitArgsWithLWS = 
HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, workGroupSize, pClDevice->getRootDeviceEnvironment()); + EXPECT_FALSE(pCmdQ->getAndClearIsWalkerWithProfilingEnqueued()); + } - EXPECT_LE(iohSizeWithImplicitArgsWithoutLWS, iohSizeWithImplicitArgsWithLWS); + { + walkerArgsWithoutImplicitArgs.event = event; + HardwareInterface::template dispatchWalker( + *pCmdQ, + multiDispatchInfoWithoutImplicitArgs, + CsrDependencies(), + walkerArgsWithoutImplicitArgs); + + EXPECT_EQ(pClDevice->getRootDeviceEnvironment().getProductHelper().shouldRegisterEnqueuedWalkerWithProfiling(), pCmdQ->getAndClearIsWalkerWithProfilingEnqueued()); + } + + event->release(); } diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index da4e1d8583..e167c36720 100644 --- a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -681,6 +681,7 @@ class CommandStreamReceiver : NEO::NonCopyableAndNonMovableClass { bool isEnginePrologueSent = false; bool areExceptionsSent = false; bool isPerDssBackedBufferSent = false; + bool isWalkerWithProfilingEnqueued = false; bool gsbaFor32BitProgrammed = false; bool gsbaStateDirty = true; bool bindingTableBaseAddressRequired = false; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 904b7d48dc..0ef4633be4 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -196,6 +196,8 @@ void CommandStreamReceiverHw::addPipeControlFlushTaskIfNeeded(LinearS const auto programPipeControl = !timestampPacketWriteEnabled; if (programPipeControl) { PipeControlArgs args; + args.isWalkerWithProfilingEnqueued = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; 
MemorySynchronizationCommands::addSingleBarrier(commandStreamCSR, args); } this->taskLevel = taskLevel; @@ -305,6 +307,8 @@ CompletionStamp CommandStreamReceiverHw::flushImmediateTask( ImmediateDispatchFlags &dispatchFlags, Device &device) { + this->isWalkerWithProfilingEnqueued |= dispatchFlags.isWalkerWithProfilingEnqueued; + ImmediateFlushData flushData; if (dispatchFlags.dispatchOperation != AppendOperations::cmdList) { flushData.pipelineSelectFullConfigurationNeeded = !getPreambleSetFlag(); @@ -400,6 +404,8 @@ CompletionStamp CommandStreamReceiverHw::flushTask( DispatchFlags &dispatchFlags, Device &device) { + this->isWalkerWithProfilingEnqueued |= dispatchFlags.isWalkerWithProfilingEnqueued; + if (this->getHeaplessStateInitEnabled()) { return flushTaskHeapless(commandStreamTask, commandStreamStartTask, dsh, ioh, ssh, taskLevel, dispatchFlags, device); } else { @@ -1246,6 +1252,8 @@ SubmissionStatus CommandStreamReceiverHw::flushPipeControl(bool state args.dcFlushEnable = this->dcFlushSupport; args.notifyEnable = isUsedNotifyEnableForPostSync(); args.workloadPartitionOffset = isMultiTileOperationEnabled(); + args.isWalkerWithProfilingEnqueued = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; if (stateCacheFlush) { args.textureCacheInvalidationEnable = true; @@ -1876,6 +1884,8 @@ inline void CommandStreamReceiverHw::processBarrierWithPostSync(Linea args.workloadPartitionOffset = isMultiTileOperationEnabled(); args.stateCacheInvalidationEnable |= dispatchFlags.stateCacheInvalidation || this->heapStorageRequiresRecyclingTag; this->heapStorageRequiresRecyclingTag = false; + args.isWalkerWithProfilingEnqueued = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( commandStreamTask, @@ -1925,6 +1935,8 @@ inline CompletionStamp CommandStreamReceiverHw::handleFlushTaskSubmis this->latestFlushedTaskCount = this->taskCount + 1; } } else { + 
args.isWalkerWithProfilingEnqueued = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; auto commandBuffer = new CommandBuffer(device); commandBuffer->batchBufferEndLocation = batchBuffer.endCmdPtr; commandBuffer->batchBuffer = std::move(batchBuffer); @@ -2246,6 +2258,8 @@ void CommandStreamReceiverHw::dispatchImmediateFlushClientBufferComma args.dcFlushEnable = this->dcFlushSupport; args.notifyEnable = isUsedNotifyEnableForPostSync(); args.workloadPartitionOffset = isMultiTileOperationEnabled(); + args.isWalkerWithProfilingEnqueued = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( epilogueCommandStream, PostSyncMode::immediateData, @@ -2442,6 +2456,8 @@ bool CommandStreamReceiverHw::submitDependencyUpdate(TagNodeBase *tag auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tag); this->programEnginePrologue(commandStream); args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, this->peekRootDeviceEnvironment()); + args.isWalkerWithProfilingEnqueued = this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( commandStream, PostSyncMode::immediateData, diff --git a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl index 1461ac9418..b7d32d06db 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl @@ -199,6 +199,8 @@ inline void CommandStreamReceiverHw::programStallingPostSyncCommandsF args.dcFlushEnable = this->dcFlushSupport && dcFlushRequired; args.hdcPipelineFlush = true; args.unTypedDataPortCacheFlush = true; + args.isWalkerWithProfilingEnqueued |= 
this->isWalkerWithProfilingEnqueued; + this->isWalkerWithProfilingEnqueued = false; if (isMultiTileOperationEnabled()) { args.workloadPartitionOffset = true; ImplicitScalingDispatch::dispatchBarrierCommands(cmdStream, diff --git a/shared/source/command_stream/csr_definitions.h b/shared/source/command_stream/csr_definitions.h index 30880ff71f..807b4c67a5 100644 --- a/shared/source/command_stream/csr_definitions.h +++ b/shared/source/command_stream/csr_definitions.h @@ -127,6 +127,7 @@ struct DispatchFlags { bool stateCacheInvalidation = false; bool isStallingCommandsOnNextFlushRequired = false; bool isDcFlushRequiredOnStallingCommandsOnNextFlush = false; + bool isWalkerWithProfilingEnqueued = false; }; struct CsrSizeRequestFlags { @@ -147,6 +148,7 @@ struct ImmediateDispatchFlags { bool requireTaskCountUpdate = false; bool hasRelaxedOrderingDependencies = false; bool hasStallingCmds = false; + bool isWalkerWithProfilingEnqueued = false; }; } // namespace NEO diff --git a/shared/source/helpers/pipe_control_args.h b/shared/source/helpers/pipe_control_args.h index 95a5f9a298..59feb6e57a 100644 --- a/shared/source/helpers/pipe_control_args.h +++ b/shared/source/helpers/pipe_control_args.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -34,6 +34,7 @@ struct PipeControlArgs { bool depthCacheFlushEnable = false; bool depthStallEnable = false; bool protectedMemoryDisable = false; + bool isWalkerWithProfilingEnqueued = false; }; } // namespace NEO diff --git a/shared/source/os_interface/product_helper.h b/shared/source/os_interface/product_helper.h index 9d87bbcb34..225419674e 100644 --- a/shared/source/os_interface/product_helper.h +++ b/shared/source/os_interface/product_helper.h @@ -274,6 +274,7 @@ class ProductHelper { virtual void adjustRTDispatchGlobals(RTDispatchGlobals &rtDispatchGlobals, const HardwareInfo &hwInfo) const = 0; virtual uint32_t 
getSyncNumRTStacksPerDss(const HardwareInfo &hwInfo) const = 0; virtual uint32_t getNumRtStacksPerDSSForAllocation(const HardwareInfo &hwInfo) const = 0; + virtual bool shouldRegisterEnqueuedWalkerWithProfiling() const = 0; virtual bool getStorageInfoLocalOnlyFlag(LocalMemAllocationMode usmDeviceAllocationMode, bool defaultValue) const = 0; virtual ~ProductHelper() = default; diff --git a/shared/source/os_interface/product_helper.inl b/shared/source/os_interface/product_helper.inl index 7f3c1f8d4a..00474648ef 100644 --- a/shared/source/os_interface/product_helper.inl +++ b/shared/source/os_interface/product_helper.inl @@ -1098,4 +1098,9 @@ uint32_t ProductHelperHw::getNumRtStacksPerDSSForAllocation(const Ha return RayTracingHelper::getAsyncNumRTStacksPerDss(); } +template +bool ProductHelperHw::shouldRegisterEnqueuedWalkerWithProfiling() const { + return false; +} + } // namespace NEO diff --git a/shared/source/os_interface/product_helper_hw.h b/shared/source/os_interface/product_helper_hw.h index a9b646be1f..e61a2f59b0 100644 --- a/shared/source/os_interface/product_helper_hw.h +++ b/shared/source/os_interface/product_helper_hw.h @@ -211,6 +211,7 @@ class ProductHelperHw : public ProductHelper { void adjustRTDispatchGlobals(RTDispatchGlobals &rtDispatchGlobals, const HardwareInfo &hwInfo) const override; uint32_t getSyncNumRTStacksPerDss(const HardwareInfo &hwInfo) const override; uint32_t getNumRtStacksPerDSSForAllocation(const HardwareInfo &hwInfo) const override; + bool shouldRegisterEnqueuedWalkerWithProfiling() const override; ~ProductHelperHw() override = default; diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h index bd61559d4d..1a4d592370 100644 --- a/shared/test/common/libult/ult_command_stream_receiver.h +++ b/shared/test/common/libult/ult_command_stream_receiver.h @@ -79,6 +79,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw { using 
BaseClass::isBlitterDirectSubmissionEnabled; using BaseClass::isDirectSubmissionEnabled; using BaseClass::isPerDssBackedBufferSent; + using BaseClass::isWalkerWithProfilingEnqueued; using BaseClass::makeResident; using BaseClass::pageTableManagerInitialized; using BaseClass::perDssBackedBuffer; diff --git a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index a7b2cb7487..d232930346 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -6498,3 +6498,49 @@ HWTEST_F(CommandStreamReceiverHwTest, givenImmediateFlushTaskCmdListDispatchWhen EXPECT_TRUE(commandStreamReceiver.latestFlushedBatchBuffer.disableFlatRingBuffer); } + +HWTEST_F(CommandStreamReceiverHwTest, GivenWaitOnWalkerPostSyncWhenImmediateFlushTaskCalledThenExpectIsWalkerWithProfilingEnqueuedFlagTrue) { + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + { + immediateFlushTaskFlags.isWalkerWithProfilingEnqueued = false; + commandStreamReceiver.flushImmediateTask(commandStream, + commandStream.getUsed(), + immediateFlushTaskFlags, + *pDevice); + + EXPECT_FALSE(commandStreamReceiver.isWalkerWithProfilingEnqueued); + } + + { + immediateFlushTaskFlags.isWalkerWithProfilingEnqueued = true; + commandStreamReceiver.flushImmediateTask(commandStream, + commandStream.getUsed(), + immediateFlushTaskFlags, + *pDevice); + + EXPECT_TRUE(commandStreamReceiver.isWalkerWithProfilingEnqueued); + } + + { + immediateFlushTaskFlags.isWalkerWithProfilingEnqueued = false; + commandStreamReceiver.isWalkerWithProfilingEnqueued = true; + commandStreamReceiver.flushImmediateTask(commandStream, + commandStream.getUsed(), + immediateFlushTaskFlags, + *pDevice); + + EXPECT_TRUE(commandStreamReceiver.isWalkerWithProfilingEnqueued); + } + + { + immediateFlushTaskFlags.isWalkerWithProfilingEnqueued = true; + 
commandStreamReceiver.isWalkerWithProfilingEnqueued = true; + commandStreamReceiver.flushImmediateTask(commandStream, + commandStream.getUsed(), + immediateFlushTaskFlags, + *pDevice); + + EXPECT_TRUE(commandStreamReceiver.isWalkerWithProfilingEnqueued); + } +} diff --git a/shared/test/unit_test/os_interface/product_helper_tests.cpp b/shared/test/unit_test/os_interface/product_helper_tests.cpp index 8e2324f3f1..13d0108769 100644 --- a/shared/test/unit_test/os_interface/product_helper_tests.cpp +++ b/shared/test/unit_test/os_interface/product_helper_tests.cpp @@ -1239,3 +1239,7 @@ HWTEST2_F(ProductHelperTest, givenProductHelperWhenPidFdOrSocketForIpcIsNotSuppo HWTEST2_F(ProductHelperTest, givenProductHelperWhenPidFdOrSocketForIpcIsNotSupportedThenFalseReturned, IsAtMostXeCore) { EXPECT_FALSE(productHelper->isPidFdOrSocketForIpcSupported()); } + +HWTEST_F(ProductHelperTest, givenProductHelperWhenAskingShouldRegisterEnqueuedWalkerWithProfilingThenFalseReturned) { + EXPECT_FALSE(productHelper->shouldRegisterEnqueuedWalkerWithProfiling()); +}