From 68755178a01ccb98476d0778434d657e294469d0 Mon Sep 17 00:00:00 2001 From: Damian Tomczak Date: Wed, 24 Sep 2025 09:39:02 +0000 Subject: [PATCH] feature: isForceStatelessRefactor Related-to: NEO-6075 Signed-off-by: Damian Tomczak --- level_zero/core/source/cmdlist/cmdlist.h | 3 ++ level_zero/core/source/cmdlist/cmdlist_hw.inl | 33 ++++--------------- .../command_queue/command_queue_hw_base.inl | 2 +- .../command_queue/enqueue_copy_buffer.h | 2 +- .../command_queue/enqueue_copy_buffer_rect.h | 2 +- .../enqueue_copy_buffer_to_image.h | 2 +- .../command_queue/enqueue_fill_buffer.h | 2 +- .../command_queue/enqueue_read_buffer.h | 2 +- .../command_queue/enqueue_read_buffer_rect.h | 2 +- .../source/command_queue/enqueue_read_image.h | 2 +- opencl/source/command_queue/enqueue_svm.h | 6 ++-- .../command_queue/enqueue_write_buffer.h | 2 +- .../command_queue/enqueue_write_buffer_rect.h | 2 +- .../command_queue/enqueue_write_image.h | 2 +- .../command_queue_hw_1_tests.cpp | 24 +++++++++++++- 15 files changed, 46 insertions(+), 42 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index d56e2ce3a3..d7c189a268 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -547,6 +547,9 @@ struct CommandList : _ze_command_list_handle_t { bool isNonDualStreamCopyOffloadOperation(bool offloadOperation) const { return offloadOperation && !isDualStreamCopyOffloadOperation(offloadOperation); } void registerWalkerWithProfilingEnqueued(Event *event); + bool forceStateless(size_t size) const { + return (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) || this->isStatelessBuiltinsEnabled() || size >= 4ull * MemoryConstants::gigaByte; + } std::map hostPtrMap; NEO::PrivateAllocsToReuseContainer ownedPrivateAllocations; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 156df587d2..4b5f3213f0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -865,11 +865,7 @@ ze_result_t CommandListCoreFamily::appendImageCopyFromMemoryExt(z return status; } - bool isStateless = (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) || this->isStatelessBuiltinsEnabled(); - if (bufferSize >= 4ull * MemoryConstants::gigaByte) { - isStateless = true; - } - + const auto isStateless = this->forceStateless(bufferSize); bool isHeaplessEnabled = this->heaplessModeEnabled; ImageBuiltin builtInType = ImageBuiltin::copyBufferToImage3dBytes; @@ -1072,11 +1068,7 @@ ze_result_t CommandListCoreFamily::appendImageCopyToMemoryExt(voi return status; } - bool isStateless = (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) || this->isStatelessBuiltinsEnabled(); - if (bufferSize >= 4ull * MemoryConstants::gigaByte) { - isStateless = true; - } - + const auto isStateless = this->forceStateless(bufferSize); bool isHeaplessEnabled = this->heaplessModeEnabled; ImageBuiltin builtInType = ImageBuiltin::copyBufferToImage3dBytes; @@ -1743,10 +1735,7 @@ ze_result_t CommandListCoreFamily::appendPageFaultCopy(NEO::Graph size_t middleElSize = sizeof(uint32_t) * 4; uintptr_t rightSize = size % middleElSize; - bool isStateless = (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) || this->isStatelessBuiltinsEnabled(); - if (size >= 4ull * MemoryConstants::gigaByte) { - isStateless = true; - } + const auto isStateless = this->forceStateless(size); uintptr_t dstAddress = static_cast(dstAllocation->getGpuAddress()); uintptr_t srcAddress = static_cast(srcAllocation->getGpuAddress()); @@ -1879,10 +1868,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, uintptr_t leftSize = 0; uintptr_t rightSize = 0; uintptr_t middleSizeBytes = 0; - bool isStateless = (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) || this->isStatelessBuiltinsEnabled(); - if (size >= 4ull * MemoryConstants::gigaByte) { - isStateless = true; - } + const auto isStateless = this->forceStateless(size); const bool isHeapless = this->isHeaplessModeEnabled(); @@ -2137,10 +2123,7 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d const bool isCopyOnlyEnabled = isCopyOnly(memoryCopyParams.copyOffloadAllowed); const bool inOrderCopyOnlySignalingAllowed = this->isInOrderExecutionEnabled() && !memoryCopyParams.forceDisableCopyOnlyInOrderSignaling && isCopyOnlyEnabled; - bool isStateless = (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) || this->isStatelessBuiltinsEnabled(); - if ((srcSize >= 4ull * MemoryConstants::gigaByte) || (dstSize >= 4ull * MemoryConstants::gigaByte)) { - isStateless = true; - } + const auto isStateless = this->forceStateless(std::max(srcSize, dstSize)); ze_result_t result = ZE_RESULT_SUCCESS; if (isCopyOnlyEnabled) { @@ -2452,11 +2435,7 @@ ze_result_t CommandListCoreFamily::appendMemoryFill(void *ptr, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, CmdListMemoryCopyParams &memoryCopyParams) { - bool isStateless = (this->cmdListHeapAddressModel == NEO::HeapAddressModel::globalStateless) || this->isStatelessBuiltinsEnabled(); - if (size >= 4ull * MemoryConstants::gigaByte) { - isStateless = true; - } - + const auto isStateless = this->forceStateless(size); const bool isHeapless = this->isHeaplessModeEnabled(); memoryCopyParams.copyOffloadAllowed = isCopyOffloadEnabled() && (patternSize <= this->maxFillPatternSizeForCopyEngine); diff --git a/opencl/source/command_queue/command_queue_hw_base.inl b/opencl/source/command_queue/command_queue_hw_base.inl index 66e0a730d5..2ada8fc977 100644 --- a/opencl/source/command_queue/command_queue_hw_base.inl +++ b/opencl/source/command_queue/command_queue_hw_base.inl @@ -129,7 +129,7 @@ void CommandQueueHw::dispatchAuxTranslationBuiltin(MultiDispatchInfo &mu template bool CommandQueueHw::forceStateless(size_t size) { - return size >= 4ull * MemoryConstants::gigaByte; + return isForceStateless || size >= 4ull * MemoryConstants::gigaByte; } template diff --git a/opencl/source/command_queue/enqueue_copy_buffer.h b/opencl/source/command_queue/enqueue_copy_buffer.h index 4495e83788..2dd61d7955 100644 --- a/opencl/source/command_queue/enqueue_copy_buffer.h +++ b/opencl/source/command_queue/enqueue_copy_buffer.h @@ -29,7 +29,7 @@ cl_int CommandQueueHw::enqueueCopyBuffer( CsrSelectionArgs csrSelectionArgs{cmdType, srcBuffer, dstBuffer, device->getRootDeviceIndex(), &size}; CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs); - const bool isStateless = isForceStateless || forceStateless(std::max(srcBuffer->getSize(), dstBuffer->getSize())); + const bool isStateless = forceStateless(std::max(srcBuffer->getSize(), dstBuffer->getSize())); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_copy_buffer_rect.h b/opencl/source/command_queue/enqueue_copy_buffer_rect.h index 430998e6bf..7055197e82 100644 --- a/opencl/source/command_queue/enqueue_copy_buffer_rect.h +++ b/opencl/source/command_queue/enqueue_copy_buffer_rect.h @@ -34,7 +34,7 @@ cl_int CommandQueueHw::enqueueCopyBufferRect( CsrSelectionArgs csrSelectionArgs{cmdType, srcBuffer, dstBuffer, device->getRootDeviceIndex(), region}; CommandStreamReceiver &csr = selectCsrForBuiltinOperation(csrSelectionArgs); - const bool isStateless = isForceStateless || forceStateless(std::max(srcBuffer->getSize(), dstBuffer->getSize())); + const bool isStateless = forceStateless(std::max(srcBuffer->getSize(), dstBuffer->getSize())); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_copy_buffer_to_image.h b/opencl/source/command_queue/enqueue_copy_buffer_to_image.h index f91390353a..8d884e9da8 100644 --- a/opencl/source/command_queue/enqueue_copy_buffer_to_image.h +++ b/opencl/source/command_queue/enqueue_copy_buffer_to_image.h @@ -25,7 +25,7 @@ cl_int CommandQueueHw::enqueueCopyBufferToImage( const cl_event *eventWaitList, cl_event *event) { - const bool isStateless = isForceStateless || forceStateless(srcBuffer->getSize()); + const bool isStateless = forceStateless(srcBuffer->getSize()); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, this->heaplessModeEnabled); auto &builder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(builtInType, diff --git a/opencl/source/command_queue/enqueue_fill_buffer.h b/opencl/source/command_queue/enqueue_fill_buffer.h index 0129c4b5de..b057368539 100644 --- a/opencl/source/command_queue/enqueue_fill_buffer.h +++ b/opencl/source/command_queue/enqueue_fill_buffer.h @@ -52,7 +52,7 @@ cl_int CommandQueueHw::enqueueFillBuffer( memcpy_s(patternAllocation->getUnderlyingBuffer(), patternSize, pattern, patternSize); } - const bool isStateless = isForceStateless || forceStateless(buffer->getSize()); + const bool isStateless = forceStateless(buffer->getSize()); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_read_buffer.h b/opencl/source/command_queue/enqueue_read_buffer.h index 7ae7d7f212..e846bacf2d 100644 --- a/opencl/source/command_queue/enqueue_read_buffer.h +++ b/opencl/source/command_queue/enqueue_read_buffer.h @@ -87,7 +87,7 @@ cl_int CommandQueueHw::enqueueReadBufferImpl( numEventsInWaitList, eventWaitList, event); } - const bool isStateless = isForceStateless || forceStateless(buffer->getSize()); + const bool isStateless = forceStateless(buffer->getSize()); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_read_buffer_rect.h b/opencl/source/command_queue/enqueue_read_buffer_rect.h index 13146132e8..4545a94320 100644 --- a/opencl/source/command_queue/enqueue_read_buffer_rect.h +++ b/opencl/source/command_queue/enqueue_read_buffer_rect.h @@ -54,7 +54,7 @@ cl_int CommandQueueHw::enqueueReadBufferRect( bool isCpuCopyAllowed = false; getContext().tryGetExistingHostPtrAllocation(ptr, hostPtrSize, rootDeviceIndex, mapAllocation, memoryType, isCpuCopyAllowed); - const bool isStateless = isForceStateless || forceStateless(buffer->getSize()); + const bool isStateless = forceStateless(buffer->getSize()); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_read_image.h b/opencl/source/command_queue/enqueue_read_image.h index fec5201358..6b0e79aa2e 100644 --- a/opencl/source/command_queue/enqueue_read_image.h +++ b/opencl/source/command_queue/enqueue_read_image.h @@ -145,7 +145,7 @@ cl_int CommandQueueHw::enqueueReadImageImpl( dc.bcsSplit = bcsSplit; dc.direction = csrSelectionArgs.direction; - const bool isStateless = isForceStateless || forceStateless(srcImage->getSize()); + const bool isStateless = forceStateless(srcImage->getSize()); const bool useHeapless = this->getHeaplessModeEnabled(); auto eBuiltInOps = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); MultiDispatchInfo dispatchInfo(dc); diff --git a/opencl/source/command_queue/enqueue_svm.h b/opencl/source/command_queue/enqueue_svm.h index 44798b2a09..91d9c044d9 100644 --- a/opencl/source/command_queue/enqueue_svm.h +++ b/opencl/source/command_queue/enqueue_svm.h @@ -128,7 +128,7 @@ cl_int CommandQueueHw::enqueueSVMMap(cl_bool blockingMap, dc.direction = csrSelectionArgs.direction; MultiDispatchInfo dispatchInfo(dc); - const bool isStateless = isForceStateless || forceStateless(svmData->size); + const bool isStateless = forceStateless(svmData->size); const bool useHeapless = this->getHeaplessModeEnabled(); auto eBuiltInOps = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); const auto dispatchResult = dispatchBcsOrGpgpuEnqueue(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, blocking, csr); @@ -219,7 +219,7 @@ cl_int CommandQueueHw::enqueueSVMUnmap(void *svmPtr, dc.direction = csrSelectionArgs.direction; MultiDispatchInfo dispatchInfo(dc); - const bool isStateless = isForceStateless || forceStateless(svmData->size); + const bool isStateless = forceStateless(svmData->size); const bool useHeapless = this->getHeaplessModeEnabled(); auto eBuiltInOps = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); const auto dispatchResult = dispatchBcsOrGpgpuEnqueue(dispatchInfo, surfaces, eBuiltInOps, numEventsInWaitList, eventWaitList, event, false, csr); @@ -518,7 +518,7 @@ cl_int CommandQueueHw::enqueueSVMMemFill(void *svmPtr, memcpy_s(patternAllocation->getUnderlyingBuffer(), patternSize, pattern, patternSize); } - const bool isStateless = isForceStateless || forceStateless(svmData->size); + const bool isStateless = forceStateless(svmData->size); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_write_buffer.h b/opencl/source/command_queue/enqueue_write_buffer.h index 37a404a38a..d67324393f 100644 --- a/opencl/source/command_queue/enqueue_write_buffer.h +++ b/opencl/source/command_queue/enqueue_write_buffer.h @@ -80,7 +80,7 @@ cl_int CommandQueueHw::enqueueWriteBufferImpl( numEventsInWaitList, eventWaitList, event); } - const bool isStateless = isForceStateless || forceStateless(buffer->getSize()); + const bool isStateless = forceStateless(buffer->getSize()); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_write_buffer_rect.h b/opencl/source/command_queue/enqueue_write_buffer_rect.h index 56fd819a75..d9e3066ee9 100644 --- a/opencl/source/command_queue/enqueue_write_buffer_rect.h +++ b/opencl/source/command_queue/enqueue_write_buffer_rect.h @@ -58,7 +58,7 @@ cl_int CommandQueueHw::enqueueWriteBufferRect( bool isCpuCopyAllowed = false; getContext().tryGetExistingHostPtrAllocation(ptr, hostPtrSize, rootDeviceIndex, mapAllocation, memoryType, isCpuCopyAllowed); - const bool isStateless = isForceStateless || forceStateless(buffer->getSize()); + const bool isStateless = forceStateless(buffer->getSize()); const bool useHeapless = this->getHeaplessModeEnabled(); auto builtInType = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); diff --git a/opencl/source/command_queue/enqueue_write_image.h b/opencl/source/command_queue/enqueue_write_image.h index 2fc445f23d..a0ecc49554 100644 --- a/opencl/source/command_queue/enqueue_write_image.h +++ b/opencl/source/command_queue/enqueue_write_image.h @@ -134,7 +134,7 @@ cl_int CommandQueueHw::enqueueWriteImageImpl( dc.bcsSplit = bcsSplit; dc.direction = csrSelectionArgs.direction; - const bool isStateless = isForceStateless || forceStateless(dstImage->getSize()); + const bool isStateless = forceStateless(dstImage->getSize()); const bool useHeapless = getHeaplessModeEnabled(); auto eBuiltInOps = EBuiltInOps::adjustBuiltinType(isStateless, useHeapless); MultiDispatchInfo dispatchInfo(dc); diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp index 9e670cadfb..d80f75a43e 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_1_tests.cpp @@ -1242,7 +1242,7 @@ HWTEST_F(CommandQueueHwTest, givenKernelSplitEnqueueReadBufferWhenBlockedThenEnq pCmdQ->isQueueBlocked(); } -HWTEST_F(CommandQueueHwTest, givenSizeWhenForceStatelessIsCalledThenCorrectValueIsReturned) { +HWTEST_F(CommandQueueHwTest, givenSizeAndIsForceStatelessDisabledWhenForceStatelessIsCalledThenCorrectValueIsReturned) { if (is32bit) { GTEST_SKIP(); @@ -1250,9 +1250,11 @@ HWTEST_F(CommandQueueHwTest, givenSizeWhenForceStatelessIsCalledThenCorrectValue struct MockCommandQueueHw : public CommandQueueHw { using CommandQueueHw::forceStateless; + using CommandQueueHw::isForceStateless; }; MockCommandQueueHw *pCmdQHw = reinterpret_cast(pCmdQ); + pCmdQHw->isForceStateless = false; uint64_t bigSize = 4ull * MemoryConstants::gigaByte; EXPECT_TRUE(pCmdQHw->forceStateless(static_cast(bigSize))); @@ -1260,6 +1262,26 @@ HWTEST_F(CommandQueueHwTest, givenSizeWhenForceStatelessIsCalledThenCorrectValue EXPECT_FALSE(pCmdQHw->forceStateless(static_cast(smallSize))); } +HWTEST_F(CommandQueueHwTest, givenSizeAndIsForceStatelessEnabledWhenForceStatelessIsCalledThenCorrectValueIsReturned) { + + if (is32bit) { + GTEST_SKIP(); + } + + struct MockCommandQueueHw : public CommandQueueHw { + using CommandQueueHw::forceStateless; + using CommandQueueHw::isForceStateless; + }; + + MockCommandQueueHw *pCmdQHw = reinterpret_cast(pCmdQ); + pCmdQHw->isForceStateless = true; + uint64_t bigSize = 4ull * MemoryConstants::gigaByte; + EXPECT_TRUE(pCmdQHw->forceStateless(static_cast(bigSize))); + + uint64_t smallSize = bigSize - 1; + EXPECT_TRUE(pCmdQHw->forceStateless(static_cast(smallSize))); +} + HWTEST_F(CommandQueueHwTest, givenFlushWhenFlushBatchedSubmissionsFailsThenErrorIsRetured) { MockCommandQueueHwWithOverwrittenCsr cmdQueue(context, pClDevice, nullptr, false); MockCommandStreamReceiverWithFailingFlushBatchedSubmission csr(*pDevice->executionEnvironment, 0, pDevice->getDeviceBitfield());