diff --git a/level_zero/core/source/cmdqueue/cmdqueue.cpp b/level_zero/core/source/cmdqueue/cmdqueue.cpp index 3251726878..a3aac04593 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue.cpp +++ b/level_zero/core/source/cmdqueue/cmdqueue.cpp @@ -24,11 +24,16 @@ namespace L0 { CommandQueueAllocatorFn commandQueueFactory[IGFX_MAX_PRODUCT] = {}; CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc) - : device(device), csr(csr), desc(*desc) { + : desc(*desc), device(device), csr(csr) { int overrideCmdQueueSyncMode = NEO::DebugManager.flags.OverrideCmdQueueSynchronousMode.get(); if (overrideCmdQueueSyncMode != -1) { this->desc.mode = static_cast(overrideCmdQueueSyncMode); } + + int overrideUseKmdWaitFunction = NEO::DebugManager.flags.OverrideUseKmdWaitFunction.get(); + if (overrideUseKmdWaitFunction != -1) { + useKmdWaitFunction = !!(overrideUseKmdWaitFunction); + } } ze_result_t CommandQueueImp::destroy() { @@ -79,9 +84,10 @@ void CommandQueueImp::submitBatchBuffer(size_t offset, NEO::ResidencyContainer & } ze_result_t CommandQueueImp::synchronize(uint64_t timeout) { - if (timeout == std::numeric_limits::max()) { + if ((timeout == std::numeric_limits::max()) && useKmdWaitFunction) { auto &waitPair = buffers.getCurrentFlushStamp(); csr->waitForTaskCountWithKmdNotifyFallback(waitPair.first, waitPair.second, false, false); + postSyncOperations(); return ZE_RESULT_SUCCESS; } else { return synchronizeByPollingForTaskCount(timeout); @@ -101,11 +107,7 @@ ze_result_t CommandQueueImp::synchronizeByPollingForTaskCount(uint64_t timeout) return ZE_RESULT_NOT_READY; } - printFunctionsPrintfOutput(); - - if (NEO::Debugger::isDebugEnabled(internalUsage) && device->getL0Debugger() && NEO::DebugManager.flags.DebuggerLogBitmask.get()) { - device->getL0Debugger()->printTrackedAddresses(csr->getOsContext().getContextId()); - } + postSyncOperations(); return ZE_RESULT_SUCCESS; } @@ -118,6 +120,14 @@ void CommandQueueImp::printFunctionsPrintfOutput() { this->printfFunctionContainer.clear(); } +void CommandQueueImp::postSyncOperations() { + printFunctionsPrintfOutput(); + + if (NEO::Debugger::isDebugEnabled(internalUsage) && device->getL0Debugger() && NEO::DebugManager.flags.DebuggerLogBitmask.get()) { + device->getL0Debugger()->printTrackedAddresses(csr->getOsContext().getContextId()); + } +} + CommandQueue *CommandQueue::create(uint32_t productFamily, Device *device, NEO::CommandStreamReceiver *csr, const ze_command_queue_desc_t *desc, bool isCopyOnly, bool isInternal, ze_result_t &returnValue) { CommandQueueAllocatorFn allocator = nullptr; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_imp.h b/level_zero/core/source/cmdqueue/cmdqueue_imp.h index f866a76724..f9f511426b 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_imp.h +++ b/level_zero/core/source/cmdqueue/cmdqueue_imp.h @@ -89,15 +89,21 @@ struct CommandQueueImp : public CommandQueue { void printFunctionsPrintfOutput(); - Device *device = nullptr; - NEO::CommandStreamReceiver *csr = nullptr; - ze_command_queue_desc_t desc; - NEO::LinearStream *commandStream = nullptr; - std::atomic taskCount{0}; - std::vector printfFunctionContainer; - bool gpgpuEnabled = false; + void postSyncOperations(); + CommandBufferManager buffers; NEO::HeapContainer heapContainer; + ze_command_queue_desc_t desc; + std::vector printfFunctionContainer; + + Device *device = nullptr; + NEO::CommandStreamReceiver *csr = nullptr; + NEO::LinearStream *commandStream = nullptr; + + std::atomic taskCount{0}; + + bool gpgpuEnabled = false; + bool useKmdWaitFunction = false; }; } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp index 6ba0ec7bdf..1c23d94388 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue.cpp @@ -1397,37 +1397,77 @@ HWTEST2_F(ExecuteCommandListTests, givenTwoCommandQueuesHavingTwoB2BCommandLists using CommandQueueSynchronizeTest = Test; +template +struct SynchronizeCsr : public NEO::UltCommandStreamReceiver { + ~SynchronizeCsr() override { + delete tagAddress; + } + + SynchronizeCsr(const NEO::ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield) + : NEO::UltCommandStreamReceiver(const_cast(executionEnvironment), 0, deviceBitfield) { + tagAddress = new uint32_t; + } + + bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { + waitForComplitionCalledTimes++; + return true; + } + + void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { + waitForTaskCountWithKmdNotifyFallbackCalled++; + NEO::UltCommandStreamReceiver::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, quickKmdSleep, forcePowerSavingMode); + } + + volatile uint32_t *getTagAddress() const override { + return tagAddress; + } + + uint32_t *tagAddress; + uint32_t waitForComplitionCalledTimes = 0; + uint32_t waitForTaskCountWithKmdNotifyFallbackCalled = 0; +}; + HWTEST_F(CommandQueueSynchronizeTest, givenCallToSynchronizeThenCorrectEnableTimeoutAndTimeoutValuesAreUsed) { - struct SynchronizeCsr : public NEO::UltCommandStreamReceiver { - ~SynchronizeCsr() override { - delete tagAddress; - } + auto csr = std::unique_ptr>(new SynchronizeCsr(*device->getNEODevice()->getExecutionEnvironment(), + device->getNEODevice()->getDeviceBitfield())); - SynchronizeCsr(const NEO::ExecutionEnvironment &executionEnvironment, const DeviceBitfield deviceBitfield) - : NEO::UltCommandStreamReceiver(const_cast(executionEnvironment), 0, deviceBitfield) { - tagAddress = new uint32_t; - } + ze_command_queue_desc_t desc = {}; + ze_command_queue_handle_t commandQueue = {}; + ze_result_t res = context->createCommandQueue(device, &desc, &commandQueue); + EXPECT_EQ(ZE_RESULT_SUCCESS, res); + EXPECT_NE(nullptr, commandQueue); - bool waitForCompletionWithTimeout(bool enableTimeout, int64_t timeoutMs, uint32_t taskCountToWait) override { - waitForComplitionCalledTimes++; - return true; - } + CommandQueue *queue = reinterpret_cast(L0::CommandQueue::fromHandle(commandQueue)); + queue->csr = csr.get(); - void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { - waitForTaskCountWithKmdNotifyFallbackCalled++; - NEO::UltCommandStreamReceiver::waitForTaskCountWithKmdNotifyFallback(taskCountToWait, flushStampToWait, quickKmdSleep, forcePowerSavingMode); - } + uint64_t timeout = 10; + bool enableTimeoutExpected = true; + int64_t timeoutMicrosecondsExpected = timeout; - volatile uint32_t *getTagAddress() const override { - return tagAddress; - } - uint32_t *tagAddress; - uint32_t waitForComplitionCalledTimes = 0; - uint32_t waitForTaskCountWithKmdNotifyFallbackCalled = 0; - }; + queue->synchronize(timeout); + + EXPECT_EQ(1u, csr->waitForComplitionCalledTimes); + EXPECT_EQ(0u, csr->waitForTaskCountWithKmdNotifyFallbackCalled); + + timeout = std::numeric_limits::max(); + enableTimeoutExpected = false; + timeoutMicrosecondsExpected = NEO::TimeoutControls::maxTimeout; + + queue->synchronize(timeout); + + EXPECT_EQ(2u, csr->waitForComplitionCalledTimes); + EXPECT_EQ(0u, csr->waitForTaskCountWithKmdNotifyFallbackCalled); + + L0::CommandQueue::fromHandle(commandQueue)->destroy(); +} + +HWTEST_F(CommandQueueSynchronizeTest, givenDebugOverrideEnabledWhenCallToSynchronizeThenCorrectEnableTimeoutAndTimeoutValuesAreUsed) { + DebugManagerStateRestore restore; + NEO::DebugManager.flags.OverrideUseKmdWaitFunction.set(1); + + auto csr = std::unique_ptr>(new SynchronizeCsr(*device->getNEODevice()->getExecutionEnvironment(), + device->getNEODevice()->getDeviceBitfield())); - auto csr = std::unique_ptr(new SynchronizeCsr(*device->getNEODevice()->getExecutionEnvironment(), - device->getNEODevice()->getDeviceBitfield())); ze_command_queue_desc_t desc = {}; ze_command_queue_handle_t commandQueue = {}; ze_result_t res = context->createCommandQueue(device, &desc, &commandQueue); diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index 64dfb12a99..7a0f4ea2a5 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -304,6 +304,7 @@ EnableUserFenceUseCtxId = -1 EnableResourceTags = 0 SetKmdWaitTimeout = -1 OverrideNotifyEnableForTagUpdatePostSync = -1 +OverrideUseKmdWaitFunction = -1 EnableCacheFlushAfterWalkerForAllQueues = -1 Force32BitDriverSupport = -1 OverrideCmdQueueSynchronousMode = -1 diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 46b7de91d5..5301703b9b 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -69,6 +69,7 @@ DECLARE_DEBUG_VARIABLE(bool, GlobalSequencerFlushOnCopyEngine, false, "false: di DECLARE_DEBUG_VARIABLE(bool, UseImmDataWriteModeOnPostSyncOperation, false, "Use IMM data write mode as post sync operation in Compute Walker") DECLARE_DEBUG_VARIABLE(bool, DisableTimestampEvents, false, "Timestamp info will not be reported and events will only perform regular synchronization functions") DECLARE_DEBUG_VARIABLE(bool, EnableResourceTags, false, "Enable resource tagging in GMM") +DECLARE_DEBUG_VARIABLE(bool, EnableFlushTaskSubmission, false, "true: driver uses csr flushTask for immediate submissions, false: driver uses legacy executeCommandList path") DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "DeviceId selected for testing") DECLARE_DEBUG_VARIABLE(std::string, LoadBinarySipFromFile, std::string("unk"), "Select binary file to load SIP kernel raw binary") DECLARE_DEBUG_VARIABLE(int64_t, OverrideMultiStoragePlacement, -1, "-1: disable, 0+: tile mask, each bit corresponds to tile") @@ -152,7 +153,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, OverrideTimestampPacketSize, -1, "-1: default, > DECLARE_DEBUG_VARIABLE(int32_t, OverrideMaxWorkGroupCount, -1, "-1: default, >0: Max WG size") DECLARE_DEBUG_VARIABLE(int32_t, OverrideCmdQueueSynchronousMode, -1, "Overrides all command queues synchronous mode: -1: do not override, 0: implicit driver behavior, 1: synchronous, 2: asynchronous") DECLARE_DEBUG_VARIABLE(int64_t, EnableStatelessCompression, -1, "-1: default, 0: disable, 1: Enable E2EC in SBA for all stateless accesses") -DECLARE_DEBUG_VARIABLE(bool, EnableFlushTaskSubmission, false, "true: driver uses csr flushTask for immediate submissions, false: driver uses legacy executeCommandList path") /*LOGGING FLAGS*/ DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level") @@ -212,6 +212,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForNewResource, -1, "-1: pla DECLARE_DEBUG_VARIABLE(int32_t, PerformImplicitFlushForIdleGpu, -1, "-1: platform specific, 0: force disable, 1: force enable") DECLARE_DEBUG_VARIABLE(int32_t, EnableCacheFlushAfterWalkerForAllQueues, -1, "Enable cache flush after walker even if queue doesn't require it") DECLARE_DEBUG_VARIABLE(int32_t, OverrideKernelSizeLimitForSmallDispatch, -1, "-1: default, >=0: on XEHP+ changes the threshold for treating kernel as small during NULL LWS selection") +DECLARE_DEBUG_VARIABLE(int32_t, OverrideUseKmdWaitFunction, -1, "-1: default (L0: disabled), 0: disabled, 1: enabled. It uses only busy loop to wait or busy loop with KMD wait function, when KMD fallback is enabled") /*DIRECT SUBMISSION FLAGS*/ DECLARE_DEBUG_VARIABLE(int32_t, EnableDirectSubmission, -1, "-1: default (disabled), 0: disable, 1:enable. Enables direct submission of command buffers bypassing KMD") @@ -228,8 +229,8 @@ DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideRenderSupport, -1, "Over DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionOverrideComputeSupport, -1, "Overrides default compute support: -1: do not override, 0: disable engine support, 1: enable engine support with init start, 2: enable engine support without init start") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableCacheFlush, -1, "-1: driver default, 0: additional cache flush is present 1: disable dispatching cache flush commands") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionNewResourceTlbFlush, -1, "-1: driver default - flush when new resource is bound, 0: disabled, 1: enabled") -DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU") DECLARE_DEBUG_VARIABLE(int32_t, DirectSubmissionDisableMonitorFence, -1, "Disable dispatching monitor fence commands") +DECLARE_DEBUG_VARIABLE(bool, USMEvictAfterMigration, true, "Evict USM allocation after implicit migration to GPU") /*FEATURE FLAGS*/ DECLARE_DEBUG_VARIABLE(bool, EnableNV12, true, "Enables NV12 extension")