From 83ebbb01d30b3320e127b7c3ab7182c87f52db64 Mon Sep 17 00:00:00 2001
From: Lukasz Jobczyk
Date: Tue, 24 Dec 2024 10:50:33 +0000
Subject: [PATCH] performance: Add flag to mitigate host visible signal in CB events

Related-To: NEO-13441

Signed-off-by: Lukasz Jobczyk
---
Review notes (free-form area, not part of the commit message):
- zexCounterBasedEventCreate2: the created event is now null-checked before
  setMitigateHostVisibleSignal() is called; createStandaloneEvent() reports
  failure through `result`, so the returned pointer is not assumed valid.
- Template arguments that had been stripped from the test hunk and from two
  hunk headers were restored.
- Blob ids on the `index` lines are left as generated.

 .../driver_experimental/public/zex_event.cpp  | 16 +++++++--
 level_zero/core/source/event/event.h          |  5 +++
 level_zero/core/source/event/event_impl.inl   | 10 ++++++
 .../cmdlist/test_in_order_cmdlist_1.cpp       | 34 +++++++++++++++++++
 .../debug_settings/debug_variables_base.inl   |  1 +
 shared/source/helpers/in_order_cmd_helpers.h  |  2 +-
 .../libult/ult_command_stream_receiver.h      |  2 ++
 shared/test/common/test_files/igdrcl.config   |  1 +
 8 files changed, 68 insertions(+), 3 deletions(-)

diff --git a/level_zero/api/driver_experimental/public/zex_event.cpp b/level_zero/api/driver_experimental/public/zex_event.cpp
index ac52fb1a74..7f91b8572b 100644
--- a/level_zero/api/driver_experimental/public/zex_event.cpp
+++ b/level_zero/api/driver_experimental/public/zex_event.cpp
@@ -66,6 +66,12 @@ zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDe
         return ZE_RESULT_ERROR_INVALID_ARGUMENT;
     }
 
+    auto signalScope = desc->signalScope;
+
+    if (NEO::debugManager.flags.MitigateHostVisibleSignal.get()) {
+        signalScope &= ~ZE_EVENT_SCOPE_FLAG_HOST;
+    }
+
     EventDescriptor eventDescriptor = {
         nullptr,                           // eventPoolAllocation
         desc->pNext,                       // extensions
@@ -74,7 +80,7 @@ zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDe
         1,                                 // maxPacketsCount
         inputCbFlags,                      // counterBasedFlags
         0,                                 // index
-        desc->signalScope,                 // signalScope
+        signalScope,                       // signalScope
         desc->waitScope,                   // waitScope
         timestampFlag,                     // timestampPool
         mappedTimestampFlag,               // kerneMappedTsPoolFlag
@@ -84,7 +90,13 @@ zexCounterBasedEventCreate2(ze_context_handle_t hContext, ze_device_handle_t hDe
 
     ze_result_t result = ZE_RESULT_SUCCESS;
 
-    *phEvent = device->getL0GfxCoreHelper().createStandaloneEvent(eventDescriptor, device, result);
+    auto l0Event = device->getL0GfxCoreHelper().createStandaloneEvent(eventDescriptor, device, result);
+
+    if (l0Event && (signalScope != desc->signalScope)) { // host scope was stripped above; skip when creation failed
+        l0Event->setMitigateHostVisibleSignal();
+    }
+
+    *phEvent = l0Event;
 
     return result;
 }
diff --git a/level_zero/core/source/event/event.h b/level_zero/core/source/event/event.h
index 295dcea3e4..ddbb2aaf69 100644
--- a/level_zero/core/source/event/event.h
+++ b/level_zero/core/source/event/event.h
@@ -327,6 +327,10 @@ struct Event : _ze_event_handle_t {
     bool isIpcImported() const {
         return isFromIpcPool;
     }
+    void setMitigateHostVisibleSignal() {
+        this->mitigateHostVisibleSignal = true;
+    }
+
     virtual ze_result_t hostEventSetValue(State eventState) = 0;
 
   protected:
@@ -398,6 +402,7 @@ struct Event : _ze_event_handle_t {
     bool kmdWaitMode = false;
     bool interruptMode = false;
     bool isSharableCouterBased = false;
+    bool mitigateHostVisibleSignal = false;
     uint64_t timestampRefreshIntervalInNanoSec = 0;
 };
 
diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl
index b71e988625..c63f04b23f 100644
--- a/level_zero/core/source/event/event_impl.inl
+++ b/level_zero/core/source/event/event_impl.inl
@@ -664,6 +664,13 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
         timeout = NEO::debugManager.flags.OverrideEventSynchronizeTimeout.get();
     }
 
+    TaskCountType taskCountToWaitForL3Flush = 0;
+    if (this->mitigateHostVisibleSignal && this->device->getProductHelper().isDcFlushAllowed()) {
+        auto lock = this->csrs[0]->obtainUniqueOwnership();
+        this->csrs[0]->flushTagUpdate();
+        taskCountToWaitForL3Flush = this->csrs[0]->peekLatestFlushedTaskCount();
+    }
+
     waitStartTime = std::chrono::high_resolution_clock::now();
     lastHangCheckTime = waitStartTime;
 
@@ -693,6 +700,9 @@ ze_result_t EventImp<TagSizeT>::hostSynchronize(uint64_t timeout) {
                     return ZE_RESULT_ERROR_DEVICE_LOST;
                 }
             }
+            if (taskCountToWaitForL3Flush) {
+                this->csrs[0]->waitForTaskCount(taskCountToWaitForL3Flush);
+            }
             return ret;
         }
 
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp
index 312be6c139..be9a90eecf 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp
@@ -5213,6 +5213,40 @@ HWTEST2_F(InOrderCmdListTests, givenStandaloneEventWhenCallingSynchronizeThenRet
     context->freeMem(hostAddress);
 }
 
+HWTEST2_F(InOrderCmdListTests, givenMitigateHostVisibleSignalWhenCallingSynchronizeOnCbEventThenFlushDcIfSupported, MatchAny) {
+    DebugManagerStateRestore restorer;
+    NEO::debugManager.flags.MitigateHostVisibleSignal.set(true);
+
+    auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
+
+    uint64_t counterValue = 2;
+    auto hostAddress = reinterpret_cast<uint64_t *>(allocHostMem(sizeof(uint64_t)));
+    *hostAddress = counterValue;
+    uint64_t *gpuAddress = ptrOffset(&counterValue, 64);
+    ze_event_desc_t eventDesc = {};
+    eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+    ze_event_handle_t handle = nullptr;
+
+    EXPECT_EQ(ZE_RESULT_SUCCESS, zexCounterBasedEventCreate(context, device, gpuAddress, hostAddress, counterValue, &eventDesc, &handle));
+    auto eventObj = Event::fromHandle(handle);
+
+    EXPECT_FALSE(ultCsr->waitForTaskCountCalled);
+    EXPECT_FALSE(ultCsr->flushTagUpdateCalled);
+
+    EXPECT_EQ(ZE_RESULT_SUCCESS, eventObj->hostSynchronize(-1));
+
+    if (device->getProductHelper().isDcFlushAllowed()) {
+        EXPECT_TRUE(ultCsr->waitForTaskCountCalled);
+        EXPECT_TRUE(ultCsr->flushTagUpdateCalled);
+    } else {
+        EXPECT_FALSE(ultCsr->waitForTaskCountCalled);
+        EXPECT_FALSE(ultCsr->flushTagUpdateCalled);
+    }
+
+    zeEventDestroy(handle);
+    context->freeMem(hostAddress);
+}
+
 HWTEST2_F(InOrderCmdListTests, givenStandaloneCbEventWhenPassingExternalInterruptIdThenAssign, MatchAny) {
     zex_intel_event_sync_mode_exp_desc_t syncModeDesc = {ZEX_INTEL_STRUCTURE_TYPE_EVENT_SYNC_MODE_EXP_DESC}; // NOLINT(clang-analyzer-optin.core.EnumCastOutOfRange), NEO-12901
     syncModeDesc.externalInterruptId = 123;
diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl
index 01ee812480..9268547117 100644
--- a/shared/source/debug_settings/debug_variables_base.inl
+++ b/shared/source/debug_settings/debug_variables_base.inl
@@ -376,6 +376,7 @@ DECLARE_DEBUG_VARIABLE(bool, UseNoRingFlushesKmdMode, true, "Windows only, passe
 DECLARE_DEBUG_VARIABLE(bool, DisableZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will not share memory with CPU.")
 DECLARE_DEBUG_VARIABLE(bool, ForceNonCoherentModeForTimestamps, false, "When active timestamp buffers are allocated in non coherent memory.")
 DECLARE_DEBUG_VARIABLE(bool, SetAssumeNotInUse, true, "Set AssumeNotInUse flag in d3d destroy allocation.")
+DECLARE_DEBUG_VARIABLE(bool, MitigateHostVisibleSignal, false, "Reset host visible signal in CB events, flush L3 when synchronize")
 DECLARE_DEBUG_VARIABLE(bool, ForceZeroCopyForUseHostPtr, false, "When active all buffer allocations created with CL_MEM_USE_HOST_PTR flag will use share memory with CPU.")
 DECLARE_DEBUG_VARIABLE(int32_t, EnableReusingGpuTimestamps, -1, "Reuse GPU timestamp for next device time requests. -1: os-specific, 0: disable, 1: enable")
 DECLARE_DEBUG_VARIABLE(int32_t, AllowZeroCopyWithoutCoherency, -1, "Use cacheline flush instead of memory copy for map/unmap mem object")
diff --git a/shared/source/helpers/in_order_cmd_helpers.h b/shared/source/helpers/in_order_cmd_helpers.h
index 2cc9d7040d..00802cea2f 100644
--- a/shared/source/helpers/in_order_cmd_helpers.h
+++ b/shared/source/helpers/in_order_cmd_helpers.h
@@ -30,7 +30,7 @@ class DeviceAllocNodeType {
 
     static constexpr size_t defaultAllocatorTagCount = 128;
 
-    static constexpr AllocationType getAllocationType() { return deviceAlloc ? AllocationType::timestampPacketTagBuffer : NEO::AllocationType::bufferHostMemory; }
+    static constexpr AllocationType getAllocationType() { return deviceAlloc ? NEO::AllocationType::timestampPacketTagBuffer : NEO::AllocationType::bufferHostMemory; }
 
     static constexpr TagNodeType getTagNodeType() { return TagNodeType::counter64b; }
 
diff --git a/shared/test/common/libult/ult_command_stream_receiver.h b/shared/test/common/libult/ult_command_stream_receiver.h
index bfc7402c92..4cebbdf182 100644
--- a/shared/test/common/libult/ult_command_stream_receiver.h
+++ b/shared/test/common/libult/ult_command_stream_receiver.h
@@ -324,6 +324,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
     }
 
     WaitStatus waitForTaskCount(TaskCountType requiredTaskCount) override {
+        this->waitForTaskCountCalled = true;
         if (waitForTaskCountReturnValue.has_value()) {
             return *waitForTaskCountReturnValue;
         }
@@ -605,6 +606,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
     bool recordFlushedBatchBuffer = false;
     bool checkAndActivateAubSubCaptureCalled = false;
     bool addAubCommentCalled = false;
+    bool waitForTaskCountCalled = false;
     std::atomic_bool downloadAllocationCalled = false;
     std::atomic_bool downloadAllocationsCalled = false;
     bool flushBatchedSubmissionsCalled = false;
diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config
index 01b9ee997d..9c2a2bd495 100644
--- a/shared/test/common/test_files/igdrcl.config
+++ b/shared/test/common/test_files/igdrcl.config
@@ -564,6 +564,7 @@ TrackNumCsrClientsOnSyncPoints = -1
 EventTimestampRefreshIntervalInMilliSec = -1
 SynchronizeEventBeforeReset = -1
 RemoveRestrictionsOnNumberOfThreadsInGpgpuThreadGroup = 0
+MitigateHostVisibleSignal = 0
 DisableGemCreateExtSetPat = 0
 SkipDcFlushOnBarrierWithoutEvents = -1
 EnableAIL=1