diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp index 1b3b705cd0..be8a77e46c 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp @@ -76,6 +76,7 @@ struct BlitEnqueueTests : public ::testing::Test { REQUIRE_AUX_RESOLVES(); DebugManager.flags.EnableTimestampPacket.set(timestampPacketEnabled); + DebugManager.flags.DisableAtomicForPostSyncs.set(!timestampPacketEnabled); DebugManager.flags.EnableBlitterForEnqueueOperations.set(1); DebugManager.flags.ForceAuxTranslationMode.set(static_cast(AuxTranslationMode::Blit)); DebugManager.flags.RenderCompressedBuffersEnabled.set(1); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp index 03ad01dcb3..567d95f109 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp @@ -203,7 +203,7 @@ HWTEST_F(BcsTests, WhenGetNumberOfBlitsIsCalledThenCorrectValuesAreReturned) { } } -HWTEST_F(BcsTests, givenCsrDependenciesWhenProgrammingCommandStreamThenAddSemaphoreAndAtomic) { +HWTEST_F(BcsTests, givenCsrDependenciesWhenProgrammingCommandStreamThenAddSemaphore) { auto &csr = pDevice->getUltCommandStreamReceiver(); cl_int retVal = CL_SUCCESS; @@ -243,12 +243,9 @@ HWTEST_F(BcsTests, givenCsrDependenciesWhenProgrammingCommandStreamThenAddSemaph } dependenciesFound = true; EXPECT_FALSE(xyCopyBltCmdFound); - auto miAtomic = genCmdCast(*(++cmdIterator)); - EXPECT_NE(nullptr, miAtomic); for (uint32_t i = 1; i < numberOfDependencyContainers * numberNodesPerContainer; i++) { EXPECT_NE(nullptr, genCmdCast(*(++cmdIterator))); - EXPECT_NE(nullptr, genCmdCast(*(++cmdIterator))); } } } diff --git a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp 
b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp index 7c5d1bee4e..1528a2c5d1 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_1_tests.cpp @@ -1469,6 +1469,35 @@ HWTEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenEnqueueingThenKeepDep } } +HWTEST_F(TimestampPacketTests, givenDefaultDebugFlagValueAndAlreadyAssignedNodeWhenEnqueueingThenKeepDependencyOnPreviousNodeWithoutAtomicIfItsNotReady) { /* Test: with DisableAtomicForPostSyncs set to -1 and two timestamp packet nodes already obtained by the queue, enqueue one kernel and check the first two parsed commands with verifySemaphore against those two nodes. */ + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + DebugManager.flags.DisableAtomicForPostSyncs.set(-1); /* -1 is the value this flag is declared with in debug_variables_base.inl; set explicitly here so the test does not depend on the fixture's setting */ + device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; + + MockTimestampPacketContainer firstNode(*device->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(), 0); + + MockCommandQueueHw cmdQ(context, device.get(), nullptr); + TimestampPacketContainer previousNodes; + cmdQ.obtainNewTimestampPacketNodes(2, previousNodes, false, false); + firstNode.add(cmdQ.timestampPacketContainer->peekNodes().at(0)); + firstNode.add(cmdQ.timestampPacketContainer->peekNodes().at(1)); /* firstNode now holds the two nodes the queue obtained above, so they can still be inspected after the enqueue */ + auto firstTag0 = firstNode.getNode(0); + auto firstTag1 = firstNode.getNode(1); + + verifyDependencyCounterValues(&firstNode, 0); + cmdQ.enqueueKernel(kernel->mockKernel, 1, nullptr, gws, nullptr, 0, nullptr, nullptr); + verifyDependencyCounterValues(&firstNode, 0); /* the same expected value (0) is checked before and after the enqueue */ + + HardwareParse hwParser; + hwParser.parseCommands(*cmdQ.commandStream, 0); + + auto it = hwParser.cmdList.begin(); + verifySemaphore(genCmdCast(*it), firstTag0, 0); /* first parsed command is checked against the first node; NOTE(review): genCmdCast has no template argument here, it looks stripped during extraction and is presumably a pointer to MI_SEMAPHORE_WAIT, confirm against the real file */ + + verifySemaphore(genCmdCast(*++it), firstTag1, 0); /* the command immediately following is checked against the second node, i.e. nothing is expected between the two */ +} + HWTEST_F(TimestampPacketTests, givenAlreadyAssignedNodeWhenEnqueueingToOoqThenDontKeepDependencyOnPreviousNodeIfItsNotReady) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; device->getUltCommandStreamReceiver().timestampPacketWriteEnabled = true; diff --git a/opencl/test/unit_test/helpers/timestamp_packet_tests.h 
b/opencl/test/unit_test/helpers/timestamp_packet_tests.h index 037c9c6483..e723c32f3c 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_tests.h +++ b/opencl/test/unit_test/helpers/timestamp_packet_tests.h @@ -47,6 +47,7 @@ struct TimestampPacketTests : public TimestampPacketSimpleTests { void SetUp() override { DebugManager.flags.EnableTimestampPacket.set(1); + DebugManager.flags.DisableAtomicForPostSyncs.set(0); executionEnvironment = platform()->peekExecutionEnvironment(); executionEnvironment->prepareRootDeviceEnvironments(2); diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 4a5400308c..3d33fc6cbe 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -532,7 +532,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenWriteBufferEnqueueWhenProgrammingCommand } } EXPECT_EQ(1u, semaphoresCount); - EXPECT_EQ(1u, miAtomicsCount); + EXPECT_EQ(0u, miAtomicsCount); EXPECT_EQ(initialTaskCount + 1, queueCsr->peekTaskCount()); } @@ -577,7 +577,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWhenProgrammingCommandS } } EXPECT_EQ(1u, semaphoresCount); - EXPECT_EQ(1u, miAtomicsCount); + EXPECT_EQ(0u, miAtomicsCount); EXPECT_EQ(initialTaskCount + 1, queueCsr->peekTaskCount()); } diff --git a/opencl/test/unit_test/test_files/igdrcl.config b/opencl/test/unit_test/test_files/igdrcl.config index 4111cbb2b8..6f08cefb6a 100644 --- a/opencl/test/unit_test/test_files/igdrcl.config +++ b/opencl/test/unit_test/test_files/igdrcl.config @@ -172,7 +172,7 @@ AllocateSharedAllocationsWithCpuAndGpuStorage = -1 UseMaxSimdSizeToDeduceMaxWorkgroupSize = 0 ReturnRawGpuTimestamps = 0 ForcePerDssBackedBufferProgramming = 0 -DisableAtomicForPostSyncs = 0 +DisableAtomicForPostSyncs = -1 MaxHwThreadsPercent = 0 MinHwThreadsUnoccupied = 0 LimitBlitterMaxWidth = -1 diff --git a/shared/source/debug_settings/debug_variables_base.inl 
b/shared/source/debug_settings/debug_variables_base.inl index 14ff09a6a1..c6465b9b09 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -261,10 +261,10 @@ DECLARE_DEBUG_VARIABLE(int32_t, EnableStaticPartitioning, -1, "Divide workload i DECLARE_DEBUG_VARIABLE(int32_t, UpdateTaskCountFromWait, -1, " Do not update task count after each enqueue, but send update request while wait, -1: default(disabled), 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, DeferOsContextInitialization, -1, "-1: default, 0: create all contexts immediately, 1: defer, if possible") DECLARE_DEBUG_VARIABLE(int32_t, ForceHostPointerImport, -1, "-1: default, 0: disable, 1: enable, Forces the driver to import every host pointer coming into driver, WARNING this is not spec complaint.") +DECLARE_DEBUG_VARIABLE(int32_t, DisableAtomicForPostSyncs, -1, "-1: default (no atomics), 0: track post syncs with atomics, 1: no atomics and timestamp packets always report not completed") /* tri-state: the consumers in this change compare against exactly 0 and exactly 1, so -1 is a distinct third state and not an alias of either */ DECLARE_DEBUG_VARIABLE(bool, UseMaxSimdSizeToDeduceMaxWorkgroupSize, false, "With this flag on, max workgroup size is deduced using SIMD32 instead of SIMD8, this causes the max wkg size to be 4 times bigger") DECLARE_DEBUG_VARIABLE(bool, ReturnRawGpuTimestamps, false, "Driver returns raw GPU tiemstamps instead of calculated ones.") DECLARE_DEBUG_VARIABLE(bool, ForcePerDssBackedBufferProgramming, false, "Always program per-DSS memory backed buffer in preamble") -DECLARE_DEBUG_VARIABLE(bool, DisableAtomicForPostSyncs, false, "When enabled, post syncs are not tracked with atomics") DECLARE_DEBUG_VARIABLE(bool, UseCommandBufferHeaderSizeForWddmQueueSubmission, true, "0: Page size (4096), 1: sizeof(COMMAND_BUFFER_HEADER)") DECLARE_DEBUG_VARIABLE(bool, DisableDeepBind, false, "Disable passing RTLD_DEEPBIND flag to all dlopen calls.") DECLARE_DEBUG_VARIABLE(bool, UseUmKmDataTranslator, false, "Use helper library for UMD<->KMD (WDDM) struct layout compatibility") diff --git a/shared/source/helpers/timestamp_packet.h 
b/shared/source/helpers/timestamp_packet.h index a5672631c9..ad5ebc3725 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -50,7 +50,7 @@ class TimestampPackets : public TagTypeBase { static constexpr size_t getSinglePacketSize() { return sizeof(Packet); } bool isCompleted() const { - if (DebugManager.flags.DisableAtomicForPostSyncs.get()) { + if (DebugManager.flags.DisableAtomicForPostSyncs.get() == 1) { /* only the explicit value 1 takes this early "not completed" return; -1 and 0 skip it */ return false; } @@ -164,12 +164,7 @@ struct TimestampPacketHelper { EncodeSempahore::addMiSemaphoreWaitCommand(cmdStream, compareAddress + compareOffset, 1, COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); } - bool trackPostSyncDependencies = true; - if (DebugManager.flags.DisableAtomicForPostSyncs.get()) { - trackPostSyncDependencies = false; - } - - if (trackPostSyncDependencies) { + if (DebugManager.flags.DisableAtomicForPostSyncs.get() == 0) { /* the block below runs only when the flag is exactly 0; every other value, including the -1 default, skips it */ overrideSupportedDevicesCount(numSupportedDevices); for (uint32_t i = 0; i < numSupportedDevices; i++) { diff --git a/shared/source/utilities/tag_allocator.inl b/shared/source/utilities/tag_allocator.inl index 2a51281710..42a3f775fe 100644 --- a/shared/source/utilities/tag_allocator.inl +++ b/shared/source/utilities/tag_allocator.inl @@ -253,10 +253,11 @@ uint32_t TagNode::getPacketsUsed() const { template uint32_t TagNode::getImplicitGpuDependenciesCount() const { if constexpr (TagType::getTagNodeType() == TagNodeType::TimestampPacket) { - return tagForCpuAccess->getImplicitGpuDependenciesCount(); - } else { - return 0; + if (DebugManager.flags.DisableAtomicForPostSyncs.get() == 0) { /* the tag's own count is forwarded only for TimestampPacket tags and only when the flag is exactly 0 */ + return tagForCpuAccess->getImplicitGpuDependenciesCount(); + } } + return 0; /* shared fallback: reached for every other tag type and whenever the flag is not 0 */ } template