From 8c3c703ec0a5b089af997aa26826575973d4bd14 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Fri, 25 Oct 2024 12:56:15 +0000 Subject: [PATCH] performance: Use lock pointer copy with sfence for dc flush mitigation Resolves: NEO-12898 Signed-off-by: Lukasz Jobczyk --- opencl/source/mem_obj/buffer.cpp | 9 ++++-- .../unit_test/libult/command_queue_ult.cpp | 2 +- .../test/unit_test/mem_obj/buffer_tests.cpp | 30 +++++++++---------- .../test/common/base_ult_config_listener.cpp | 2 +- shared/test/common/helpers/ult_hw_config.h | 5 +++- 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/opencl/source/mem_obj/buffer.cpp b/opencl/source/mem_obj/buffer.cpp index cb27ec90e9..a30ab58268 100644 --- a/opencl/source/mem_obj/buffer.cpp +++ b/opencl/source/mem_obj/buffer.cpp @@ -24,6 +24,7 @@ #include "shared/source/memory_manager/memory_operations_handler.h" #include "shared/source/memory_manager/migration_sync_data.h" #include "shared/source/os_interface/os_interface.h" +#include "shared/source/utilities/cpuintrinsics.h" #include "opencl/source/cl_device/cl_device.h" #include "opencl/source/command_queue/command_queue.h" @@ -212,8 +213,7 @@ bool inline copyHostPointer(Buffer *buffer, size <= Buffer::maxBufferSizeForCopyOnCpu && isCompressionEnabled == false && productHelper.getLocalMemoryAccessMode(hwInfo) != LocalMemoryAccessMode::cpuAccessDisallowed && - isLockable && - !isGpuCopyRequiredForDcFlushMitigation; + isLockable; if (debugManager.flags.CopyHostPtrOnCpu.get() != -1) { copyOnCpuAllowed = debugManager.flags.CopyHostPtrOnCpu.get() == 1; @@ -222,6 +222,11 @@ bool inline copyHostPointer(Buffer *buffer, memory->setAubWritable(true, GraphicsAllocation::defaultBank); memory->setTbxWritable(true, GraphicsAllocation::defaultBank); memcpy_s(ptrOffset(lockedPointer, buffer->getOffset()), size, hostPtr, size); + + if (isGpuCopyRequiredForDcFlushMitigation) { + CpuIntrinsics::sfence(); + } + return true; } else { auto blitMemoryToAllocationResult = BlitOperationResult::unsupported; diff --git a/opencl/test/unit_test/libult/command_queue_ult.cpp b/opencl/test/unit_test/libult/command_queue_ult.cpp index 9ca968afc1..1975a381bc 100644 --- a/opencl/test/unit_test/libult/command_queue_ult.cpp +++ b/opencl/test/unit_test/libult/command_queue_ult.cpp @@ -20,7 +20,7 @@ bool CommandQueue::isTimestampWaitEnabled() { } bool checkIsGpuCopyRequiredForDcFlushMitigation(AllocationType type) { - return false; + return ultHwConfig.useGpuCopyForDcFlushMitigation; } } // namespace NEO \ No newline at end of file diff --git a/opencl/test/unit_test/mem_obj/buffer_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_tests.cpp index 0429519118..a1186b7f2f 100644 --- a/opencl/test/unit_test/mem_obj/buffer_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_tests.cpp @@ -599,26 +599,22 @@ TEST(Buffer, givenClMemCopyHostPointerPassedToBufferCreateWhenAllocationIsNotInS } } -TEST(Buffer, givenDcFlushMitigationWhenCreateBufferCopyHostptrThenUseBlitterCopy) { - DebugManagerStateRestore restorer; - debugManager.flags.AllowDcFlush.set(0); - ExecutionEnvironment *executionEnvironment = MockClDevice::prepareExecutionEnvironment(defaultHwInfo.get(), 0u); - executionEnvironment->rootDeviceEnvironments[0]->getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = true; +namespace CpuIntrinsicsTests { +extern std::atomic sfenceCounter; +} // namespace CpuIntrinsicsTests +TEST(Buffer, givenDcFlushMitigationWhenCreateBufferCopyHostptrThenUseLockPointerCopyWithSfence) { + ExecutionEnvironment *executionEnvironment = MockClDevice::prepareExecutionEnvironment(defaultHwInfo.get(), 0u); auto productHelper = executionEnvironment->rootDeviceEnvironments[0]->productHelper.get(); - if (!(productHelper->isBlitterFullySupported(*defaultHwInfo) && productHelper->isDcFlushMitigated())) { + if (!productHelper->isDcFlushMitigated()) { GTEST_SKIP(); } - auto blitterCalled = 0u; - auto mockBlitMemoryToAllocation = [&](const NEO::Device &device, NEO::GraphicsAllocation *memory, size_t offset, const void *hostPtr, - Vec3 size) -> NEO::BlitOperationResult { - memcpy(memory->getUnderlyingBuffer(), hostPtr, size.x); - blitterCalled++; - return BlitOperationResult::success; - }; - VariableBackup blitMemoryToAllocationFuncBackup( - &NEO::BlitHelperFunctions::blitMemoryToAllocation, mockBlitMemoryToAllocation); + VariableBackup backup(&ultHwConfig); + ultHwConfig.useGpuCopyForDcFlushMitigation = true; + + DebugManagerStateRestore restorer; + debugManager.flags.AllowDcFlush.set(0); auto *memoryManager = new MockMemoryManagerFailFirstAllocation(*executionEnvironment); executionEnvironment->memoryManager.reset(memoryManager); @@ -626,6 +622,7 @@ TEST(Buffer, givenDcFlushMitigationWhenCreateBufferCopyHostptrThenUseBlitterCopy auto device = std::make_unique(MockDevice::create(executionEnvironment, 0)); MockContext ctx(device.get()); + CpuIntrinsicsTests::sfenceCounter.store(0u); cl_int retVal = 0; cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; @@ -634,7 +631,8 @@ TEST(Buffer, givenDcFlushMitigationWhenCreateBufferCopyHostptrThenUseBlitterCopy std::unique_ptr buffer(Buffer::create(&ctx, flags, sizeof(memory), memory, retVal)); ASSERT_NE(nullptr, buffer.get()); - EXPECT_EQ(blitterCalled, 1u); + EXPECT_EQ(1u, CpuIntrinsicsTests::sfenceCounter.load()); + CpuIntrinsicsTests::sfenceCounter.store(0u); } TEST(Buffer, givenPropertiesWithClDeviceHandleListKHRWhenCreateBufferThenCorrectBufferIsSet) { diff --git a/shared/test/common/base_ult_config_listener.cpp b/shared/test/common/base_ult_config_listener.cpp index 317e186f58..997cd689c8 100644 --- a/shared/test/common/base_ult_config_listener.cpp +++ b/shared/test/common/base_ult_config_listener.cpp @@ -50,7 +50,7 @@ void BaseUltConfigListener::OnTestEnd(const ::testing::TestInfo &) { // Ensure that global state is restored UltHwConfig expectedState{}; - static_assert(sizeof(UltHwConfig) == (16 * sizeof(bool) + sizeof(const char *)), ""); // Ensure that there is no internal padding + static_assert(sizeof(UltHwConfig) == (17 * sizeof(bool) + sizeof(const char *)) + sizeof(UltHwConfig::padding), ""); // Ensure that there is no internal padding EXPECT_EQ(0, memcmp(&expectedState, &ultHwConfig, sizeof(UltHwConfig))); EXPECT_EQ(0, memcmp(&referencedHwInfo.platform, &defaultHwInfo->platform, sizeof(PLATFORM))); diff --git a/shared/test/common/helpers/ult_hw_config.h b/shared/test/common/helpers/ult_hw_config.h index f26717c5c8..94bb551ac9 100644 --- a/shared/test/common/helpers/ult_hw_config.h +++ b/shared/test/common/helpers/ult_hw_config.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -16,6 +16,7 @@ struct UltHwConfig { bool useWaitForTimestamps = false; bool useBlitSplit = false; bool useFirstSubmissionInitDevice = false; + bool useGpuCopyForDcFlushMitigation = false; bool csrFailInitDirectSubmission = false; bool csrBaseCallDirectSubmissionAvailable = false; @@ -28,6 +29,8 @@ struct UltHwConfig { bool csrCreatePreemptionReturnValue = true; bool reserved = false; + char padding[7]; + const char *aubTestName = nullptr; };