Fix mutex order for event task and move args to gpu

This commit fixes a problem with shared USM memory not being transferred to the GPU when a submit to the GPU is triggered by a user event. It also fixes a deadlock caused by mixed mutex locking orders in the CSR and in the direct submission controller. Related-To: NEO-6762 Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
2022-05-19 10:06:08 +00:00 · 2022-05-19 10:06:08 +00:00 · 6ab6e1abff
parent d308df254c
commit 6ab6e1abff
18 changed files with 427 additions and 139 deletions
--- a/opencl/source/command_queue/command_queue.cpp
+++ b/opencl/source/command_queue/command_queue.cpp
@ -88,7 +88,7 @@ CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_pr
        }

        if (!deferCmdQBcsInitialization) {
-            this->initializeBcsEngine(internalUsage);
+            this->constructBcsEngine(internalUsage);
        }
    }

@ -274,7 +274,7 @@ CommandStreamReceiver &CommandQueue::selectCsrForBuiltinOperation(const CsrSelec
    return *selectedCsr;
 }

-void CommandQueue::initializeBcsEngine(bool internalUsage) {
+void CommandQueue::constructBcsEngine(bool internalUsage) {
    if (bcsAllowed && !bcsInitialized) {
        auto &hwInfo = device->getHardwareInfo();
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
@ -293,6 +293,10 @@ void CommandQueue::initializeBcsEngine(bool internalUsage) {
    }
 }

+void CommandQueue::initializeBcsEngine(bool internalUsage) {
+    constructBcsEngine(internalUsage);
+}
+
 Device &CommandQueue::getDevice() const noexcept {
    return device->getDevice();
 }
--- a/opencl/source/command_queue/command_queue.h
+++ b/opencl/source/command_queue/command_queue.h
@ -228,7 +228,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    MOCKABLE_VIRTUAL CommandStreamReceiver *getBcsCommandStreamReceiver(aub_stream::EngineType bcsEngineType);
    CommandStreamReceiver *getBcsForAuxTranslation();
    MOCKABLE_VIRTUAL CommandStreamReceiver &selectCsrForBuiltinOperation(const CsrSelectionArgs &args);
-    void initializeBcsEngine(bool internalUsage);
+    void constructBcsEngine(bool internalUsage);
+    MOCKABLE_VIRTUAL void initializeBcsEngine(bool internalUsage);
    Device &getDevice() const noexcept;
    ClDevice &getClDevice() const { return *device; }
    Context &getContext() const { return *context; }
--- a/opencl/source/event/event.cpp
+++ b/opencl/source/event/event.cpp
@ -570,6 +570,7 @@ void Event::transitionExecutionStatus(int32_t newExecutionStatus) const {
 void Event::submitCommand(bool abortTasks) {
    std::unique_ptr<Command> cmdToProcess(cmdToSubmit.exchange(nullptr));
    if (cmdToProcess.get() != nullptr) {
+        getCommandQueue()->initializeBcsEngine(getCommandQueue()->isSpecial());
        auto lockCSR = getCommandQueue()->getGpgpuCommandStreamReceiver().obtainUniqueOwnership();

        if (this->isProfilingEnabled()) {
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@ -1280,8 +1280,14 @@ void Kernel::getResidency(std::vector<Surface *> &dst) {
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
+                bool needsMigration = false;
+                auto pageFaultManager = executionEnvironment.memoryManager->getPageFaultManager();
+                if (pageFaultManager &&
+                    this->isUnifiedMemorySyncRequired) {
+                    needsMigration = true;
+                }
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
-                dst.push_back(new GeneralSurface(pSVMAlloc));
+                dst.push_back(new GeneralSurface(pSVMAlloc, needsMigration));
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObject<MemObj>(clMem);
--- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp
+++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_3_tests.cpp
@ -1830,6 +1830,50 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenItIsUnblocke
    EXPECT_EQ(numGrfRequired, csr->savedDispatchFlags.numGrfRequired);
 }

+class MockCommandQueueInitializeBcs : public MockCommandQueue {
+  public:
+    MockCommandQueueInitializeBcs() : MockCommandQueue(nullptr, nullptr, 0, false) {}
+    MockCommandQueueInitializeBcs(Context &context) : MockCommandQueueInitializeBcs(&context, context.getDevice(0), nullptr, false) {}
+    MockCommandQueueInitializeBcs(Context *context, ClDevice *device, const cl_queue_properties *props, bool internalUsage)
+        : MockCommandQueue(context, device, props, internalUsage) {
+    }
+    void initializeBcsEngine(bool internalUsage) override {
+        if (initializeBcsEngineCalledTimes == 0) {
+            auto th = std::thread([&]() {
+                isCsrLocked = reinterpret_cast<MockCommandStreamReceiver *>(&this->getGpgpuCommandStreamReceiver())->isOwnershipMutexLocked();
+            });
+            th.join();
+        }
+        initializeBcsEngineCalledTimes++;
+        MockCommandQueue::initializeBcsEngine(internalUsage);
+    }
+    int initializeBcsEngineCalledTimes = 0;
+    bool isCsrLocked = false;
+};
+
+HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenBlockedKernelWhenInitializeBcsCalledThenCrsIsNotLocked) {
+    MockContext mockContext;
+    auto csr = new MockCommandStreamReceiver(*pDevice->executionEnvironment, 0, pDevice->getDeviceBitfield());
+    pDevice->resetCommandStreamReceiver(csr);
+    uint32_t numGrfRequired = 666u;
+
+    auto pCmdQ = std::make_unique<MockCommandQueueInitializeBcs>(&mockContext, pClDevice, nullptr, false);
+    auto mockProgram = std::make_unique<MockProgram>(&mockContext, false, toClDeviceVector(*pClDevice));
+
+    auto pKernel = MockKernel::create(*pDevice, mockProgram.get(), numGrfRequired);
+    auto kernelInfos = MockKernel::toKernelInfoContainer(pKernel->getKernelInfo(), rootDeviceIndex);
+    MultiDeviceKernel multiDeviceKernel(MockMultiDeviceKernel::toKernelVector(pKernel), kernelInfos);
+    auto event = std::make_unique<MockEvent<Event>>(pCmdQ.get(), CL_COMMAND_MARKER, 0, 0);
+    auto cmdStream = new LinearStream(pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties({pDevice->getRootDeviceIndex(), 4096, AllocationType::COMMAND_BUFFER, pDevice->getDeviceBitfield()}));
+
+    auto blockedCommandsData = std::make_unique<KernelOperation>(cmdStream, *pCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage());
+
+    std::vector<Surface *> surfaces;
+    event->setCommand(std::make_unique<CommandComputeKernel>(*pCmdQ, blockedCommandsData, surfaces, false, false, false, nullptr, pDevice->getPreemptionMode(), pKernel, 1));
+    event->submitCommand(false);
+    EXPECT_FALSE(pCmdQ->isCsrLocked);
+}
+
 HWTEST_F(CommandStreamReceiverFlushTaskTests, givenDcFlushArgumentIsTrueWhenCallingAddPipeControlThenDcFlushIsEnabled) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    std::unique_ptr<uint8_t> buffer(new uint8_t[128]);
--- a/opencl/test/unit_test/event/event_tests.cpp
+++ b/opencl/test/unit_test/event/event_tests.cpp
@ -145,7 +145,7 @@ TEST(Event, givenBcsCsrSetInEventWhenPeekingBcsTaskCountThenReturnCorrectTaskCou
        new MockClDevice{MockDevice::createWithNewExecutionEnvironment<MockAlignedMallocManagerDevice>(&hwInfo)}};
    MockContext context{device.get()};
    MockCommandQueue queue{context};
-    queue.initializeBcsEngine(false);
+    queue.constructBcsEngine(false);
    queue.updateBcsTaskCount(queue.bcsEngines[0]->getEngineType(), 19);
    Event event{&queue, CL_COMMAND_READ_BUFFER, 0, 0};

--- a/opencl/test/unit_test/kernel/kernel_tests.cpp
+++ b/opencl/test/unit_test/kernel/kernel_tests.cpp
@ -1328,6 +1328,57 @@ HWTEST_F(KernelResidencyTest, givenSharedUnifiedMemoryAndNotRequiredMemSyncWhenM
    svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
 }

+class MockGeneralSurface : public GeneralSurface {
+  public:
+    using GeneralSurface::needsMigration;
+};
+
+HWTEST_F(KernelResidencyTest, givenSvmArgWhenKernelDoesNotRequireUnifiedMemorySyncThenSurfaceDoesNotNeedMigration) {
+    auto mockPageFaultManager = new MockPageFaultManager();
+    static_cast<MockMemoryManager *>(this->pDevice->getExecutionEnvironment()->memoryManager.get())->pageFaultManager.reset(mockPageFaultManager);
+    MockKernelWithInternals mockKernel(*this->pClDevice, nullptr, true);
+
+    auto svmAllocationsManager = mockKernel.mockContext->getSVMAllocsManager();
+    auto sharedProperties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::SHARED_UNIFIED_MEMORY, mockKernel.mockContext->getRootDeviceIndices(), mockKernel.mockContext->getDeviceBitfields());
+    auto unifiedMemoryAllocation = svmAllocationsManager->createSharedUnifiedMemoryAllocation(4096u, sharedProperties, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()));
+    auto unifiedMemoryGraphicsAllocation = svmAllocationsManager->getSVMAlloc(unifiedMemoryAllocation);
+    mockPageFaultManager->insertAllocation(unifiedMemoryAllocation, 4096u, svmAllocationsManager, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()), {});
+
+    auto gpuAllocation = unifiedMemoryGraphicsAllocation->gpuAllocations.getGraphicsAllocation(pDevice->getRootDeviceIndex());
+    mockKernel.mockKernel->kernelArguments[0] = {Kernel::kernelArgType::SVM_ALLOC_OBJ, gpuAllocation, unifiedMemoryAllocation, 4096u, gpuAllocation, sizeof(uintptr_t)};
+    mockKernel.mockKernel->setUnifiedMemorySyncRequirement(false);
+    std::vector<NEO::Surface *> residencySurfaces;
+    mockKernel.mockKernel->getResidency(residencySurfaces);
+    EXPECT_FALSE(reinterpret_cast<MockGeneralSurface *>(residencySurfaces[0])->needsMigration);
+    for (auto surface : residencySurfaces) {
+        delete surface;
+    }
+    svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
+}
+
+HWTEST_F(KernelResidencyTest, givenSvmArgWhenKernelRequireUnifiedMemorySyncThenSurfaceNeedMigration) {
+    auto mockPageFaultManager = new MockPageFaultManager();
+    static_cast<MockMemoryManager *>(this->pDevice->getExecutionEnvironment()->memoryManager.get())->pageFaultManager.reset(mockPageFaultManager);
+    MockKernelWithInternals mockKernel(*this->pClDevice, nullptr, true);
+
+    auto svmAllocationsManager = mockKernel.mockContext->getSVMAllocsManager();
+    auto sharedProperties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::SHARED_UNIFIED_MEMORY, mockKernel.mockContext->getRootDeviceIndices(), mockKernel.mockContext->getDeviceBitfields());
+    auto unifiedMemoryAllocation = svmAllocationsManager->createSharedUnifiedMemoryAllocation(4096u, sharedProperties, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()));
+    auto unifiedMemoryGraphicsAllocation = svmAllocationsManager->getSVMAlloc(unifiedMemoryAllocation);
+    mockPageFaultManager->insertAllocation(unifiedMemoryAllocation, 4096u, svmAllocationsManager, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()), {});
+
+    auto gpuAllocation = unifiedMemoryGraphicsAllocation->gpuAllocations.getGraphicsAllocation(pDevice->getRootDeviceIndex());
+    mockKernel.mockKernel->kernelArguments[0] = {Kernel::kernelArgType::SVM_ALLOC_OBJ, gpuAllocation, unifiedMemoryAllocation, 4096u, gpuAllocation, sizeof(uintptr_t)};
+    mockKernel.mockKernel->setUnifiedMemorySyncRequirement(true);
+    std::vector<NEO::Surface *> residencySurfaces;
+    mockKernel.mockKernel->getResidency(residencySurfaces);
+    EXPECT_TRUE(reinterpret_cast<MockGeneralSurface *>(residencySurfaces[0])->needsMigration);
+    for (auto surface : residencySurfaces) {
+        delete surface;
+    }
+    svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
+}
+
 HWTEST_F(KernelResidencyTest, givenSharedUnifiedMemoryRequiredMemSyncWhenMakeResidentIsCalledThenAllocationIsDecommited) {
    auto mockPageFaultManager = new MockPageFaultManager();
    static_cast<MockMemoryManager *>(this->pDevice->getExecutionEnvironment()->memoryManager.get())->pageFaultManager.reset(mockPageFaultManager);
@ -1836,6 +1887,60 @@ HWTEST_F(KernelResidencyTest, givenSimpleKernelTunningAndNoAtomicsWhenPerformTun
    EXPECT_NE(mockKernel.mockKernel->isSingleSubdevicePreferred(), mockKernel.mockKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics);
 }

+HWTEST_F(KernelResidencyTest, givenSimpleKernelWhenExecEnvDoesNotHavePageFaultManagerThenPageFaultDoesNotMoveAllocation) {
+    auto mockPageFaultManager = std::make_unique<MockPageFaultManager>();
+    MockKernelWithInternals mockKernel(*this->pClDevice);
+
+    auto svmAllocationsManager = mockKernel.mockContext->getSVMAllocsManager();
+    auto sharedProperties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::SHARED_UNIFIED_MEMORY, mockKernel.mockContext->getRootDeviceIndices(), mockKernel.mockContext->getDeviceBitfields());
+    auto unifiedMemoryAllocation = svmAllocationsManager->createSharedUnifiedMemoryAllocation(4096u, sharedProperties, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()));
+    auto unifiedMemoryGraphicsAllocation = svmAllocationsManager->getSVMAlloc(unifiedMemoryAllocation);
+    mockPageFaultManager->insertAllocation(reinterpret_cast<void *>(unifiedMemoryGraphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()), 4096u, svmAllocationsManager, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()), {});
+
+    Kernel::SimpleKernelArgInfo kernelArgInfo;
+    kernelArgInfo.object = unifiedMemoryGraphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation();
+    kernelArgInfo.type = Kernel::kernelArgType::SVM_ALLOC_OBJ;
+
+    std::vector<Kernel::SimpleKernelArgInfo> kernelArguments;
+    kernelArguments.resize(1);
+    kernelArguments[0] = kernelArgInfo;
+    mockKernel.kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.resize(1);
+    mockKernel.kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[0].as<ArgDescPointer>(true).accessedUsingStatelessAddressingMode = true;
+    mockKernel.mockKernel->setKernelArguments(kernelArguments);
+    mockKernel.mockKernel->kernelArgRequiresCacheFlush.resize(1);
+    EXPECT_EQ(mockPageFaultManager->transferToGpuCalled, 0);
+    svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
+    static_cast<MockMemoryManager *>(this->pDevice->getExecutionEnvironment()->memoryManager.get())->pageFaultManager.reset();
+}
+
+HWTEST_F(KernelResidencyTest, givenSimpleKernelWhenIsUnifiedMemorySyncRequiredIsFalseThenPageFaultDoesNotMoveAllocation) {
+    auto mockPageFaultManager = new MockPageFaultManager();
+    static_cast<MockMemoryManager *>(this->pDevice->getExecutionEnvironment()->memoryManager.get())->pageFaultManager.reset(mockPageFaultManager);
+    MockKernelWithInternals mockKernel(*this->pClDevice);
+
+    auto svmAllocationsManager = mockKernel.mockContext->getSVMAllocsManager();
+    auto sharedProperties = SVMAllocsManager::UnifiedMemoryProperties(InternalMemoryType::SHARED_UNIFIED_MEMORY, mockKernel.mockContext->getRootDeviceIndices(), mockKernel.mockContext->getDeviceBitfields());
+    auto unifiedMemoryAllocation = svmAllocationsManager->createSharedUnifiedMemoryAllocation(4096u, sharedProperties, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()));
+    auto unifiedMemoryGraphicsAllocation = svmAllocationsManager->getSVMAlloc(unifiedMemoryAllocation);
+    mockPageFaultManager->insertAllocation(reinterpret_cast<void *>(unifiedMemoryGraphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation()->getGpuAddress()), 4096u, svmAllocationsManager, mockKernel.mockContext->getSpecialQueue(pDevice->getRootDeviceIndex()), {});
+
+    Kernel::SimpleKernelArgInfo kernelArgInfo;
+    kernelArgInfo.object = unifiedMemoryGraphicsAllocation->gpuAllocations.getDefaultGraphicsAllocation();
+    kernelArgInfo.type = Kernel::kernelArgType::SVM_ALLOC_OBJ;
+
+    std::vector<Kernel::SimpleKernelArgInfo> kernelArguments;
+    kernelArguments.resize(1);
+    kernelArguments[0] = kernelArgInfo;
+    mockKernel.kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.resize(1);
+    mockKernel.kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[0].as<ArgDescPointer>(true).accessedUsingStatelessAddressingMode = true;
+    mockKernel.mockKernel->setKernelArguments(kernelArguments);
+    mockKernel.mockKernel->kernelArgRequiresCacheFlush.resize(1);
+    mockKernel.mockKernel->isUnifiedMemorySyncRequired = false;
+    EXPECT_EQ(mockPageFaultManager->transferToGpuCalled, 0);
+    svmAllocationsManager->freeSVMAlloc(unifiedMemoryAllocation);
+    static_cast<MockMemoryManager *>(this->pDevice->getExecutionEnvironment()->memoryManager.get())->pageFaultManager.reset();
+}
+
 TEST(KernelImageDetectionTests, givenKernelWithImagesOnlyWhenItIsAskedIfItHasImagesOnlyThenTrueIsReturned) {
    auto pKernelInfo = std::make_unique<MockKernelInfo>();
    pKernelInfo->kernelDescriptor.kernelAttributes.simdSize = 1;
--- a/opencl/test/unit_test/memory_manager/CMakeLists.txt
+++ b/opencl/test/unit_test/memory_manager/CMakeLists.txt
@ -10,8 +10,8 @@ set(IGDRCL_SRCS_tests_memory_manager
    ${CMAKE_CURRENT_SOURCE_DIR}/memory_manager_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/memory_manager_allocate_in_preferred_pool_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/migraton_controller_tests.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/surface_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/unified_memory_manager_tests.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/mem_obj_surface_tests.cpp
    ${CMAKE_CURRENT_SOURCE_DIR}/unified_memory_token_tests.cpp
 )

--- a/opencl/test/unit_test/memory_manager/mem_obj_surface_tests.cpp
+++ b/opencl/test/unit_test/memory_manager/mem_obj_surface_tests.cpp
@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2018-2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/command_stream/preemption.h"
+#include "shared/source/helpers/hw_helper.h"
+#include "shared/source/memory_manager/graphics_allocation.h"
+#include "shared/test/common/helpers/engine_descriptor_helper.h"
+#include "shared/test/common/mocks/mock_csr.h"
+#include "shared/test/common/test_macros/test.h"
+
+#include "opencl/source/memory_manager/mem_obj_surface.h"
+#include "opencl/source/platform/platform.h"
+#include "opencl/test/unit_test/mocks/mock_buffer.h"
+#include "opencl/test/unit_test/mocks/mock_platform.h"
+
+#include "gtest/gtest.h"
+
+#include <type_traits>
+
+using namespace NEO;
+
+namespace createSurface {
+Surface *create(char *data, MockBuffer *buffer, GraphicsAllocation *gfxAllocation) {
+    return new MemObjSurface(buffer);
+}
+} // namespace createSurface
+
+class SurfaceTest : public ::testing::Test {
+  public:
+    char data[10];
+    MockBuffer buffer;
+    MockGraphicsAllocation gfxAllocation{nullptr, 0};
+};
+
+HWTEST_F(SurfaceTest, GivenSurfaceWhenInterfaceIsUsedThenSurfaceBehavesCorrectly) {
+    int32_t execStamp;
+
+    ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment();
+    executionEnvironment->initializeMemoryManager();
+    DeviceBitfield deviceBitfield(1);
+    auto csr = std::make_unique<MockCsr<FamilyType>>(execStamp, *executionEnvironment, 0, deviceBitfield);
+    auto hwInfo = *defaultHwInfo;
+    auto engine = HwHelper::get(hwInfo.platform.eRenderCoreFamily).getGpgpuEngineInstances(hwInfo)[0];
+    auto osContext = executionEnvironment->memoryManager->createAndRegisterOsContext(csr.get(), EngineDescriptorHelper::getDefaultDescriptor(engine, PreemptionHelper::getDefaultPreemptionMode(hwInfo)));
+    csr->setupContext(*osContext);
+
+    Surface *surface = createSurface::create(this->data,
+                                             &this->buffer,
+                                             &this->gfxAllocation);
+    ASSERT_NE(nullptr, surface); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks)
+
+    Surface *duplicatedSurface = surface->duplicate();
+    ASSERT_NE(nullptr, duplicatedSurface); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks)
+
+    surface->makeResident(*csr);
+
+    EXPECT_EQ(1u, csr->madeResidentGfxAllocations.size());
+
+    delete duplicatedSurface;
+    delete surface;
+}
+
+class CoherentMemObjSurface : public SurfaceTest {
+  public:
+    CoherentMemObjSurface() {
+        this->buffer.getGraphicsAllocation(mockRootDeviceIndex)->setCoherent(true);
+    }
+};
+
+TEST_F(CoherentMemObjSurface, GivenCoherentMemObjWhenCreatingSurfaceFromMemObjThenSurfaceIsCoherent) {
+    Surface *surface = createSurface::create(this->data,
+                                             &this->buffer,
+                                             &this->gfxAllocation);
+
+    EXPECT_TRUE(surface->IsCoherent);
+
+    delete surface;
+}
--- a/opencl/test/unit_test/memory_manager/surface_tests.cpp
+++ b/opencl/test/unit_test/memory_manager/surface_tests.cpp
@ -1,131 +0,0 @@
-/*
- * Copyright (C) 2018-2022 Intel Corporation
- *
- * SPDX-License-Identifier: MIT
- *
- */
-
-#include "shared/source/command_stream/preemption.h"
-#include "shared/source/helpers/hw_helper.h"
-#include "shared/source/memory_manager/graphics_allocation.h"
-#include "shared/test/common/helpers/engine_descriptor_helper.h"
-#include "shared/test/common/mocks/mock_csr.h"
-#include "shared/test/common/test_macros/test.h"
-
-#include "opencl/source/memory_manager/mem_obj_surface.h"
-#include "opencl/source/platform/platform.h"
-#include "opencl/test/unit_test/mocks/mock_buffer.h"
-#include "opencl/test/unit_test/mocks/mock_platform.h"
-
-#include "gtest/gtest.h"
-
-#include <type_traits>
-
-using namespace NEO;
-
-typedef ::testing::Types<NullSurface, HostPtrSurface, MemObjSurface, GeneralSurface> SurfaceTypes;
-
-namespace createSurface {
-template <typename surfType>
-Surface *create(char *data, MockBuffer *buffer, GraphicsAllocation *gfxAllocation);
-
-template <>
-Surface *create<NullSurface>(char *data, MockBuffer *buffer, GraphicsAllocation *gfxAllocation) {
-    return new NullSurface;
-}
-
-template <>
-Surface *create<HostPtrSurface>(char *data, MockBuffer *buffer, GraphicsAllocation *gfxAllocation) {
-    return new HostPtrSurface(data, 10, gfxAllocation);
-}
-
-template <>
-Surface *create<MemObjSurface>(char *data, MockBuffer *buffer, GraphicsAllocation *gfxAllocation) {
-    return new MemObjSurface(buffer);
-}
-
-template <>
-Surface *create<GeneralSurface>(char *data, MockBuffer *buffer, GraphicsAllocation *gfxAllocation) {
-    return new GeneralSurface(gfxAllocation);
-}
-} // namespace createSurface
-
-template <typename T>
-class SurfaceTest : public ::testing::Test {
-  public:
-    char data[10];
-    MockBuffer buffer;
-    MockGraphicsAllocation gfxAllocation{nullptr, 0};
-};
-
-TYPED_TEST_CASE(SurfaceTest, SurfaceTypes);
-
-HWTEST_TYPED_TEST(SurfaceTest, GivenSurfaceWhenInterfaceIsUsedThenSurfaceBehavesCorrectly) {
-    int32_t execStamp;
-
-    ExecutionEnvironment *executionEnvironment = platform()->peekExecutionEnvironment();
-    executionEnvironment->initializeMemoryManager();
-    DeviceBitfield deviceBitfield(1);
-    auto csr = std::make_unique<MockCsr<FamilyType>>(execStamp, *executionEnvironment, 0, deviceBitfield);
-    auto hwInfo = *defaultHwInfo;
-    auto engine = HwHelper::get(hwInfo.platform.eRenderCoreFamily).getGpgpuEngineInstances(hwInfo)[0];
-    auto osContext = executionEnvironment->memoryManager->createAndRegisterOsContext(csr.get(), EngineDescriptorHelper::getDefaultDescriptor(engine, PreemptionHelper::getDefaultPreemptionMode(hwInfo)));
-    csr->setupContext(*osContext);
-
-    Surface *surface = createSurface::create<TypeParam>(this->data,
-                                                        &this->buffer,
-                                                        &this->gfxAllocation);
-    ASSERT_NE(nullptr, surface); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks)
-
-    Surface *duplicatedSurface = surface->duplicate();
-    ASSERT_NE(nullptr, duplicatedSurface); // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks)
-
-    surface->makeResident(*csr);
-
-    if (std::is_same<TypeParam, HostPtrSurface>::value ||
-        std::is_same<TypeParam, MemObjSurface>::value ||
-        std::is_same<TypeParam, GeneralSurface>::value) {
-        EXPECT_EQ(1u, csr->madeResidentGfxAllocations.size());
-    }
-
-    delete duplicatedSurface;
-    delete surface;
-}
-
-class CoherentMemObjSurface : public SurfaceTest<MemObjSurface> {
-  public:
-    CoherentMemObjSurface() {
-        this->buffer.getGraphicsAllocation(mockRootDeviceIndex)->setCoherent(true);
-    }
-};
-
-TEST_F(CoherentMemObjSurface, GivenCoherentMemObjWhenCreatingSurfaceFromMemObjThenSurfaceIsCoherent) {
-    Surface *surface = createSurface::create<MemObjSurface>(this->data,
-                                                            &this->buffer,
-                                                            &this->gfxAllocation);
-
-    EXPECT_TRUE(surface->IsCoherent);
-
-    delete surface;
-}
-
-TEST(HostPtrSurfaceTest, givenHostPtrSurfaceWhenCreatedWithoutSpecifyingPtrCopyAllowanceThenPtrCopyIsNotAllowed) {
-    char memory[2] = {};
-    HostPtrSurface surface(memory, sizeof(memory));
-
-    EXPECT_FALSE(surface.peekIsPtrCopyAllowed());
-}
-
-TEST(HostPtrSurfaceTest, givenHostPtrSurfaceWhenCreatedWithPtrCopyAllowedThenQueryReturnsTrue) {
-    char memory[2] = {};
-    HostPtrSurface surface(memory, sizeof(memory), true);
-
-    EXPECT_TRUE(surface.peekIsPtrCopyAllowed());
-}
-
-TEST(HostPtrSurfaceTest, givenHostPtrSurfaceWhenCreatedWithPtrCopyNotAllowedThenQueryReturnsFalse) {
-    char memory[2] = {};
-    HostPtrSurface surface(memory, sizeof(memory), false);
-
-    EXPECT_FALSE(surface.peekIsPtrCopyAllowed());
-}
--- a/opencl/test/unit_test/mocks/mock_command_queue.h
+++ b/opencl/test/unit_test/mocks/mock_command_queue.h
@ -258,7 +258,7 @@ class MockCommandQueueHw : public CommandQueueHw<GfxFamily> {
    MockCommandQueueHw(Context *context,
                       ClDevice *device,
                       cl_queue_properties *properties) : BaseClass(context, device, properties, false) {
-        this->initializeBcsEngine(false);
+        this->constructBcsEngine(false);
    }

    void clearBcsEngines() {
--- a/opencl/test/unit_test/mocks/mock_kernel.h
+++ b/opencl/test/unit_test/mocks/mock_kernel.h
@ -106,6 +106,7 @@ class MockKernel : public Kernel {
    using Kernel::hasDirectStatelessAccessToHostMemory;
    using Kernel::hasDirectStatelessAccessToSharedBuffer;
    using Kernel::hasIndirectStatelessAccessToHostMemory;
+    using Kernel::isUnifiedMemorySyncRequired;
    using Kernel::kernelArgHandlers;
    using Kernel::kernelArgRequiresCacheFlush;
    using Kernel::kernelArguments;
@ -250,6 +251,7 @@ class MockKernel : public Kernel {
    uint32_t makeResidentCalls = 0;
    uint32_t getResidencyCalls = 0;
    uint32_t setArgSvmAllocCalls = 0;
+    uint32_t moveArgsToGpuDomainCalls = 0;

    bool canKernelTransformImages = true;
    bool isPatchedOverride = true;
--- a/shared/source/memory_manager/surface.h
+++ b/shared/source/memory_manager/surface.h
@ -9,6 +9,7 @@
 #include "shared/source/command_stream/command_stream_receiver.h"
 #include "shared/source/helpers/cache_policy.h"
 #include "shared/source/memory_manager/graphics_allocation.h"
+#include "shared/source/memory_manager/memory_manager.h"
 #include "shared/source/os_interface/os_context.h"

 namespace NEO {
@ -99,9 +100,15 @@ class GeneralSurface : public Surface {
    GeneralSurface(GraphicsAllocation *gfxAlloc) : Surface(gfxAlloc->isCoherent()) {
        gfxAllocation = gfxAlloc;
    };
+    GeneralSurface(GraphicsAllocation *gfxAlloc, bool needsMigration) : GeneralSurface(gfxAlloc) {
+        this->needsMigration = needsMigration;
+    }
    ~GeneralSurface() override = default;

    void makeResident(CommandStreamReceiver &csr) override {
+        if (needsMigration) {
+            csr.getMemoryManager()->getPageFaultManager()->moveAllocationToGpuDomain(reinterpret_cast<void *>(gfxAllocation->getGpuAddress()));
+        }
        csr.makeResident(*gfxAllocation);
    };
    Surface *duplicate() override { return new GeneralSurface(gfxAllocation); };
@ -111,6 +118,7 @@ class GeneralSurface : public Surface {
    }

  protected:
+    bool needsMigration = false;
    GraphicsAllocation *gfxAllocation;
 };
 } // namespace NEO
--- a/shared/test/common/libult/ult_command_stream_receiver.h
+++ b/shared/test/common/libult/ult_command_stream_receiver.h
@ -104,6 +104,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
    using BaseClass::CommandStreamReceiver::mediaVfeStateDirty;
    using BaseClass::CommandStreamReceiver::newResources;
    using BaseClass::CommandStreamReceiver::osContext;
+    using BaseClass::CommandStreamReceiver::ownershipMutex;
    using BaseClass::CommandStreamReceiver::perfCounterAllocator;
    using BaseClass::CommandStreamReceiver::postSyncWriteOffset;
    using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator;
@ -124,6 +125,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
    using BaseClass::CommandStreamReceiver::userPauseConfirmation;
    using BaseClass::CommandStreamReceiver::waitForTaskCountAndCleanAllocationList;
    using BaseClass::CommandStreamReceiver::workPartitionAllocation;
+    ;

    UltCommandStreamReceiver(ExecutionEnvironment &executionEnvironment,
                             uint32_t rootDeviceIndex,
--- a/shared/test/common/mocks/mock_command_stream_receiver.h
+++ b/shared/test/common/mocks/mock_command_stream_receiver.h
@ -39,6 +39,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
    using CommandStreamReceiver::latestSentTaskCount;
    using CommandStreamReceiver::newResources;
    using CommandStreamReceiver::osContext;
+    using CommandStreamReceiver::ownershipMutex;
    using CommandStreamReceiver::postSyncWriteOffset;
    using CommandStreamReceiver::preemptionAllocation;
    using CommandStreamReceiver::tagAddress;
@ -159,6 +160,13 @@ class MockCommandStreamReceiver : public CommandStreamReceiver {
        return status;
    }
    void postInitFlagsSetup() override {}
+    bool isOwnershipMutexLocked() {
+        bool isLocked = !this->ownershipMutex.try_lock();
+        if (!isLocked) {
+            this->ownershipMutex.unlock();
+        }
+        return isLocked;
+    }

    static constexpr size_t tagSize = 256;
    static volatile uint32_t mockTagAddress[tagSize];
--- a/shared/test/unit_test/memory_manager/CMakeLists.txt
+++ b/shared/test/unit_test/memory_manager/CMakeLists.txt
@ -24,6 +24,7 @@ target_sources(${TARGET_NAME} PRIVATE
               ${CMAKE_CURRENT_SOURCE_DIR}/physical_address_allocator_tests.cpp
               ${CMAKE_CURRENT_SOURCE_DIR}/special_heap_pool_tests.cpp
               ${CMAKE_CURRENT_SOURCE_DIR}/storage_info_tests.cpp
+               ${CMAKE_CURRENT_SOURCE_DIR}/surface_tests.cpp
               ${CMAKE_CURRENT_SOURCE_DIR}/unified_memory_manager_cache_tests.cpp
 )

--- a/shared/test/unit_test/memory_manager/surface_tests.cpp
+++ b/shared/test/unit_test/memory_manager/surface_tests.cpp
@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2018-2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/command_stream/preemption.h"
+#include "shared/source/helpers/hw_helper.h"
+#include "shared/source/memory_manager/graphics_allocation.h"
+#include "shared/test/common/helpers/engine_descriptor_helper.h"
+#include "shared/test/common/mocks/mock_csr.h"
+#include "shared/test/common/mocks/mock_execution_environment.h"
+#include "shared/test/common/mocks/mock_graphics_allocation.h"
+#include "shared/test/common/mocks/mock_memory_manager.h"
+#include "shared/test/common/test_macros/test.h"
+#include "shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h"
+
+#include "gtest/gtest.h"
+
+#include <type_traits>
+
+using namespace NEO;
+
+// List of Surface implementations the typed SurfaceTest in this file is instantiated for.
+using SurfaceTypes = ::testing::Types<NullSurface, HostPtrSurface, GeneralSurface>;
+
+// Factory helpers that let the typed test construct any supported Surface type through one call.
+// Every specialization returns an object allocated with new; the caller is responsible for deleting it.
+namespace createSurface {
+// Primary template is declared only; each supported surface type provides an explicit specialization.
+template <typename surfType>
+Surface *create(char *hostData, GraphicsAllocation *allocation);
+
+// NullSurface is default-constructed, so both arguments are ignored.
+template <>
+Surface *create<NullSurface>(char * /*hostData*/, GraphicsAllocation * /*allocation*/) {
+    return new NullSurface;
+}
+
+// HostPtrSurface is built over the host buffer and the allocation.
+// The size 10 matches char data[10] of the SurfaceTest fixture, which is the buffer the test in this file passes.
+template <>
+Surface *create<HostPtrSurface>(char *hostData, GraphicsAllocation *allocation) {
+    return new HostPtrSurface(hostData, 10, allocation);
+}
+
+// GeneralSurface only takes the graphics allocation; the host buffer is ignored.
+template <>
+Surface *create<GeneralSurface>(char * /*hostData*/, GraphicsAllocation *allocation) {
+    return new GeneralSurface(allocation);
+}
+} // namespace createSurface
+
+// Typed fixture shared by every type in SurfaceTypes; SurfaceType is the surface under test.
+// Holds the host buffer and the graphics allocation that the test passes to createSurface::create.
+template <typename SurfaceType>
+struct SurfaceTest : ::testing::Test {
+    char data[10];
+    MockGraphicsAllocation gfxAllocation;
+};
+
+// Registers the SurfaceTest typed fixture with the SurfaceTypes type list.
+// NOTE(review): newer GoogleTest releases spell this macro TYPED_TEST_SUITE - confirm against the bundled version before renaming.
+TYPED_TEST_CASE(SurfaceTest, SurfaceTypes);
+
+// Exercises the common Surface interface (duplicate() and makeResident()) for every type in SurfaceTypes.
+// HostPtrSurface and GeneralSurface are expected to leave exactly one allocation in
+// csr->madeResidentGfxAllocations; no residency expectation is made for NullSurface.
+HWTEST_TYPED_TEST(SurfaceTest, GivenSurfaceWhenInterfaceIsUsedThenSurfaceBehavesCorrectly) {
+    int32_t execStamp = 0; // initialized so the value handed to MockCsr is well defined
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+    auto memoryManager = std::make_unique<MockMemoryManager>();
+    executionEnvironment->memoryManager.reset(memoryManager.release());
+    DeviceBitfield deviceBitfield(1);
+    auto csr = std::make_unique<MockCsr<FamilyType>>(execStamp, *executionEnvironment, 0, deviceBitfield);
+    auto hwInfo = *defaultHwInfo;
+    auto engine = HwHelper::get(hwInfo.platform.eRenderCoreFamily).getGpgpuEngineInstances(hwInfo)[0];
+    auto osContext = executionEnvironment->memoryManager->createAndRegisterOsContext(csr.get(), EngineDescriptorHelper::getDefaultDescriptor(engine, PreemptionHelper::getDefaultPreemptionMode(hwInfo)));
+    csr->setupContext(*osContext);
+
+    // Owning pointers free both surfaces even when an ASSERT below aborts the test body.
+    // They are declared after csr, so they are still destroyed before it, as the manual deletes were.
+    std::unique_ptr<Surface> surface(createSurface::create<TypeParam>(this->data,
+                                                                      &this->gfxAllocation));
+    ASSERT_NE(nullptr, surface.get());
+
+    std::unique_ptr<Surface> duplicatedSurface(surface->duplicate());
+    ASSERT_NE(nullptr, duplicatedSurface.get());
+
+    surface->makeResident(*csr);
+
+    if (std::is_same<TypeParam, HostPtrSurface>::value ||
+        std::is_same<TypeParam, GeneralSurface>::value) {
+        EXPECT_EQ(1u, csr->madeResidentGfxAllocations.size());
+    }
+}
+
+// A HostPtrSurface built with only pointer and size must report that pointer copy is not allowed.
+TEST(HostPtrSurfaceTest, givenHostPtrSurfaceWhenCreatedWithoutSpecifyingPtrCopyAllowanceThenPtrCopyIsNotAllowed) {
+    char hostBuffer[2]{};
+    HostPtrSurface hostPtrSurface(hostBuffer, sizeof(hostBuffer));
+
+    const bool ptrCopyAllowed = hostPtrSurface.peekIsPtrCopyAllowed();
+    EXPECT_FALSE(ptrCopyAllowed);
+}
+
+// A HostPtrSurface built with the third constructor argument set to true must report that pointer copy is allowed.
+TEST(HostPtrSurfaceTest, givenHostPtrSurfaceWhenCreatedWithPtrCopyAllowedThenQueryReturnsTrue) {
+    char hostBuffer[2]{};
+    HostPtrSurface hostPtrSurface(hostBuffer, sizeof(hostBuffer), true);
+
+    const bool ptrCopyAllowed = hostPtrSurface.peekIsPtrCopyAllowed();
+    EXPECT_TRUE(ptrCopyAllowed);
+}
+
+// A HostPtrSurface built with the third constructor argument set to false must report that pointer copy is not allowed.
+TEST(HostPtrSurfaceTest, givenHostPtrSurfaceWhenCreatedWithPtrCopyNotAllowedThenQueryReturnsFalse) {
+    char hostBuffer[2]{};
+    HostPtrSurface hostPtrSurface(hostBuffer, sizeof(hostBuffer), false);
+
+    const bool ptrCopyAllowed = hostPtrSurface.peekIsPtrCopyAllowed();
+    EXPECT_FALSE(ptrCopyAllowed);
+}
+
+// The GeneralSurface tests keep all their state in locals, so the plain gtest base fixture is aliased.
+using GeneralSurfaceTest = ::testing::Test;
+
+// Verifies that a GeneralSurface constructed with its second argument set to true calls
+// moveAllocationToGpuDomain on the memory manager's page fault manager exactly once when made resident.
+HWTEST_F(GeneralSurfaceTest, givenGeneralSurfaceWhenMigrationNeededThenMoveToGpuDomainCalled) {
+    int32_t execStamp = 0; // initialized so the value handed to MockCsr is well defined
+    MockGraphicsAllocation allocation;
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+
+    // Take typed raw pointers before ownership is handed over, so no cast back is needed later.
+    auto memoryManager = std::make_unique<MockMemoryManager>();
+    auto memoryManagerPtr = memoryManager.get();
+    executionEnvironment->memoryManager.reset(memoryManager.release());
+    auto pageFaultManager = std::make_unique<MockPageFaultManager>();
+    auto pageFaultManagerPtr = pageFaultManager.get();
+    memoryManagerPtr->pageFaultManager.reset(pageFaultManager.release());
+
+    DeviceBitfield deviceBitfield(1);
+    auto csr = std::make_unique<MockCsr<FamilyType>>(execStamp, *executionEnvironment, 0, deviceBitfield);
+    auto hwInfo = *defaultHwInfo;
+    auto engine = HwHelper::get(hwInfo.platform.eRenderCoreFamily).getGpgpuEngineInstances(hwInfo)[0];
+    auto osContext = executionEnvironment->memoryManager->createAndRegisterOsContext(csr.get(), EngineDescriptorHelper::getDefaultDescriptor(engine, PreemptionHelper::getDefaultPreemptionMode(hwInfo)));
+    csr->setupContext(*osContext);
+
+    GeneralSurface surface(&allocation, true);
+
+    surface.makeResident(*csr);
+    EXPECT_EQ(1, pageFaultManagerPtr->moveAllocationToGpuDomainCalled);
+}
+
+// Verifies that a GeneralSurface constructed with its second argument set to false never calls
+// moveAllocationToGpuDomain on the memory manager's page fault manager when made resident.
+HWTEST_F(GeneralSurfaceTest, givenGeneralSurfaceWhenMigrationNotNeededThenMoveToGpuDomainNotCalled) {
+    int32_t execStamp = 0; // initialized so the value handed to MockCsr is well defined
+    MockGraphicsAllocation allocation;
+    auto executionEnvironment = std::make_unique<MockExecutionEnvironment>();
+
+    // Take typed raw pointers before ownership is handed over, so no cast back is needed later.
+    auto memoryManager = std::make_unique<MockMemoryManager>();
+    auto memoryManagerPtr = memoryManager.get();
+    executionEnvironment->memoryManager.reset(memoryManager.release());
+    auto pageFaultManager = std::make_unique<MockPageFaultManager>();
+    auto pageFaultManagerPtr = pageFaultManager.get();
+    memoryManagerPtr->pageFaultManager.reset(pageFaultManager.release());
+
+    DeviceBitfield deviceBitfield(1);
+    auto csr = std::make_unique<MockCsr<FamilyType>>(execStamp, *executionEnvironment, 0, deviceBitfield);
+    auto hwInfo = *defaultHwInfo;
+    auto engine = HwHelper::get(hwInfo.platform.eRenderCoreFamily).getGpgpuEngineInstances(hwInfo)[0];
+    auto osContext = executionEnvironment->memoryManager->createAndRegisterOsContext(csr.get(), EngineDescriptorHelper::getDefaultDescriptor(engine, PreemptionHelper::getDefaultPreemptionMode(hwInfo)));
+    csr->setupContext(*osContext);
+
+    GeneralSurface surface(&allocation, false);
+
+    surface.makeResident(*csr);
+    EXPECT_EQ(0, pageFaultManagerPtr->moveAllocationToGpuDomainCalled);
+}
--- a/shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h
+++ b/shared/test/unit_test/page_fault_manager/mock_cpu_page_fault_manager.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2021 Intel Corporation
+ * Copyright (C) 2019-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -62,11 +62,16 @@ class MockPageFaultManager : public PageFaultManager {
    // Returns the address of PageFaultManager::handleGpuDomainTransferForAubAndTbx as an opaque void pointer.
    // NOTE(review): presumably used by tests to compare against the installed transfer handler - confirm against callers.
    void *getAubAndTbxHandlerAddress() {
        return reinterpret_cast<void *>(PageFaultManager::handleGpuDomainTransferForAubAndTbx);
    }
+    // Records the call in moveAllocationToGpuDomainCalled, then forwards ptr to the
+    // PageFaultManager implementation so the base behaviour is still executed.
+    void moveAllocationToGpuDomain(void *ptr) override {
+        this->moveAllocationToGpuDomainCalled += 1;
+        PageFaultManager::moveAllocationToGpuDomain(ptr);
+    }

    int allowMemoryAccessCalled = 0;
    int protectMemoryCalled = 0;
    int transferToCpuCalled = 0;
    int transferToGpuCalled = 0;
+    int moveAllocationToGpuDomainCalled = 0;
    void *transferToCpuAddress = nullptr;
    void *transferToGpuAddress = nullptr;
    void *allowedMemoryAccessAddress = nullptr;