W/A for disabling RCC RHWO for compressed media surfaces on gen12lp

Whenever a media-compressed surface is used, the RCC Read-Hit-Write optimization (RHWO) disable bit (bit 14) has to be set in the Common Slice Chicken1 register (7010h).

Related-To: NEO-4982
Change-Id: I71b91b52692252459da05b737838eb4854575974
Signed-off-by: Pawel Wilma <pawel.wilma@intel.com>
2025-12-26 07:00:17 +08:00 · 2020-10-09 12:27:32 +02:00
parent ca023fa532
commit 0c3d430f50
17 changed files with 285 additions and 3 deletions
--- a/opencl/source/command_queue/gpgpu_walker.h
+++ b/opencl/source/command_queue/gpgpu_walker.h
@@ -101,6 +101,7 @@ class GpgpuWalkerHelper {
                                               bool disablePerfMode);

    static size_t getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel);
+    static size_t getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel);

    static size_t setGpgpuWalkerThreadData(
        WALKER_TYPE<GfxFamily> *walkerCmd,
--- a/opencl/source/command_queue/gpgpu_walker_base.inl
+++ b/opencl/source/command_queue/gpgpu_walker_base.inl
@@ -171,6 +171,11 @@ size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const K
    return (size_t)0;
 }

+// Generic implementation: reserves no command-stream space for the RCC RHWO
+// workaround. pKernel is not inspected here.
+template <typename GfxFamily>
+size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) {
+    return size_t{0};
+}
+
 template <typename GfxFamily>
 size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, bool blitEnqueue, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
    size_t expectedSizeCS = 0;
--- a/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
+++ b/opencl/source/command_queue/gpgpu_walker_bdw_plus.inl
@@ -211,6 +211,7 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilin
    }
    size += PerformanceCounters::getGpuCommandsSize(commandQueue, reservePerfCounters);
    size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
+    size += GpgpuWalkerHelper<GfxFamily>::getSizeForWaDisableRccRhwoOptimization(pKernel);

    return size;
 }
--- a/opencl/source/gen12lp/gpgpu_walker_gen12lp.cpp
+++ b/opencl/source/gen12lp/gpgpu_walker_gen12lp.cpp
@@ -6,6 +6,9 @@
 */

 #include "shared/source/gen12lp/hw_info.h"
+#include "shared/source/gmm_helper/gmm.h"
+#include "shared/source/gmm_helper/resource_info.h"
+#include "shared/source/memory_manager/graphics_allocation.h"

 #include "opencl/source/command_queue/gpgpu_walker_bdw_plus.inl"
 #include "opencl/source/command_queue/hardware_interface_bdw_plus.inl"
@@ -51,6 +54,39 @@ void GpgpuWalkerHelper<TGLLPFamily>::dispatchProfilingCommandsEnd(
        args);
 }

+// Gen12LP specialization. When the kernel reports that the RCC RHWO workaround is
+// required, emits a PIPE_CONTROL with command streamer stall enabled, followed by
+// an MI_LOAD_REGISTER_IMM to register 0x7010. Emits nothing otherwise.
+// `enable` selects the value written: 0x40004000 when true, 0x40000000 when false.
+// NOTE(review): bit 14 is presumably the RHWO-disable bit and bit 30 its
+// write-enable mask in a masked register - confirm against the hardware spec.
+template <>
+void HardwareInterface<TGLLPFamily>::dispatchWorkarounds(
+    LinearStream *commandStream,
+    CommandQueue &commandQueue,
+    Kernel &kernel,
+    const bool &enable) {
+
+    if (!kernel.requiresWaDisableRccRhwoOptimization()) {
+        return;
+    }
+
+    using PIPE_CONTROL = typename TGLLPFamily::PIPE_CONTROL;
+
+    // The stalling PIPE_CONTROL goes into the stream ahead of the register write.
+    PIPE_CONTROL stallingPipeControl = TGLLPFamily::cmdInitPipeControl;
+    stallingPipeControl.setCommandStreamerStallEnable(true);
+    *commandStream->getSpaceForCmd<PIPE_CONTROL>() = stallingPipeControl;
+
+    const uint32_t registerOffset = 0x7010;
+    uint32_t registerValue = 0x40000000;
+    if (enable) {
+        registerValue = 0x40004000;
+    }
+    NEO::LriHelper<TGLLPFamily>::program(commandStream, registerOffset, registerValue, false);
+}
+
+// Gen12LP specialization: command-stream space to reserve for the RCC RHWO
+// workaround. Returns room for two PIPE_CONTROL + MI_LOAD_REGISTER_IMM pairs when
+// the kernel requires the workaround, and 0 otherwise.
+// NOTE(review): the factor of 2 presumably covers one enabling and one disabling
+// dispatchWorkarounds call per kernel dispatch - confirm against the caller.
+template <>
+size_t GpgpuWalkerHelper<TGLLPFamily>::getSizeForWaDisableRccRhwoOptimization(const Kernel *pKernel) {
+    if (pKernel->requiresWaDisableRccRhwoOptimization()) {
+        // Both command types come from TGLLPFamily, matching the rest of this file.
+        return (2 * (sizeof(TGLLPFamily::PIPE_CONTROL) + sizeof(TGLLPFamily::MI_LOAD_REGISTER_IMM)));
+    }
+    return 0u;
+}
+
 template class HardwareInterface<TGLLPFamily>;

 template class GpgpuWalkerHelper<TGLLPFamily>;
--- a/opencl/source/kernel/kernel.cpp
+++ b/opencl/source/kernel/kernel.cpp
@@ -2487,4 +2487,25 @@ uint32_t Kernel::getAdditionalKernelExecInfo() const {
    return this->additionalKernelExecInfo;
 }

+// Reports whether the RCC RHWO-disable workaround must be programmed for this kernel.
+// Returns true only when all of the following hold:
+//  - the HwHelper for the device's render core family requires the workaround,
+//  - the kernel uses shared object arguments,
+//  - at least one argument is a MemObj with a sharing handler whose graphics
+//    allocation has a GMM with the MediaCompressed resource flag set.
+bool Kernel::requiresWaDisableRccRhwoOptimization() const {
+    auto &hardwareInfo = getDevice().getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+    if (!hwHelper.isWaDisableRccRhwoOptimizationRequired() || !isUsingSharedObjArgs()) {
+        return false;
+    }
+
+    const auto rootDeviceIndex = getDevice().getRootDeviceIndex();
+    for (auto &arg : getKernelArguments()) {
+        auto clMemObj = static_cast<cl_mem>(arg.object);
+        auto memObj = castToObject<MemObj>(clMemObj);
+        if (!memObj || !memObj->peekSharingHandler()) {
+            continue;
+        }
+
+        auto allocation = memObj->getGraphicsAllocation(rootDeviceIndex);
+        if (!allocation) {
+            continue;
+        }
+
+        for (uint32_t handleId = 0u; handleId < allocation->getNumGmms(); handleId++) {
+            auto gmm = allocation->getGmm(handleId);
+            // A handle without a GMM attached is skipped instead of dereferencing nullptr.
+            if (gmm && gmm->gmmResourceInfo->getResourceFlags()->Info.MediaCompressed) {
+                return true;
+            }
+        }
+    }
+    return false;
+}
+
 } // namespace NEO
--- a/opencl/source/kernel/kernel.h
+++ b/opencl/source/kernel/kernel.h
@@ -421,6 +421,7 @@ class Kernel : public BaseObject<_cl_kernel> {
    int32_t setAdditionalKernelExecInfoWithParam(uint32_t paramName);
    void setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo);
    uint32_t getAdditionalKernelExecInfo() const;
+    MOCKABLE_VIRTUAL bool requiresWaDisableRccRhwoOptimization() const;

  protected:
    struct ObjectCounts {
--- a/opencl/test/unit_test/command_queue/enqueue_map_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/enqueue_map_buffer_tests.cpp
@@ -16,6 +16,7 @@
 #include "opencl/test/unit_test/mocks/mock_buffer.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_context.h"
+#include "opencl/test/unit_test/mocks/mock_gmm.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "test.h"

@@ -375,7 +376,12 @@ TEST_F(EnqueueMapBufferTest, givenNonReadOnlyBufferWhenMappedOnGpuThenSetValidEv

    std::unique_ptr<Buffer> buffer(Buffer::create(BufferDefaults::context, CL_MEM_READ_WRITE, 20, nullptr, retVal));
    buffer->setSharingHandler(new SharingHandler());
+    auto gfxAllocation = buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex());
+    for (auto handleId = 0u; handleId < gfxAllocation->getNumGmms(); handleId++) {
+        gfxAllocation->setGmm(new MockGmm(), handleId);
+    }
    buffer->forceDisallowCPUCopy = true;
+
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_NE(nullptr, buffer.get());

@@ -418,6 +424,10 @@ TEST_F(EnqueueMapBufferTest, givenReadOnlyBufferWhenMappedOnGpuThenSetValidEvent

    std::unique_ptr<Buffer> buffer(Buffer::create(BufferDefaults::context, CL_MEM_READ_WRITE, 20, nullptr, retVal));
    buffer->setSharingHandler(new SharingHandler());
+    auto gfxAllocation = buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex());
+    for (auto handleId = 0u; handleId < gfxAllocation->getNumGmms(); handleId++) {
+        gfxAllocation->setGmm(new MockGmm(), handleId);
+    }
    EXPECT_EQ(CL_SUCCESS, retVal);
    EXPECT_NE(nullptr, buffer.get());

--- a/opencl/test/unit_test/command_queue/gl/windows/enqueue_kernel_gl_tests_windows.cpp
+++ b/opencl/test/unit_test/command_queue/gl/windows/enqueue_kernel_gl_tests_windows.cpp
@@ -18,6 +18,7 @@
 #include "opencl/test/unit_test/mocks/gl/windows/mock_gl_sharing_windows.h"
 #include "opencl/test/unit_test/mocks/mock_buffer.h"
 #include "opencl/test/unit_test/mocks/mock_csr.h"
+#include "opencl/test/unit_test/mocks/mock_gmm.h"
 #include "opencl/test/unit_test/mocks/mock_submissions_aggregator.h"

 using namespace NEO;
@@ -26,9 +27,11 @@ typedef HelloWorldFixture<HelloWorldFixtureFactory> EnqueueKernelFixture;
 typedef Test<EnqueueKernelFixture> EnqueueKernelTest;

 TEST_F(EnqueueKernelTest, givenKernelWithSharedObjArgsWhenEnqueueIsCalledThenResetPatchAddress) {
+
    auto nonSharedBuffer = new MockBuffer;
    MockGlSharing glSharing;
-    glSharing.uploadDataToBufferInfo(1, 0);
+    MockGmm mockGmm;
+    glSharing.uploadDataToBufferInfo(1, 0, mockGmm.gmmResourceInfo->peekHandle());
    pContext->setSharingFunctions(glSharing.sharingFunctions.release());
    auto retVal = CL_SUCCESS;
    auto sharedBuffer = GlBuffer::createSharedGlBuffer(pContext, CL_MEM_READ_WRITE, 1, &retVal);
@@ -51,7 +54,7 @@ TEST_F(EnqueueKernelTest, givenKernelWithSharedObjArgsWhenEnqueueIsCalledThenRes
    EXPECT_EQ(sharedBufferGpuAddress, address1);

    // update address
-    glSharing.uploadDataToBufferInfo(1, 1);
+    glSharing.uploadDataToBufferInfo(1, 1, mockGmm.gmmResourceInfo->peekHandle());
    pCmdQ->enqueueAcquireSharedObjects(1, &sharedMem, 0, nullptr, nullptr, CL_COMMAND_ACQUIRE_GL_OBJECTS);

    callOneWorkItemNDRKernel();
--- a/opencl/test/unit_test/command_queue/multiple_map_buffer_tests.cpp
+++ b/opencl/test/unit_test/command_queue/multiple_map_buffer_tests.cpp
@@ -11,6 +11,7 @@
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
 #include "opencl/test/unit_test/mocks/mock_allocation_properties.h"
 #include "opencl/test/unit_test/mocks/mock_context.h"
+#include "opencl/test/unit_test/mocks/mock_gmm.h"
 #include "test.h"

 using namespace NEO;
@@ -97,6 +98,10 @@ struct MultipleMapBufferTest : public ClDeviceFixture, public ::testing::Test {
                                                 GraphicsAllocationHelper::toMultiGraphicsAllocation(mockAlloc), false, false, false);
        if (mapOnGpu) {
            buffer->setSharingHandler(new SharingHandler());
+            auto gfxAllocation = buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex());
+            for (auto handleId = 0u; handleId < gfxAllocation->getNumGmms(); handleId++) {
+                gfxAllocation->setGmm(new MockGmm(), handleId);
+            }
        }
        return std::unique_ptr<MockBuffer<FamilyType>>(buffer);
    }
--- a/opencl/test/unit_test/gen12lp/gpgpu_walker_tests_gen12lp.cpp
+++ b/opencl/test/unit_test/gen12lp/gpgpu_walker_tests_gen12lp.cpp
@@ -6,9 +6,14 @@
 */

 #include "shared/source/gen12lp/hw_info.h"
+#include "shared/test/unit_test/cmd_parse/hw_parse.h"

 #include "opencl/source/command_queue/gpgpu_walker.h"
 #include "opencl/source/command_queue/hardware_interface.h"
+#include "opencl/test/unit_test/command_stream/linear_stream_fixture.h"
+#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
+#include "opencl/test/unit_test/mocks/mock_command_queue.h"
+#include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "test.h"

 namespace NEO {
@@ -29,4 +34,116 @@ GEN12LPTEST_F(GpgpuWalkerTests, givenMiStoreRegMemWhenAdjustMiStoreRegMemModeThe

    EXPECT_EQ(true, cmd.getMmioRemapEnable());
 }
+
+// MockKernel variant whose requiresWaDisableRccRhwoOptimization() answer is taken
+// directly from the public waApplicable flag (false by default).
+class MockKernelWithApplicableWa : public MockKernel {
+  public:
+    bool waApplicable = false;
+
+    MockKernelWithApplicableWa(Program *program, KernelInfo &kernelInfo, ClDevice &device)
+        : MockKernel(program, kernelInfo, device) {}
+
+    bool requiresWaDisableRccRhwoOptimization() const override { return waApplicable; }
+};
+
+// Fixture combining the device and linear-stream fixtures with a mock context,
+// command queue, program and a MockKernelWithApplicableWa.
+struct HardwareInterfaceTests : public ClDeviceFixture, public LinearStreamFixture, public ::testing::Test {
+    CommandQueue *pCommandQueue = nullptr;
+    Context *pContext = nullptr;
+    MockProgram *pProgram = nullptr;
+    MockKernelWithApplicableWa *pKernel = nullptr;
+
+    void SetUp() override {
+        ClDeviceFixture::SetUp();
+        LinearStreamFixture::SetUp();
+
+        // Each object below is constructed from the ones created before it.
+        pContext = new NEO::MockContext(pClDevice);
+        pCommandQueue = new MockCommandQueue(pContext, pClDevice, nullptr);
+        pProgram = new MockProgram(*pClDevice->getExecutionEnvironment(), pContext, false, &pClDevice->getDevice());
+        pKernel = new MockKernelWithApplicableWa(static_cast<Program *>(pProgram), pProgram->mockKernelInfo, *pClDevice);
+    }
+
+    void TearDown() override {
+        // Release the objects before tearing the base fixtures down.
+        pKernel->release();
+        pProgram->release();
+        pCommandQueue->release();
+        pContext->release();
+
+        LinearStreamFixture::TearDown();
+        ClDeviceFixture::TearDown();
+    }
+};
+
+// For a kernel requiring the workaround, each dispatchWorkarounds call must emit one
+// CS-stalling PIPE_CONTROL and one MI_LOAD_REGISTER_IMM to 0x7010, carrying
+// 0x40004000 when enabling and 0x40000000 when disabling.
+GEN12LPTEST_F(HardwareInterfaceTests, GivenKernelWithApplicableWaDisableRccRhwoOptimizationWhenDispatchWorkaroundsIsCalledThenWorkaroundIsApplied) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
+
+    const size_t sizeOfOneProgramming = sizeof(PIPE_CONTROL) + sizeof(MI_LOAD_REGISTER_IMM);
+    pKernel->waApplicable = true;
+
+    // First call: enable the workaround.
+    HardwareInterface<FamilyType>::dispatchWorkarounds(&linearStream, *pCommandQueue, *pKernel, true);
+    ASSERT_EQ(sizeOfOneProgramming, linearStream.getUsed());
+
+    HardwareParse hwParse;
+    hwParse.parseCommands<FamilyType>(linearStream);
+
+    auto enablePipeControlIt = find<PIPE_CONTROL *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
+    ASSERT_NE(hwParse.cmdList.end(), enablePipeControlIt);
+    auto enablePipeControl = genCmdCast<PIPE_CONTROL *>(*enablePipeControlIt);
+    ASSERT_NE(nullptr, enablePipeControl);
+    EXPECT_TRUE(enablePipeControl->getCommandStreamerStallEnable());
+
+    auto enableLriIt = find<MI_LOAD_REGISTER_IMM *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
+    ASSERT_NE(hwParse.cmdList.end(), enableLriIt);
+    auto enableLri = genCmdCast<MI_LOAD_REGISTER_IMM *>(*enableLriIt);
+    ASSERT_NE(nullptr, enableLri);
+    EXPECT_EQ(0x7010u, enableLri->getRegisterOffset());
+    EXPECT_EQ(0x40004000u, enableLri->getDataDword());
+
+    // Second call: disable the workaround. The stream now holds two programmings.
+    HardwareInterface<FamilyType>::dispatchWorkarounds(&linearStream, *pCommandQueue, *pKernel, false);
+    ASSERT_EQ(2 * sizeOfOneProgramming, linearStream.getUsed());
+
+    // Re-parse starting at the offset where the second programming begins.
+    hwParse.TearDown();
+    hwParse.parseCommands<FamilyType>(linearStream, sizeOfOneProgramming);
+
+    auto disablePipeControlIt = find<PIPE_CONTROL *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
+    ASSERT_NE(hwParse.cmdList.end(), disablePipeControlIt);
+    auto disablePipeControl = genCmdCast<PIPE_CONTROL *>(*disablePipeControlIt);
+    ASSERT_NE(nullptr, disablePipeControl);
+    EXPECT_TRUE(disablePipeControl->getCommandStreamerStallEnable());
+
+    auto disableLriIt = find<MI_LOAD_REGISTER_IMM *>(hwParse.cmdList.begin(), hwParse.cmdList.end());
+    ASSERT_NE(hwParse.cmdList.end(), disableLriIt);
+    auto disableLri = genCmdCast<MI_LOAD_REGISTER_IMM *>(*disableLriIt);
+    ASSERT_NE(nullptr, disableLri);
+    EXPECT_EQ(0x7010u, disableLri->getRegisterOffset());
+    EXPECT_EQ(0x40000000u, disableLri->getDataDword());
+}
+
+// For a kernel that does not require the workaround, dispatchWorkarounds must leave
+// the command stream untouched for both the enabling and the disabling call.
+GEN12LPTEST_F(HardwareInterfaceTests, GivenKernelWithoutApplicableWaDisableRccRhwoOptimizationWhenDispatchWorkaroundsIsCalledThenWorkaroundIsNotApplied) {
+    bool enableWa = true;
+    pKernel->waApplicable = false;
+
+    HardwareInterface<FamilyType>::dispatchWorkarounds(&linearStream, *pCommandQueue, *pKernel, enableWa);
+    EXPECT_EQ(0u, linearStream.getUsed());
+
+    enableWa = false;
+    HardwareInterface<FamilyType>::dispatchWorkarounds(&linearStream, *pCommandQueue, *pKernel, enableWa);
+    EXPECT_EQ(0u, linearStream.getUsed());
+}
+
+// With the workaround applicable, the size query must report space for two
+// PIPE_CONTROL + MI_LOAD_REGISTER_IMM pairs.
+GEN12LPTEST_F(HardwareInterfaceTests, GivenKernelWithApplicableWaDisableRccRhwoOptimizationWhenCalculatingCommandsSizeThenAppropriateSizeIsReturned) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
+
+    pKernel->waApplicable = true;
+
+    const size_t sizeOfOneProgramming = sizeof(PIPE_CONTROL) + sizeof(MI_LOAD_REGISTER_IMM);
+    EXPECT_EQ(2 * sizeOfOneProgramming, GpgpuWalkerHelper<FamilyType>::getSizeForWaDisableRccRhwoOptimization(pKernel));
+}
+
+// With the workaround not applicable, the size query must report zero bytes.
+GEN12LPTEST_F(HardwareInterfaceTests, GivenKernelWithoutApplicableWaDisableRccRhwoOptimizationWhenCalculatingCommandsSizeThenZeroIsReturned) {
+    pKernel->waApplicable = false;
+    EXPECT_EQ(0u, GpgpuWalkerHelper<FamilyType>::getSizeForWaDisableRccRhwoOptimization(pKernel));
+}
+
 } // namespace NEO
--- a/opencl/test/unit_test/gen12lp/kernel_tests_gen12lp.inl
+++ b/opencl/test/unit_test/gen12lp/kernel_tests_gen12lp.inl
@@ -7,6 +7,7 @@

 #include "opencl/source/helpers/hardware_commands_helper.h"
 #include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
+#include "opencl/test/unit_test/mocks/mock_buffer.h"
 #include "opencl/test/unit_test/mocks/mock_kernel.h"
 #include "test.h"

@@ -18,3 +19,55 @@ GEN12LPTEST_F(Gen12LpKernelTest, givenKernelWhenCanTransformImagesIsCalledThenRe
    auto retVal = mockKernel.mockKernel->Kernel::canTransformImages();
    EXPECT_FALSE(retVal);
 }
+
+// Replaces the fixture's device with one whose render core family is Gen11 and
+// checks that a kernel created on it does not require the workaround.
+GEN12LPTEST_F(Gen12LpKernelTest, GivenKernelWhenNotRunningOnGen12lpThenWaDisableRccRhwoOptimizationIsNotRequired) {
+    HardwareInfo gen11HwInfo = hardwareInfo;
+    gen11HwInfo.platform.eRenderCoreFamily = IGFX_GEN11_CORE;
+
+    // Drop the fixture's device before creating the replacement.
+    delete pClDevice;
+    pDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(&gen11HwInfo, rootDeviceIndex);
+    ASSERT_NE(nullptr, pDevice);
+    pClDevice = new MockClDevice{pDevice};
+    ASSERT_NE(nullptr, pClDevice);
+
+    MockKernelWithInternals kernelOnGen11(*pClDevice);
+    EXPECT_FALSE(kernelOnGen11.mockKernel->requiresWaDisableRccRhwoOptimization());
+}
+
+// A freshly created kernel with no arguments set must not require the workaround.
+GEN12LPTEST_F(Gen12LpKernelTest, GivenKernelWhenNotUsingSharedObjArgsThenWaDisableRccRhwoOptimizationIsNotRequired) {
+    MockKernelWithInternals kernelWithoutSharedArgs(*pClDevice);
+    const bool waRequired = kernelWithoutSharedArgs.mockKernel->requiresWaDisableRccRhwoOptimization();
+    EXPECT_FALSE(waRequired);
+}
+
+// Sets three kernel arguments: a plain buffer, an immediate value, and a buffer with
+// a sharing handler whose GMM is flagged MediaCompressed. The last one alone must
+// make the kernel require the workaround.
+GEN12LPTEST_F(Gen12LpKernelTest, GivenKernelWhenAtLeastOneArgIsMediaCompressedThenWaDisableRccRhwoOptimizationIsRequired) {
+    MockKernelWithInternals kernel(*pClDevice);
+    auto &argInfos = kernel.kernelInfo.kernelArgInfo;
+    argInfos.resize(3);
+    argInfos.at(0).isBuffer = true;
+    argInfos.at(1).isBuffer = false;
+    argInfos.at(2).isBuffer = true;
+    for (auto &argInfo : argInfos) {
+        argInfo.kernelArgPatchInfoVector.resize(1);
+    }
+    kernel.mockKernel->initialize();
+
+    // Argument 0: buffer without a sharing handler.
+    MockBuffer plainBuffer;
+    auto plainAllocation = plainBuffer.getGraphicsAllocation(pClDevice->getRootDeviceIndex());
+    MockGmm plainGmm;
+    plainAllocation->setGmm(&plainGmm, 0);
+    cl_mem plainMem = &plainBuffer;
+    kernel.mockKernel->setArgBuffer(0, sizeof(cl_mem *), &plainMem);
+
+    // Argument 1: immediate value.
+    uint32_t immediateArg = 0;
+    kernel.mockKernel->setArgImmediate(1, sizeof(uint32_t), &immediateArg);
+
+    // Argument 2: buffer with a sharing handler and a media-compressed GMM.
+    MockBuffer bufferMediaCompressed;
+    bufferMediaCompressed.setSharingHandler(new SharingHandler());
+    auto compressedAllocation = bufferMediaCompressed.getGraphicsAllocation(pClDevice->getRootDeviceIndex());
+    MockGmm compressedGmm;
+    compressedAllocation->setGmm(&compressedGmm, 0);
+    compressedAllocation->getGmm(0)->gmmResourceInfo->getResourceFlags()->Info.MediaCompressed = 1;
+    cl_mem compressedMem = &bufferMediaCompressed;
+    kernel.mockKernel->setArgBuffer(2, sizeof(cl_mem *), &compressedMem);
+
+    EXPECT_TRUE(kernel.mockKernel->requiresWaDisableRccRhwoOptimization());
+}
--- a/opencl/test/unit_test/mem_obj/buffer_tests.cpp
+++ b/opencl/test/unit_test/mem_obj/buffer_tests.cpp
@@ -24,6 +24,7 @@
 #include "opencl/test/unit_test/mocks/mock_buffer.h"
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_execution_environment.h"
+#include "opencl/test/unit_test/mocks/mock_gmm.h"
 #include "test.h"

 using namespace NEO;
@@ -1724,6 +1725,11 @@ HWTEST_F(BufferUnmapTest, givenBufferWithSharingHandlerWhenUnmappingThenUseNonBl
    buffer->setSharingHandler(new SharingHandler());
    EXPECT_NE(nullptr, buffer->peekSharingHandler());

+    auto gfxAllocation = buffer->getGraphicsAllocation(pDevice->getRootDeviceIndex());
+    for (auto handleId = 0u; handleId < gfxAllocation->getNumGmms(); handleId++) {
+        gfxAllocation->setGmm(new MockGmm(), handleId);
+    }
+
    auto mappedPtr = clEnqueueMapBuffer(&cmdQ, buffer.get(), CL_TRUE, CL_MAP_WRITE, 0, 1, 0, nullptr, nullptr, &retVal);
    EXPECT_EQ(CL_SUCCESS, retVal);

--- a/opencl/test/unit_test/mocks/gl/windows/mock_gl_sharing_windows.h
+++ b/opencl/test/unit_test/mocks/gl/windows/mock_gl_sharing_windows.h
@@ -140,9 +140,10 @@ class MockGlSharing {
    void uploadDataToBufferInfo() {
        dllParam->loadBuffer(m_bufferInfoOutput);
    }
-    void uploadDataToBufferInfo(unsigned int sharedHandle, int bufferOffset) {
+    void uploadDataToBufferInfo(unsigned int sharedHandle, int bufferOffset, GMM_RESOURCE_INFO *gmmResInfo) {
        m_bufferInfoOutput.globalShareHandle = sharedHandle;
        m_bufferInfoOutput.bufferOffset = bufferOffset;
+        m_bufferInfoOutput.pGmmResInfo = gmmResInfo;
        dllParam->loadBuffer(m_bufferInfoOutput);
    }
    void uploadDataToTextureInfo() {
--- a/opencl/test/unit_test/sharings/gl/windows/gl_sharing_tests.cpp
+++ b/opencl/test/unit_test/sharings/gl/windows/gl_sharing_tests.cpp
@@ -32,6 +32,7 @@
 #include "opencl/test/unit_test/mocks/mock_command_queue.h"
 #include "opencl/test/unit_test/mocks/mock_context.h"
 #include "opencl/test/unit_test/mocks/mock_event.h"
+#include "opencl/test/unit_test/mocks/mock_gmm.h"
 #include "opencl/test/unit_test/mocks/mock_gmm_resource_info.h"
 #include "opencl/test/unit_test/mocks/mock_memory_manager.h"
 #include "test.h"
@@ -898,6 +899,10 @@ TEST_F(glSharingTests, givenClGLBufferWhenMapAndUnmapBufferIsCalledThenCopyOnGpu
    auto glBuffer = clCreateFromGLBuffer(&context, 0, bufferId, &retVal);
    auto buffer = castToObject<Buffer>(glBuffer);
    EXPECT_EQ(buffer->getCpuAddressForMemoryTransfer(), nullptr); // no cpu ptr
+    auto gfxAllocation = buffer->getGraphicsAllocation(rootDeviceIndex);
+    for (auto handleId = 0u; handleId < gfxAllocation->getNumGmms(); handleId++) {
+        gfxAllocation->setGmm(new MockGmm(), handleId);
+    }

    auto commandQueue = CommandQueue::create(&context, context.getDevice(0), 0, false, retVal);
    ASSERT_EQ(CL_SUCCESS, retVal);
@@ -937,6 +942,10 @@ TEST_F(glSharingTests, givenClGLBufferWhenMapAndUnmapBufferIsCalledTwiceThenReus
    auto glBuffer = clCreateFromGLBuffer(&context, 0, bufferId, &retVal);
    auto buffer = castToObject<Buffer>(glBuffer);
    EXPECT_EQ(buffer->getCpuAddressForMemoryTransfer(), nullptr); // no cpu ptr
+    auto gfxAllocation = buffer->getGraphicsAllocation(rootDeviceIndex);
+    for (auto handleId = 0u; handleId < gfxAllocation->getNumGmms(); handleId++) {
+        gfxAllocation->setGmm(new MockGmm(), handleId);
+    }

    auto commandQueue = CommandQueue::create(&context, context.getDevice(0), 0, false, retVal);
    ASSERT_EQ(CL_SUCCESS, retVal);
--- a/shared/source/gen12lp/hw_helper_gen12lp.cpp
+++ b/shared/source/gen12lp/hw_helper_gen12lp.cpp
@@ -45,6 +45,11 @@ bool HwHelperHw<Family>::isForceEmuInt32DivRemSPWARequired(const HardwareInfo &h
    return Gen12LPHelpers::isForceEmuInt32DivRemSPWARequired(hwInfo);
 }

+// Specialization for this file's Family: the RCC RHWO-disable workaround is required.
+template <>
+bool HwHelperHw<Family>::isWaDisableRccRhwoOptimizationRequired() const {
+    return true;
+}
+
 template <>
 uint32_t HwHelperHw<Family>::getComputeUnitsUsedForScratch(const HardwareInfo *pHwInfo) const {
    /* For ICL+ maxThreadCount equals (EUCount * 8).
--- a/shared/source/helpers/hw_helper.h
+++ b/shared/source/helpers/hw_helper.h
@@ -109,6 +109,7 @@ class HwHelper {
    virtual uint32_t computeSlmValues(uint32_t slmSize) = 0;

    virtual bool isForceEmuInt32DivRemSPWARequired(const HardwareInfo &hwInfo) = 0;
+    virtual bool isWaDisableRccRhwoOptimizationRequired() const = 0;
    virtual uint32_t getMinimalSIMDSize() = 0;
    virtual uint32_t getHwRevIdFromStepping(uint32_t stepping, const HardwareInfo &hwInfo) const = 0;
    virtual uint32_t getSteppingFromHwRevId(uint32_t hwRevId, const HardwareInfo &hwInfo) const = 0;
@@ -284,6 +285,8 @@ class HwHelperHw : public HwHelper {

    bool isForceEmuInt32DivRemSPWARequired(const HardwareInfo &hwInfo) override;

+    bool isWaDisableRccRhwoOptimizationRequired() const override;
+
    uint32_t getMinimalSIMDSize() override;

    uint64_t getGpuTimeStampInNS(uint64_t timeStamp, double frequency) const override;
--- a/shared/source/helpers/hw_helper_base.inl
+++ b/shared/source/helpers/hw_helper_base.inl
@@ -394,6 +394,11 @@ bool HwHelperHw<GfxFamily>::isForceEmuInt32DivRemSPWARequired(const HardwareInfo
    return false;
 }

+// Generic implementation: the RCC RHWO-disable workaround is not required.
+template <typename GfxFamily>
+bool HwHelperHw<GfxFamily>::isWaDisableRccRhwoOptimizationRequired() const {
+    return false;
+}
+
 template <typename GfxFamily>
 inline uint32_t HwHelperHw<GfxFamily>::getMinimalSIMDSize() {
    return 8u;