diff --git a/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt b/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt
index a155938df6..d4aae77712 100644
--- a/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt
+++ b/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt
@@ -1,5 +1,5 @@
 #
-# Copyright (C) 2018-2021 Intel Corporation
+# Copyright (C) 2018-2022 Intel Corporation
 #
 # SPDX-License-Identifier: MIT
 #
@@ -14,4 +14,17 @@ target_sources(igdrcl_aub_tests PRIVATE
                ${CMAKE_CURRENT_SOURCE_DIR}/aub_mi_atomic_tests.cpp
 )
 
+if(TESTS_XEHP_AND_LATER) # the two sources below join igdrcl_aub_tests only when TESTS_XEHP_AND_LATER is set
+  target_sources(igdrcl_aub_tests PRIVATE
+                 ${CMAKE_CURRENT_SOURCE_DIR}/aub_range_based_flush_tests_xehp_and_later.cpp
+                 ${CMAKE_CURRENT_SOURCE_DIR}/aub_walker_partition_tests_xehp_and_later.cpp
+  )
+endif()
+
+if(TESTS_DG2_AND_LATER) # the MI_MATH AUB test source is added only when TESTS_DG2_AND_LATER is set
+  target_sources(igdrcl_aub_tests PRIVATE
+                 ${CMAKE_CURRENT_SOURCE_DIR}/mi_math_aub_tests_dg2_and_later.cpp
+  )
+endif()
+
 add_subdirectories()
diff --git a/opencl/test/unit_test/aub_tests/command_stream/aub_range_based_flush_tests_xehp_and_later.cpp b/opencl/test/unit_test/aub_tests/command_stream/aub_range_based_flush_tests_xehp_and_later.cpp
new file mode 100644
index 0000000000..f14b658e00
--- /dev/null
+++ b/opencl/test/unit_test/aub_tests/command_stream/aub_range_based_flush_tests_xehp_and_later.cpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (C) 2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/helpers/cache_flush_xehp_and_later.inl"
+#include "shared/source/helpers/hw_helper.h"
+#include "shared/source/helpers/timestamp_packet.h"
+#include "shared/source/utilities/tag_allocator.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/helpers/dispatch_flags_helper.h"
+#include "shared/test/common/mocks/mock_device.h"
+#include "shared/test/common/test_macros/test.h"
+
+#include "opencl/source/mem_obj/buffer.h"
+#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
+#include "opencl/test/unit_test/aub_tests/fixtures/hello_world_fixture.h"
+#include "opencl/test/unit_test/helpers/cmd_buffer_validator.h"
+#include "opencl/test/unit_test/mocks/mock_command_queue.h"
+#include "opencl/test/unit_test/mocks/mock_context.h"
+
+#include "test_traits_common.h"
+
+using namespace NEO;
+
+struct RangeBasedFlushTest : public KernelAUBFixture<SimpleKernelFixture>, public ::testing::Test { // gtest fixture shared by the range-based flush tests in this file
+    // The two implicit-flush debug flags are written before the base fixture is set up.
+    void SetUp() override {
+        DebugManager.flags.PerformImplicitFlushForNewResource.set(0);
+        DebugManager.flags.PerformImplicitFlushForIdleGpu.set(0);
+        KernelAUBFixture<SimpleKernelFixture>::SetUp();
+    };
+    // Nothing fixture-local to release; only the base fixture is torn down.
+    void TearDown() override {
+        KernelAUBFixture<SimpleKernelFixture>::TearDown();
+    }
+
+    cl_int retVal = CL_SUCCESS; // the tests in this file shadow this member with a local of the same name
+    DebugManagerStateRestore debugSettingsRestore; // presumably restores DebugManager flags on destruction — confirm against debug_manager_state_restore.h
+};
+
+struct L3ControlSupportedMatcher { // passed as the matcher argument of the HWTEST2_F tests in this file
+    template <PRODUCT_FAMILY productFamily>
+    static constexpr bool isMatched() { // true only when the product supports the IGFX_XE_HP_CORE command set and its TestTraits report l3ControlSupported
+        if constexpr (HwMapper<productFamily>::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) {
+            return TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::l3ControlSupported; // TestTraits is named only inside the if constexpr branch
+        }
+        return false;
+    }
+};
+
+HWTEST2_F(RangeBasedFlushTest, givenNoDcFlushInPipeControlWhenL3ControlFlushesCachesThenExpectFlushedCaches, L3ControlSupportedMatcher) {
+    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using WALKER = typename FamilyType::WALKER_TYPE;
+    using L3_CONTROL = typename FamilyType::L3_CONTROL;
+    using L3_FLUSH_ADDRESS_RANGE = typename FamilyType::L3_FLUSH_ADDRESS_RANGE;
+    // Scenario: copy src into dst, flush the dst range with L3_CONTROL (no post-sync write), then check the command stream and dst memory.
+    DebugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0);
+
+    constexpr size_t bufferSize = MemoryConstants::pageSize;
+    char bufferAMemory[bufferSize];
+    char bufferBMemory[bufferSize];
+    for (uint32_t i = 0; i < bufferSize / MemoryConstants::pageSize; ++i) { // bufferSize is one page, so this runs once: A is filled with 1, B with 129
+        memset(bufferAMemory + i * MemoryConstants::pageSize, 1 + i, MemoryConstants::pageSize);
+        memset(bufferBMemory + i * MemoryConstants::pageSize, 129 + i, MemoryConstants::pageSize);
+    }
+
+    auto retVal = CL_INVALID_VALUE; // local that shadows the fixture member retVal
+    auto srcBuffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                                            bufferSize, bufferAMemory, retVal));
+
+    ASSERT_NE(nullptr, srcBuffer);
+    auto dstBuffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                            CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                            bufferSize, bufferBMemory, retVal));
+    ASSERT_NE(nullptr, dstBuffer);
+
+    cl_uint numEventsInWaitList = 0;
+    cl_event *eventWaitList = nullptr;
+    cl_event *event = nullptr;
+
+    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer.get(),
+                                      0, 0,
+                                      bufferSize, numEventsInWaitList,
+                                      eventWaitList, event);
+
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    // A single one-page range over the destination allocation is flushed.
+    L3RangesVec ranges;
+    ranges.push_back(L3Range::fromAddressSizeWithPolicy(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(), MemoryConstants::pageSize,
+                                                        L3_FLUSH_ADDRESS_RANGE::L3_FLUSH_EVICTION_POLICY_FLUSH_L3_WITH_EVICTION));
+    size_t requiredSize = getSizeNeededToFlushGpuCache<FamilyType>(ranges, false) + 2 * sizeof(PIPE_CONTROL); // NOTE(review): `false` presumably means "no post-sync" — confirm against cache_flush_xehp_and_later.inl
+    LinearStream &l3FlushCmdStream = pCmdQ->getCS(requiredSize);
+    auto offset = l3FlushCmdStream.getUsed(); // flushTask below is given this offset as its start position
+    auto pcBeforeFlush = l3FlushCmdStream.getSpaceForCmd<PIPE_CONTROL>();
+    *pcBeforeFlush = FamilyType::cmdInitPipeControl;
+
+    flushGpuCache<FamilyType>(&l3FlushCmdStream, ranges, 0U, device->getHardwareInfo()); // third argument is 0U here; the L3_CONTROL expected below has POST_SYNC_OPERATION_NO_WRITE
+
+    auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
+    auto flags = DispatchFlagsHelper::createDefaultDispatchFlags();
+    flags.blocking = true;
+
+    DebugManager.flags.DisableDcFlushInEpilogue.set(true);
+    csr.flushTask(l3FlushCmdStream, offset,
+                  pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::DYNAMIC_STATE, 0),
+                  pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::INDIRECT_OBJECT, 0),
+                  pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::SURFACE_STATE, 0),
+                  pCmdQ->taskLevel,
+                  flags,
+                  pCmdQ->getDevice());
+
+    std::string err;
+    // Expected sequence: anything, stalling PIPE_CONTROL without DC flush, L3_CONTROL without post-sync, optional WA commands, PIPE_CONTROL without DC flush.
+    std::vector<MatchCmd *> expectedCommands{
+        new MatchAnyCmd(AnyNumber),
+        new MatchHwCmd<FamilyType, PIPE_CONTROL>(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getCommandStreamerStallEnable, true), EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)}),
+        new MatchHwCmd<FamilyType, L3_CONTROL>(1, Expects{EXPECT_MEMBER(L3_CONTROL, getPostSyncOperation, L3_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE)}),
+    };
+    if (MemorySynchronizationCommands<FamilyType>::isPipeControlWArequired(device->getHardwareInfo())) { // the workaround adds one more PIPE_CONTROL and, when extra synchronization has non-zero size, an MI_SEMAPHORE_WAIT
+        expectedCommands.push_back(new MatchHwCmd<FamilyType, PIPE_CONTROL>(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)}));
+        if (MemorySynchronizationCommands<FamilyType>::getSizeForAdditonalSynchronization(device->getHardwareInfo()) > 0) {
+            expectedCommands.push_back(new MatchHwCmd<FamilyType, MI_SEMAPHORE_WAIT>(1, Expects{EXPECT_MEMBER(MI_SEMAPHORE_WAIT, getSemaphoreDataDword, EncodeSempahore<FamilyType>::invalidHardwareTag)}));
+        }
+    }
+    expectedCommands.push_back(new MatchHwCmd<FamilyType, PIPE_CONTROL>(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)}));
+    expectedCommands.push_back(new MatchAnyCmd(AnyNumber));
+    expectedCommands.push_back(new MatchHwCmd<FamilyType, PIPE_CONTROL>(0)); // NOTE(review): presumably asserts that no further PIPE_CONTROL follows — confirm against cmd_buffer_validator.h
+
+    auto cmdBuffOk = expectCmdBuff<FamilyType>(l3FlushCmdStream, 0, std::move(expectedCommands), &err);
+    EXPECT_TRUE(cmdBuffOk) << err;
+    // The destination must hold the source pattern (bufferAMemory).
+    expectMemory<FamilyType>(reinterpret_cast<void *>(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()),
+                             bufferAMemory, bufferSize);
+}
+
+HWTEST2_F(RangeBasedFlushTest, givenL3ControlWhenPostSyncIsSetThenExpectPostSyncWrite, L3ControlSupportedMatcher) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    using WALKER = typename FamilyType::WALKER_TYPE;
+    using L3_CONTROL = typename FamilyType::L3_CONTROL;
+    using L3_FLUSH_ADDRESS_RANGE = typename FamilyType::L3_FLUSH_ADDRESS_RANGE;
+    // Scenario: copy src into dst, flush the dst range with L3_CONTROL carrying a post-sync write, then check the command stream, dst memory and post-sync memory.
+    if (MemorySynchronizationCommands<FamilyType>::isPipeControlWArequired(device->getHardwareInfo())) { // the fixed command pattern below has no slot for the workaround commands
+        GTEST_SKIP();
+    }
+
+    constexpr size_t bufferSize = MemoryConstants::pageSize;
+    char bufferAMemory[bufferSize];
+    char bufferBMemory[bufferSize];
+    for (uint32_t i = 0; i < bufferSize / MemoryConstants::pageSize; ++i) { // bufferSize is one page, so this runs once: A is filled with 1, B with 129
+        memset(bufferAMemory + i * MemoryConstants::pageSize, 1 + i, MemoryConstants::pageSize);
+        memset(bufferBMemory + i * MemoryConstants::pageSize, 129 + i, MemoryConstants::pageSize);
+    }
+
+    auto retVal = CL_INVALID_VALUE; // local that shadows the fixture member retVal
+    auto srcBuffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                                            bufferSize, bufferAMemory, retVal));
+
+    ASSERT_NE(nullptr, srcBuffer);
+    auto dstBuffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                            CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                            bufferSize, bufferBMemory, retVal));
+    ASSERT_NE(nullptr, dstBuffer);
+    // 8-byte post-sync target, initialised from the first bytes of bufferAMemory (non-zero).
+    auto postSyncBuffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                                 CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                                 sizeof(uint64_t), bufferAMemory, retVal));
+    ASSERT_NE(nullptr, postSyncBuffer); // guard the buffer that was just created; it is dereferenced below
+
+    uint64_t expectedPostSyncData = 0; // value the post-sync memory is compared against at the end
+
+    cl_uint numEventsInWaitList = 0;
+    cl_event *eventWaitList = nullptr;
+    cl_event *event = nullptr;
+
+    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer.get(),
+                                      0, 0,
+                                      bufferSize, numEventsInWaitList,
+                                      eventWaitList, event);
+
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    // A single one-page range over the destination allocation is flushed.
+    L3RangesVec ranges;
+    ranges.push_back(L3Range::fromAddressSizeWithPolicy(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(),
+                                                        MemoryConstants::pageSize, L3_FLUSH_ADDRESS_RANGE::L3_FLUSH_EVICTION_POLICY_FLUSH_L3_WITH_EVICTION));
+    size_t requiredSize = getSizeNeededToFlushGpuCache<FamilyType>(ranges, true) + 2 * sizeof(PIPE_CONTROL); // NOTE(review): `true` presumably means "with post-sync" — confirm against cache_flush_xehp_and_later.inl
+    LinearStream &l3FlushCmdStream = pCmdQ->getCS(requiredSize);
+    auto offset = l3FlushCmdStream.getUsed(); // flushTask below is given this offset as its start position
+    auto pcBeforeFlush = l3FlushCmdStream.getSpaceForCmd<PIPE_CONTROL>();
+    *pcBeforeFlush = FamilyType::cmdInitPipeControl;
+
+    flushGpuCache<FamilyType>(&l3FlushCmdStream, ranges, postSyncBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(), device->getHardwareInfo());
+
+    auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
+    auto flags = DispatchFlagsHelper::createDefaultDispatchFlags();
+    flags.blocking = true;
+
+    DebugManager.flags.DisableDcFlushInEpilogue.set(true);
+    csr.makeResident(*postSyncBuffer->getGraphicsAllocation(rootDeviceIndex)); // the post-sync allocation is not a kernel argument, so it is made resident explicitly
+    csr.flushTask(l3FlushCmdStream, offset,
+                  pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::DYNAMIC_STATE, 0),
+                  pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::INDIRECT_OBJECT, 0),
+                  pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::SURFACE_STATE, 0),
+                  pCmdQ->taskLevel,
+                  flags,
+                  pCmdQ->getDevice());
+    // Expected sequence: anything, stalling PIPE_CONTROL without DC flush, L3_CONTROL with immediate-data post-sync, epilogue PIPE_CONTROL without DC flush.
+    std::string err;
+    auto cmdBuffOk = expectCmdBuff<FamilyType>(l3FlushCmdStream, 0,
+                                               std::vector<MatchCmd *>{
+                                                   new MatchAnyCmd(AnyNumber),
+                                                   new MatchHwCmd<FamilyType, PIPE_CONTROL>(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getCommandStreamerStallEnable, true), EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)}),
+                                                   new MatchHwCmd<FamilyType, L3_CONTROL>(1, Expects{EXPECT_MEMBER(L3_CONTROL, getPostSyncOperation, L3_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA)}),
+                                                   new MatchHwCmd<FamilyType, PIPE_CONTROL>(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)}), // epilogue
+                                                   new MatchAnyCmd(AnyNumber),
+                                                   new MatchHwCmd<FamilyType, PIPE_CONTROL>(0),
+                                               },
+                                               &err);
+    EXPECT_TRUE(cmdBuffOk) << err;
+    // The destination must hold the source pattern (bufferAMemory).
+    expectMemory<FamilyType>(reinterpret_cast<void *>(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()),
+                             bufferAMemory, bufferSize);
+    // The post-sync location must have been overwritten with expectedPostSyncData (0).
+    expectMemory<FamilyType>(reinterpret_cast<void *>(postSyncBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()),
+                             &expectedPostSyncData, sizeof(expectedPostSyncData));
+}
diff --git a/opencl/test/unit_test/aub_tests/command_stream/aub_walker_partition_tests_xehp_and_later.cpp b/opencl/test/unit_test/aub_tests/command_stream/aub_walker_partition_tests_xehp_and_later.cpp
new file mode 100644
index 0000000000..3b85ada775
--- /dev/null
+++ b/opencl/test/unit_test/aub_tests/command_stream/aub_walker_partition_tests_xehp_and_later.cpp
@@ -0,0 +1,1198 @@
+/*
+ * Copyright (C) 2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/command_container/walker_partition_xehp_and_later.h"
+#include "shared/source/helpers/array_count.h"
+#include "shared/source/helpers/basic_math.h"
+#include "shared/source/helpers/timestamp_packet.h"
+#include "shared/source/utilities/tag_allocator.h"
+#include "shared/test/common/cmd_parse/hw_parse.h"
+#include "shared/test/common/helpers/debug_manager_state_restore.h"
+#include "shared/test/common/helpers/dispatch_flags_helper.h"
+#include "shared/test/common/test_macros/test.h"
+
+#include "opencl/source/event/event.h"
+#include "opencl/source/mem_obj/buffer.h"
+#include "opencl/test/unit_test/aub_tests/command_stream/aub_command_stream_fixture.h"
+#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
+#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
+#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
+#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
+#include "opencl/test/unit_test/indirect_heap/indirect_heap_fixture.h"
+
+using namespace NEO;
+using namespace WalkerPartition;
+
+static int32_t testPartitionCount[] = {1, 2, 4, 8, 16}; // NOTE(review): this and the two tables below are not used in the visible part of the file — presumably fed to the parameterized test instantiation; confirm
+static int32_t testPartitionType[] = {1, 2, 3};
+static uint32_t testWorkingDimensions[] = {3};
+
+extern bool generateRandomInput; // defined elsewhere; when true the fixtures below use a larger buffer and rand()-generated dispatch parameters
+
+struct DispatchParamters { // one global/local work-size pair per test case, three dimensions each
+    size_t globalWorkSize[3];
+    size_t localWorkSize[3];
+} DispatchParamtersForTests[] = {
+    {{12, 25, 21}, {3, 5, 7}}, // in every entry each global size is a multiple of the matching local size
+    {{8, 16, 20}, {8, 4, 2}},
+    {{7, 13, 17}, {1, 1, 1}},
+};
+
+struct AubWalkerPartitionFixture : public KernelAUBFixture<SimpleKernelFixture> { // shared state and helpers for the walker-partition AUB tests
+    void SetUp() override { // sets up the base fixture and binds a zero-filled destination buffer as argument 0 of kernels[5]
+        debugRestorer = std::make_unique<DebugManagerStateRestore>();
+        DebugManager.flags.EnableTimestampPacket.set(1);
+        kernelIds |= (1 << 5); // NOTE(review): bit 5 presumably requests the kernel used below as kernels[5] — confirm against SimpleKernelFixture
+        KernelAUBFixture<SimpleKernelFixture>::SetUp();
+        // 16 KB destination by default, 16000 KB when random inputs are generated.
+        size_t userMemorySize = 16 * MemoryConstants::kiloByte;
+        if (generateRandomInput) {
+            userMemorySize = 16000 * MemoryConstants::kiloByte;
+        }
+
+        sizeUserMemory = userMemorySize;
+        auto destMemory = alignedMalloc(sizeUserMemory, 4096);
+        ASSERT_NE(nullptr, destMemory);
+        memset(destMemory, 0x0, sizeUserMemory);
+
+        dstBuffer.reset(Buffer::create(context, CL_MEM_COPY_HOST_PTR, sizeUserMemory, destMemory, retVal));
+        alignedFree(destMemory); // release the staging memory before the fatal assert so a failed create does not leak it
+        ASSERT_NE(nullptr, dstBuffer);
+
+        kernels[5]->setArg(0, dstBuffer.get());
+    }
+
+    void TearDown() override { // flushes the command queue before the base fixture is torn down
+        pCmdQ->flush();
+
+        KernelAUBFixture<SimpleKernelFixture>::TearDown();
+    }
+    template <typename FamilyType>
+    void validatePartitionProgramming(uint64_t postSyncAddress, int32_t partitionCount) { // checks the single walker's partition fields and post-sync, then the kernel output in dstBuffer
+        using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
+        using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
+        uint32_t totalWorkgroupCount = 1u;
+        uint32_t totalWorkItemsInWorkgroup = 1u;
+        uint32_t totalWorkItemsCount = 1;
+        // Totals are products over the active dimensions only.
+        for (auto dimension = 0u; dimension < workingDimensions; dimension++) {
+            totalWorkgroupCount *= static_cast<uint32_t>(dispatchParamters.globalWorkSize[dimension] / dispatchParamters.localWorkSize[dimension]);
+            totalWorkItemsInWorkgroup *= static_cast<uint32_t>(dispatchParamters.localWorkSize[dimension]);
+            totalWorkItemsCount *= static_cast<uint32_t>(dispatchParamters.globalWorkSize[dimension]);
+        }
+        // partitionType is 1-based and selects the dimension whose workgroups are split between partitions.
+        const uint32_t workgroupCount = static_cast<uint32_t>(dispatchParamters.globalWorkSize[partitionType - 1] / dispatchParamters.localWorkSize[partitionType - 1]);
+        auto partitionSize = Math::divideAndRoundUp(workgroupCount, partitionCount);
+
+        if (static_cast<uint32_t>(partitionType) > workingDimensions) { // partitioning along a dimension that is not dispatched
+            partitionSize = 1;
+        }
+
+        hwParser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
+
+        uint32_t walkersCount = hwParser.getCommandCount<WALKER_TYPE>();
+        ASSERT_EQ(walkersCount, 1u); // fatal: the first walker is dereferenced right below, which is invalid on an empty list
+        GenCmdList walkerList = hwParser.getCommandsList<WALKER_TYPE>();
+        WALKER_TYPE *walkerCmd = static_cast<WALKER_TYPE *>(*walkerList.begin());
+        EXPECT_EQ(0u, walkerCmd->getPartitionId());
+        if (partitionCount > 1) {
+            EXPECT_TRUE(walkerCmd->getWorkloadPartitionEnable());
+            EXPECT_EQ(partitionSize, walkerCmd->getPartitionSize());
+            EXPECT_EQ(partitionType, walkerCmd->getPartitionType());
+        } else {
+            EXPECT_FALSE(walkerCmd->getWorkloadPartitionEnable());
+            EXPECT_EQ(0u, walkerCmd->getPartitionSize());
+            EXPECT_EQ(0u, walkerCmd->getPartitionType());
+        }
+
+        EXPECT_EQ(FamilyType::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation());
+        EXPECT_EQ(postSyncAddress, walkerCmd->getPostSync().getDestinationAddress());
+
+        int notExpectedValue[] = {1, 1, 1, 1};
+        // One 16-byte post-sync slot per partition; each slot must differ from the {1, 1, 1, 1} pattern.
+        for (auto partitionId = 0; partitionId < DebugManager.flags.ExperimentalSetWalkerPartitionCount.get(); partitionId++) {
+            expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(postSyncAddress), &notExpectedValue, sizeof(notExpectedValue));
+            postSyncAddress += 16; //next post sync needs to be right after the previous one
+        }
+        // Output layout checked here: dword 0 is the total work-item count, followed by one dword per workgroup.
+        auto dstGpuAddress = reinterpret_cast<void *>(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
+        expectMemory<FamilyType>(dstGpuAddress, &totalWorkItemsCount, sizeof(uint32_t));
+        auto groupSpecificWorkCounts = ptrOffset(dstGpuAddress, 4);
+        StackVec<uint32_t, 8> workgroupCounts;
+        workgroupCounts.resize(totalWorkgroupCount);
+
+        for (uint32_t workgroupId = 0u; workgroupId < totalWorkgroupCount; workgroupId++) {
+            workgroupCounts[workgroupId] = totalWorkItemsInWorkgroup;
+        }
+
+        expectMemory<FamilyType>(groupSpecificWorkCounts, workgroupCounts.begin(), workgroupCounts.size() * sizeof(uint32_t));
+    }
+
+    template <typename FamilyType>
+    typename FamilyType::PIPE_CONTROL *retrieveSyncPipeControl(void *startAddress, // returns the PIPE_CONTROL located after any workaround commands at startAddress, or whatever genCmdCast yields on a mismatch
+                                                               const HardwareInfo &hwInfo) {
+        using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+
+        uint8_t buffer[256];
+        LinearStream stream(buffer, 256);
+        MemorySynchronizationCommands<FamilyType>::addPipeControlWA(stream, 0ull, hwInfo); // emitted into a scratch stream only to measure how many bytes the workaround occupies
+        void *syncPipeControlAddress = reinterpret_cast<void *>(reinterpret_cast<size_t>(startAddress) + stream.getUsed());
+        PIPE_CONTROL *pipeControl = genCmdCast<PIPE_CONTROL *>(syncPipeControlAddress);
+        return pipeControl;
+    }
+
+    std::unique_ptr<DebugManagerStateRestore> debugRestorer;
+    std::unique_ptr<Buffer> dstBuffer;
+    size_t sizeUserMemory = 0;
+
+    cl_uint workingDimensions = 1;
+    int32_t partitionCount = 0; // derived fixtures overwrite these in their SetUp; initialised so they never hold indeterminate values
+    int32_t partitionType = 0;
+
+    HardwareParse hwParser;
+    DispatchParamters dispatchParamters = {};
+};
+
+struct AubWalkerPartitionTest : public AubWalkerPartitionFixture,
+                                public ::testing::TestWithParam<std::tuple<int32_t, int32_t, DispatchParamters, uint32_t>> { // tuple order: partition count, partition type, dispatch sizes, working dimensions
+    void SetUp() override {
+        AubWalkerPartitionFixture::SetUp();
+        std::tie(partitionCount, partitionType, dispatchParamters, workingDimensions) = GetParam();
+        // When generateRandomInput is set, the tuple values above are replaced with rand()-based ones.
+        if (generateRandomInput) {
+            workingDimensions = (rand() % 3 + 1); // 1..3
+            partitionType = (rand() % 3 + 1);     // 1..3
+            partitionCount = rand() % 16 + 1;     // 1..16
+
+            //now generate dimensions that makes sense
+            auto goodWorkingSizeGenerated = false;
+            while (!goodWorkingSizeGenerated) { // retry until the workgroup size over the active dimensions is at most 1024
+                dispatchParamters.localWorkSize[0] = rand() % 128 + 1;
+                dispatchParamters.localWorkSize[1] = rand() % 128 + 1;
+                dispatchParamters.localWorkSize[2] = rand() % 128 + 1;
+                auto totalWorkItemsInWorkgroup = 1;
+                for (auto dimension = 0u; dimension < workingDimensions; dimension++) {
+                    totalWorkItemsInWorkgroup *= static_cast<uint32_t>(dispatchParamters.localWorkSize[dimension]);
+                }
+                if (totalWorkItemsInWorkgroup <= 1024) {
+                    dispatchParamters.globalWorkSize[0] = dispatchParamters.localWorkSize[0] * (rand() % 32 + 1); // global size is 1..32 workgroups per dimension
+                    dispatchParamters.globalWorkSize[1] = dispatchParamters.localWorkSize[1] * (rand() % 32 + 1);
+                    dispatchParamters.globalWorkSize[2] = dispatchParamters.localWorkSize[2] * (rand() % 32 + 1);
+
+                    printf("\n generated following dispatch paramters work dim %u gws %zu %zu %zu lws %zu %zu %zu, partition type %d partitionCount %d",
+                           workingDimensions,
+                           dispatchParamters.globalWorkSize[0],
+                           dispatchParamters.globalWorkSize[1],
+                           dispatchParamters.globalWorkSize[2],
+                           dispatchParamters.localWorkSize[0],
+                           dispatchParamters.localWorkSize[1],
+                           dispatchParamters.localWorkSize[2],
+                           partitionType,
+                           partitionCount);
+                    fflush(stdout);
+                    goodWorkingSizeGenerated = true;
+                }
+            };
+        }
+        // Publish the chosen partitioning through debug flags and enable walker partitioning.
+        DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(partitionCount);
+        DebugManager.flags.ExperimentalSetWalkerPartitionType.set(partitionType);
+        DebugManager.flags.EnableWalkerPartition.set(1u);
+    }
+    void TearDown() override { // only the base fixture is torn down
+        AubWalkerPartitionFixture::TearDown();
+    }
+};
+
+struct AubWalkerPartitionZeroFixture : public AubWalkerPartitionFixture { // fixture with partition count/type forced to 0 plus a private command stream and helper surface
+    void SetUp() override {
+        AubWalkerPartitionFixture::SetUp();
+
+        partitionCount = 0;
+        partitionType = 0;
+
+        workingDimensions = 1;
+
+        DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(0);
+        DebugManager.flags.ExperimentalSetWalkerPartitionType.set(0);
+        // Two one-page allocations with the same COMMAND_BUFFER properties: one backs taskStream, the other is a zeroed scratch surface.
+        commandBufferProperties = std::make_unique<AllocationProperties>(device->getRootDeviceIndex(), true, MemoryConstants::pageSize, GraphicsAllocation::AllocationType::COMMAND_BUFFER, false, device->getDeviceBitfield());
+        auto memoryManager = this->device->getMemoryManager();
+        streamAllocation = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties);
+        helperSurface = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties);
+        memset(helperSurface->getUnderlyingBuffer(), 0, MemoryConstants::pageSize);
+        taskStream = std::make_unique<LinearStream>(streamAllocation);
+    }
+    void TearDown() override { // frees both allocations made in SetUp, then tears down the base fixture
+        auto memoryManager = this->device->getMemoryManager();
+        memoryManager->freeGraphicsMemory(streamAllocation);
+        memoryManager->freeGraphicsMemory(helperSurface);
+        AubWalkerPartitionFixture::TearDown();
+    }
+    // Submits taskStream from offset 0 with helperSurface resident, then flushes batched submissions.
+    void flushStream() {
+        DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
+        dispatchFlags.guardCommandBufferWithPipeControl = true;
+
+        csr->makeResident(*helperSurface);
+        csr->flushTask(*taskStream, 0,
+                       csr->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
+                       csr->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
+                       csr->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
+                       0u, dispatchFlags, device->getDevice());
+
+        csr->flushBatchedSubmissions();
+    }
+    std::unique_ptr<LinearStream> taskStream; // built over streamAllocation in SetUp
+    GraphicsAllocation *streamAllocation = nullptr;
+    GraphicsAllocation *helperSurface = nullptr; // zero-filled in SetUp; the tests read and write it through both its CPU pointer and its GPU address
+    std::unique_ptr<AllocationProperties> commandBufferProperties;
+};
+
+using AubWalkerPartitionZeroTest = Test<AubWalkerPartitionZeroFixture>; // test-suite type used by the HWCMDTEST_F cases below
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, whenPartitionCountSetToZeroThenProvideEqualSingleWalker) {
+    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
+    using PARTITION_TYPE = typename FamilyType::WALKER_TYPE::PARTITION_TYPE;
+    // Scenario: with partition count and type both 0, a 1D enqueue of kernels[5] must produce exactly one walker with all partition fields 0.
+    size_t globalWorkOffset[3] = {0, 0, 0};
+    cl_uint numEventsInWaitList = 0;
+    cl_event *eventWaitList = nullptr;
+    cl_event *event = nullptr;
+    size_t gwsSize[] = {128, 1, 1};
+    size_t lwsSize[] = {32, 1, 1};
+
+    auto retVal = pCmdQ->enqueueKernel(
+        kernels[5].get(),
+        workingDimensions,
+        globalWorkOffset,
+        gwsSize,
+        lwsSize,
+        numEventsInWaitList,
+        eventWaitList,
+        event);
+    ASSERT_EQ(CL_SUCCESS, retVal);
+
+    pCmdQ->flush();
+
+    auto cmdPartitionType = static_cast<PARTITION_TYPE>(partitionType);
+    uint32_t cmdPartitionCount = static_cast<uint32_t>(partitionCount); // 0, as set by the fixture
+
+    hwParser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
+    uint32_t walkersCount = hwParser.getCommandCount<WALKER_TYPE>();
+    EXPECT_EQ(cmdPartitionCount + 1, walkersCount); // 0 + 1: a single walker is expected
+
+    GenCmdList walkerList = hwParser.getCommandsList<WALKER_TYPE>();
+    EXPECT_EQ(walkersCount, static_cast<uint32_t>(walkerList.size()));
+
+    uint32_t i = 0;
+    for (GenCmdList::iterator walker = walkerList.begin(); walker != walkerList.end(); ++walker, ++i) { // i is incremented but never read
+        WALKER_TYPE *walkerCmd = static_cast<WALKER_TYPE *>(*walker);
+        EXPECT_EQ(cmdPartitionCount, walkerCmd->getPartitionId());
+        EXPECT_EQ(cmdPartitionType, walkerCmd->getPartitionType());
+        EXPECT_EQ(cmdPartitionCount, walkerCmd->getPartitionSize());
+    }
+    // Output check: the first dword is compared with the first 4 bytes of the size_t global size; the following dwords are per-workgroup counts.
+    auto dstGpuAddress = reinterpret_cast<void *>(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
+    expectMemory<FamilyType>(dstGpuAddress, &gwsSize[workingDimensions - 1], sizeof(uint32_t));
+
+    const uint32_t workgroupCount = static_cast<uint32_t>(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]); // 128 / 32 = 4
+    auto groupSpecificWorkCounts = ptrOffset(dstGpuAddress, 4);
+    StackVec<uint32_t, 8> workgroupCounts;
+    workgroupCounts.resize(workgroupCount);
+
+    for (uint32_t workgroupId = 0u; workgroupId < workgroupCount; workgroupId++) {
+        workgroupCounts[workgroupId] = static_cast<uint32_t>(lwsSize[workingDimensions - 1]);
+    }
+
+    expectMemory<FamilyType>(groupSpecificWorkCounts, workgroupCounts.begin(), workgroupCounts.size() * sizeof(uint32_t));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, whenPipeControlIsBeingEmittedWithPartitionBitSetThenMultipleFieldsAreBeingUpdatedWithValue) {
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+    // Scenario: with the partition-id register set to 1 and the address-offset register set to 8, a partition-offset-enabled post-sync write must land 8 bytes past writeAddress.
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto partitionId = 1u;
+    auto writeSize = 8u;
+    auto miAddressOffset = WalkerPartition::addressOffsetCCSOffset;
+    auto wparidOffset = WalkerPartition::wparidCCSOffset;
+    uint64_t writeValue = 7llu;
+
+    uint32_t totalBytesProgrammed = 0u;
+    auto streamCpuPointer = taskStream->getSpace(0); // zero-size request: obtains the current write pointer without reserving space
+
+    WalkerPartition::programRegisterWithValue<FamilyType>(streamCpuPointer, wparidOffset, totalBytesProgrammed, partitionId);
+    WalkerPartition::programRegisterWithValue<FamilyType>(streamCpuPointer, miAddressOffset, totalBytesProgrammed, writeSize);
+    taskStream->getSpace(totalBytesProgrammed); // NOTE(review): presumably reserves the bytes already written through the raw pointer — confirm the helpers update totalBytesProgrammed
+
+    void *pipeControlAddress = taskStream->getSpace(0);
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+    // Locate the PIPE_CONTROL just emitted (past any workaround commands) and patch the partition-offset bit in place.
+    auto pipeControl = retrieveSyncPipeControl<FamilyType>(pipeControlAddress, device->getHardwareInfo());
+    ASSERT_NE(nullptr, pipeControl);
+    pipeControl->setWorkloadPartitionIdOffsetEnable(true);
+
+    flushStream();
+
+    expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, 4u); // only the first 4 bytes of writeValue are compared
+    //write needs to happen after 8 bytes
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress + 8), &writeValue, 4u);
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenAtomicOperationDecOnLocalMemoryWhenItIsExecuteThenOperationUpdatesMemory) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto cpuAddress = reinterpret_cast<int *>(helperSurface->getUnderlyingBuffer());
+    *cpuAddress = 10; // seed value written through the CPU pointer
+    // Program one 4-byte atomic decrement on the helper surface's GPU address.
+    auto streamCpuPointer = taskStream->getSpace(0);
+    uint32_t totalBytesProgrammed = 0u;
+    uint32_t expectedValue = 9u; // 10 - 1
+    WalkerPartition::programMiAtomic<FamilyType>(streamCpuPointer, totalBytesProgrammed, writeAddress, false, WalkerPartition::MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_DECREMENT);
+    taskStream->getSpace(totalBytesProgrammed); // NOTE(review): presumably reserves the bytes already written through the raw pointer — confirm
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &expectedValue, sizeof(expectedValue));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenAtomicOperationIncOnLocalMemoryWhenItIsExecuteThenOperationUpdatesMemory) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto cpuAddress = reinterpret_cast<int *>(helperSurface->getUnderlyingBuffer());
+    *cpuAddress = 10; // seed value written through the CPU pointer
+    // Program one 4-byte atomic increment on the helper surface's GPU address.
+    auto streamCpuPointer = taskStream->getSpace(0);
+    uint32_t totalBytesProgrammed = 0u;
+    uint32_t expectedValue = 11u; // 10 + 1
+    WalkerPartition::programMiAtomic<FamilyType>(streamCpuPointer, totalBytesProgrammed, writeAddress, false, WalkerPartition::MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
+    taskStream->getSpace(totalBytesProgrammed); // NOTE(review): presumably reserves the bytes already written through the raw pointer — confirm
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &expectedValue, sizeof(expectedValue));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenVariousCompareModesWhenConditionalBatchBufferEndIsEmittedItThenHandlesCompareCorrectly) {
+    using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END;
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; // NOTE(review): alias is not referenced in this test body
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto compareAddress = reinterpret_cast<int *>(helperSurface->getUnderlyingBuffer()); // CPU pointer to the start of the same helper surface
+    // Commands are emitted once below; every case then patches them through the retained pointers and calls flushStream() again.
+    auto conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(taskStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setCompareAddress(writeAddress);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+
+    writeAddress += sizeof(uint64_t);
+    uint32_t writeValue = 7u;
+    uint32_t pipeControlNotExecutedValue = 0u;
+
+    // post-sync pipe control placed after the conditional end; its immediate write shows whether execution continued
+    void *pipeControlAddress = taskStream->getSpace(0);
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+
+    auto pipeControl = retrieveSyncPipeControl<FamilyType>(pipeControlAddress, device->getHardwareInfo());
+    ASSERT_NE(nullptr, pipeControl);
+    auto programPipeControl = [&]() { // re-patches the pipe control with the current writeValue / writeAddress
+        pipeControl->setImmediateData(writeValue);
+        pipeControl->setAddress(static_cast<uint32_t>(writeAddress & 0x0000FFFFFFFFULL));
+        pipeControl->setAddressHigh(static_cast<uint32_t>(writeAddress >> 32));
+    };
+
+    // the command buffer now holds a conditional batch buffer end followed by a pipe control that reveals whether the end was taken
+
+    // MAD_GREATER_THAN_IDD: continue if the indirectly fetched data is greater than the inline data.
+    // continue case: memory holds 11, inline data is 10, so the pipe control write is expected to land
+    conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_GREATER_THAN_IDD);
+    *compareAddress = 11;
+    auto inlineData = 10u;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+    // terminate case: 10 vs 10; the next qword slot must still read 0 (assumes the helper surface starts zeroed - TODO confirm)
+    *compareAddress = 10;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue));
+
+    // MAD_GREATER_THAN_OR_EQUAL_IDD: continue if the indirectly fetched data is greater than or equal to the inline data.
+
+    // continue case (greater): memory holds 11, inline data is 10
+    conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_GREATER_THAN_OR_EQUAL_IDD);
+    *compareAddress = 11;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+
+    // continue case (equal): memory holds 10, inline data is 10
+    *compareAddress = 10;
+    inlineData = 10u;
+
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+
+    // terminate case: memory holds 9, inline data is 10; the slot is expected to stay 0
+    *compareAddress = 9;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue));
+
+    // MAD_LESS_THAN_IDD: continue if the indirectly fetched data is less than the inline data.
+
+    // continue case: memory holds 9, inline data is 10
+    conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_LESS_THAN_IDD);
+    *compareAddress = 9;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+
+    // terminate case: memory holds 10, inline data is 10; the slot is expected to stay 0
+    *compareAddress = 10;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue));
+
+    // MAD_LESS_THAN_OR_EQUAL_IDD: continue if the indirectly fetched data is less than or equal to the inline data.
+
+    // continue case (less): memory holds 9, inline data is 10
+    conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_LESS_THAN_OR_EQUAL_IDD);
+    *compareAddress = 9;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+
+    // continue case (equal): memory holds 10, inline data is 10
+    *compareAddress = 10;
+    inlineData = 10u;
+
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+
+    // terminate case: memory holds 11, inline data is 10; the slot is expected to stay 0
+    *compareAddress = 11;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue));
+
+    // MAD_EQUAL_IDD: continue if the indirectly fetched data is equal to the inline data.
+
+    // continue case: memory holds 10, inline data is 10
+    conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_EQUAL_IDD);
+    *compareAddress = 10;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+
+    // terminate case: memory holds 0, inline data is 10; the slot is expected to stay 0
+    *compareAddress = 0;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue));
+
+    // MAD_NOT_EQUAL_IDD: continue if the indirectly fetched data is not equal to the inline data.
+
+    // continue case: memory holds 11, inline data is 10
+    conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_NOT_EQUAL_IDD);
+    *compareAddress = 11;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+
+    // terminate case: memory holds 10, inline data is 10; the slot is expected to stay 0
+    *compareAddress = 10;
+    inlineData = 10u;
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+
+    conditionalBatchBufferEnd->setCompareDataDword(inlineData);
+    programPipeControl();
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue));
+}
+template <bool enableNesting>
+struct MultiLevelBatchAubFixture : public AubWalkerPartitionZeroFixture { // base fixture plus a second- and third-level batch allocation, each wrapped in a LinearStream
+    void SetUp() override {
+        if (enableNesting) {
+            // turn on batch buffer nesting through the AubDumpAddMmioRegistersList debug flag
+            DebugManager.flags.AubDumpAddMmioRegistersList.set(
+                "0x1A09C;0x10001000");
+        } else {
+            // turn off batch buffer nesting through the same debug flag
+            DebugManager.flags.AubDumpAddMmioRegistersList.set(
+                "0x1A09C;0x10000000");
+        }
+        AubWalkerPartitionZeroFixture::SetUp(); // base SetUp runs only after the flag has been set
+        auto memoryManager = this->device->getMemoryManager();
+        secondLevelBatch = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties);
+        thirdLevelBatch = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties);
+        secondLevelBatchStream = std::make_unique<LinearStream>(secondLevelBatch);
+        thirdLevelBatchStream = std::make_unique<LinearStream>(thirdLevelBatch);
+    };
+    void TearDown() override {
+        debugRestorer.reset(nullptr);
+        DebugManager.flags.AubDumpAddMmioRegistersList.getRef() = "unk"; // NOTE(review): presumably swaps in a short string so the storage can be released below - confirm
+        DebugManager.flags.AubDumpAddMmioRegistersList.getRef().shrink_to_fit();
+
+        auto memoryManager = this->device->getMemoryManager();
+        memoryManager->freeGraphicsMemory(thirdLevelBatch);
+        memoryManager->freeGraphicsMemory(secondLevelBatch);
+        // the extra allocations are freed before the base fixture is torn down
+        AubWalkerPartitionZeroFixture::TearDown();
+    };
+
+    std::unique_ptr<LinearStream> secondLevelBatchStream; // writes into secondLevelBatch
+    std::unique_ptr<LinearStream> thirdLevelBatchStream;  // writes into thirdLevelBatch
+
+    GraphicsAllocation *secondLevelBatch = nullptr; // allocated in SetUp, freed in TearDown
+    GraphicsAllocation *thirdLevelBatch = nullptr;  // allocated in SetUp, freed in TearDown
+};
+
+using MultiLevelBatchTestsWithNesting = Test<MultiLevelBatchAubFixture<true>>; // fixture variant that takes the "nesting on" branch in SetUp
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithNesting, givenConditionalBatchBufferEndWhenItExitsThirdLevelCommandBufferThenSecondLevelBatchIsResumed) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto compareAddress = writeAddress;
+    // Layout: task stream -> second-level batch -> third-level batch; results go to consecutive qwords after the compared one.
+    using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END;
+    using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; // NOTE(review): alias is not referenced in this test body
+
+    // task stream: nested jump into the second-level batch
+    auto batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(taskStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress());
+    batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED);
+
+    // second-level batch: nested jump into the third-level batch
+    batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress());
+    batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED);
+    // third-level batch: a single conditional end with EndCurrentBatchBufferLevel set to 1
+    auto conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1);
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+
+    writeAddress += sizeof(uint64_t);
+    auto writeValue = 7u;
+
+    // second-level pipe control placed right after the jump; it is expected to execute (writes 7 to the first result slot)
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+    // second-level batch: conditional end, again with EndCurrentBatchBufferLevel set to 1
+    conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+    // task stream: pipe control after the jump, writing 8 to the second result slot
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+
+    csr->makeResident(*secondLevelBatch);
+    csr->makeResident(*thirdLevelBatch);
+    flushStream();
+    // rewind to the first result slot and check both writes landed: 7, then 8
+    writeAddress = helperSurface->getGpuAddress() + sizeof(uint64_t);
+    writeValue = 7u;
+
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithNesting, givenConditionalBatchBufferEndWhenItExitsToTheRingThenAllCommandBufferLevelsAreSkipped) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto compareAddress = writeAddress;
+    // Layout: task stream -> second-level batch -> third-level batch; results go to consecutive qwords after the compared one.
+    using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END;
+    using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; // NOTE(review): alias is not referenced in this test body
+
+    // task stream: nested jump into the second-level batch
+    auto batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(taskStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress());
+    batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED);
+
+    // second-level batch: nested jump into the third-level batch
+    batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress());
+    batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED);
+    // third-level batch: a single conditional end with EndCurrentBatchBufferLevel set to 0
+    auto conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(0);
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+
+    writeAddress += sizeof(uint64_t);
+    auto writeValue = 7u;
+
+    // second-level pipe control placed right after the jump; it should NOT be executed
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+    // second-level batch: conditional end with EndCurrentBatchBufferLevel set to 1
+    conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+    // task stream: pipe control after the jump, targeting the second result slot
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+
+    csr->makeResident(*secondLevelBatch);
+    csr->makeResident(*thirdLevelBatch);
+    flushStream();
+    // rewind to the first result slot; both slots are compared against 0
+    writeAddress = helperSurface->getGpuAddress() + sizeof(uint64_t);
+    writeValue = 0u;
+
+    // neither pipe control is expected to have run (assumes the helper surface starts zeroed - TODO confirm)
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+    writeAddress += sizeof(uint64_t);
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithNesting, givenCommandBufferCacheOnWhenBatchBufferIsExecutedThenItWorksCorrectly) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto writeValue = 7u;
+    // Jumps into the second-level batch with the command cache bit set and checks that its pipe control write lands.
+    using BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
+    using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+
+    // task stream: nested jump into the second-level batch, with setEnableCommandCache(1u)
+    auto batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(taskStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress());
+    batchBufferStart->setEnableCommandCache(1u);
+    batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED);
+
+    // second-level pipe control; it should be executed and write 7 to the start of the helper surface
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+    // the second-level batch is closed with an unconditional batch buffer end
+    auto batchBufferEnd = reinterpret_cast<BATCH_BUFFER_END *>(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_END)));
+    batchBufferEnd->init();
+
+    csr->makeResident(*secondLevelBatch);
+
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+}
+using MultiLevelBatchTestsWithoutNesting = Test<MultiLevelBatchAubFixture<false>>; // fixture variant that takes the "nesting off" branch in SetUp
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithoutNesting, givenConditionalBBEndWhenItExitsFromSecondLevelThenUpperLevelIsResumed) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto compareAddress = writeAddress;
+    // Same three-buffer chain as the nesting tests, but every jump uses the SECOND_LEVEL_BATCH flag instead of the nested flag.
+    using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END;
+    using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+
+    // task stream: second-level jump into secondLevelBatch
+    auto batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(taskStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress());
+    batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
+
+    // secondLevelBatch: second-level jump into thirdLevelBatch
+    batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress());
+    batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
+    // thirdLevelBatch: a single conditional end with EndCurrentBatchBufferLevel set to 0
+    auto conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(0);
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+
+    writeAddress += sizeof(uint64_t);
+    auto writeValue = 7u;
+
+    // this pipe control shouldn't be executed
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+    // secondLevelBatch: conditional end with EndCurrentBatchBufferLevel set to 1
+    conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+    // and this one shouldn't be executed either, because we returned to the ring
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+
+    csr->makeResident(*secondLevelBatch);
+    csr->makeResident(*thirdLevelBatch);
+    flushStream();
+
+    writeAddress = helperSurface->getGpuAddress() + sizeof(uint64_t);
+    auto zeroValue = 0llu;
+    // NOTE(review): the test name says the upper level is resumed, yet both result qwords are expected to stay zero - confirm the name
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &zeroValue, sizeof(zeroValue));
+    writeAddress += sizeof(uint64_t);
+    writeValue++; // NOTE(review): the incremented value is not read afterwards
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &zeroValue, sizeof(zeroValue));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithoutNesting, givenConditionalBBEndWhenExitsFromSecondLevelToRingThenFirstLevelIsNotExecuted) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    auto compareAddress = writeAddress;
+    // Same three-buffer chain as the nesting tests, but every jump uses the SECOND_LEVEL_BATCH flag instead of the nested flag.
+    using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END;
+    using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
+
+    // task stream: second-level jump into secondLevelBatch
+    auto batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(taskStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress());
+    batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
+
+    // secondLevelBatch: second-level jump into thirdLevelBatch
+    batchBufferStart = reinterpret_cast<BATCH_BUFFER_START *>(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START)));
+    batchBufferStart->init();
+    batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress());
+    batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH);
+    // thirdLevelBatch: a single conditional end with EndCurrentBatchBufferLevel set to 1
+    auto conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1);
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+
+    writeAddress += sizeof(uint64_t);
+    auto writeValue = 7u;
+
+    // this pipe control shouldn't be executed
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+    // secondLevelBatch: conditional end with EndCurrentBatchBufferLevel set to 1
+    conditionalBatchBufferEnd = reinterpret_cast<CONDITIONAL_BATCH_BUFFER_END *>(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END)));
+    conditionalBatchBufferEnd->init();
+    conditionalBatchBufferEnd->setCompareAddress(compareAddress);
+    conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1);
+    conditionalBatchBufferEnd->setCompareSemaphore(1);
+
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+    // and this one should be executed, because we returned to the primary batch
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+
+    csr->makeResident(*secondLevelBatch);
+    csr->makeResident(*thirdLevelBatch);
+    flushStream();
+
+    writeAddress = helperSurface->getGpuAddress() + sizeof(uint64_t);
+    writeValue = 7u;
+    auto zeroValue = 0llu;
+    // first result qword is expected to stay zero, the second to hold 8; NOTE(review): the test name mentions the ring - confirm it matches
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &zeroValue, sizeof(zeroValue));
+    writeAddress += sizeof(uint64_t);
+    writeValue++;
+    expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenBlockingAtomicOperationIncOnLocalMemoryWhenItIsExecutedThenOperationUpdatesMemory) {
+    // Seed the helper surface with 10 through its CPU pointer; the 4-byte increment is expected to leave 11 there.
+    *reinterpret_cast<int *>(helperSurface->getUnderlyingBuffer()) = 10;
+    uint32_t valueAfterAtomic = 11u;
+    auto atomicTargetGpuVa = helperSurface->getGpuAddress();
+
+    // Same encoding as the other atomic tests, except the fourth argument is true (the blocking variant per the test name).
+    uint32_t encodedBytes = 0u;
+    auto encodeCursor = taskStream->getSpace(0);
+    WalkerPartition::programMiAtomic<FamilyType>(encodeCursor, encodedBytes, atomicTargetGpuVa, true, WalkerPartition::MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
+    taskStream->getSpace(encodedBytes);
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(atomicTargetGpuVa), &valueAfterAtomic, sizeof(valueAfterAtomic));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenBlockingAtomicOperationIncOnSystemMemoryWhenItIsExecutedThenOperationUpdatesMemory) {
+    // Seed the helper surface with 10 through its CPU pointer; the 4-byte increment is expected to leave 11 there.
+    *reinterpret_cast<int *>(helperSurface->getUnderlyingBuffer()) = 10;
+    uint32_t valueAfterAtomic = 11u;
+    auto atomicTargetGpuVa = helperSurface->getGpuAddress();
+
+    // NOTE(review): body matches the "LocalMemory" blocking variant; where helperSurface lives is decided by the fixture - confirm.
+    uint32_t encodedBytes = 0u;
+    auto encodeCursor = taskStream->getSpace(0);
+    WalkerPartition::programMiAtomic<FamilyType>(encodeCursor, encodedBytes, atomicTargetGpuVa, true, WalkerPartition::MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
+    taskStream->getSpace(encodedBytes);
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(atomicTargetGpuVa), &valueAfterAtomic, sizeof(valueAfterAtomic));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenNonBlockingAtomicOperationIncOnSystemMemoryWhenItIsExecutedThenOperationUpdatesMemory) {
+    // Seed the helper surface with 10 through its CPU pointer; the 4-byte increment is expected to leave 11 there.
+    *reinterpret_cast<int *>(helperSurface->getUnderlyingBuffer()) = 10;
+    uint32_t valueAfterAtomic = 11u;
+    auto atomicTargetGpuVa = helperSurface->getGpuAddress();
+
+    // Fourth argument is false here (the non-blocking variant per the test name); reserve whatever size the helper reported.
+    uint32_t encodedBytes = 0u;
+    auto encodeCursor = taskStream->getSpace(0);
+    WalkerPartition::programMiAtomic<FamilyType>(encodeCursor, encodedBytes, atomicTargetGpuVa, false, WalkerPartition::MI_ATOMIC<FamilyType>::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT);
+    taskStream->getSpace(encodedBytes);
+    flushStream();
+    expectMemory<FamilyType>(reinterpret_cast<void *>(atomicTargetGpuVa), &valueAfterAtomic, sizeof(valueAfterAtomic));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenPredicatedCommandBufferWhenItIsExecutedThenAtomicIsIncrementedEquallyToPartitionCountPlusOne) {
+    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
+    // Builds a dynamically partitioned walker (1 tile, 16 partitions) in the task stream and checks two dwords at its control section.
+    auto streamCpuPointer = taskStream->getSpace(0);
+    auto postSyncAddress = helperSurface->getGpuAddress();
+
+    uint32_t totalBytesProgrammed = 0u;
+    WALKER_TYPE walkerCmd = FamilyType::cmdInitGpgpuWalker;
+    walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X);
+    walkerCmd.getInterfaceDescriptor().setNumberOfThreadsInGpgpuThreadGroup(1u);
+    walkerCmd.getPostSync().setDestinationAddress(postSyncAddress);
+    walkerCmd.getPostSync().setOperation(POSTSYNC_DATA<FamilyType>::OPERATION::OPERATION_WRITE_TIMESTAMP);
+
+    WalkerPartition::WalkerPartitionArgs testArgs = {};
+    testArgs.initializeWparidRegister = true;
+    testArgs.crossTileAtomicSynchronization = true;
+    testArgs.emitPipeControlStall = true;
+    testArgs.tileCount = 1;
+    testArgs.partitionCount = 16u;
+    testArgs.synchronizeBeforeExecution = false;
+    testArgs.secondaryBatchBuffer = false;
+    testArgs.emitSelfCleanup = false;
+    // the helper encodes at streamCpuPointer; the byte count it reports is reserved in the stream afterwards
+    WalkerPartition::constructDynamicallyPartitionedCommandBuffer<FamilyType>(
+        streamCpuPointer,
+        taskStream->getGraphicsAllocation()->getGpuAddress(),
+        &walkerCmd,
+        totalBytesProgrammed,
+        testArgs,
+        *defaultHwInfo);
+    taskStream->getSpace(totalBytesProgrammed);
+    flushStream();
+    auto expectedGpuAddress = taskStream->getGraphicsAllocation()->getGpuAddress() +
+                              WalkerPartition::computeControlSectionOffset<FamilyType>(testArgs);
+
+    // the 16 partitions bring the atomic to 16;
+    // a 17th pass raises it to 17 and is then predicated out of the batch buffer
+    uint32_t expectedValue = 17u;
+    expectMemory<FamilyType>(reinterpret_cast<void *>(expectedGpuAddress), &expectedValue, sizeof(expectedValue));
+    // single-tile scenario: the dword 4 bytes further is expected to read 1
+    uint32_t expectedTileValue = 1u;
+    expectMemory<FamilyType>(reinterpret_cast<void *>(expectedGpuAddress + 4llu), &expectedTileValue, sizeof(expectedTileValue));
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenGeneralPurposeRegisterWhenItIsLoadedAndFetchedThenItIsNotPrivileged) {
+    auto writeAddress = helperSurface->getGpuAddress();
+    uint32_t writeValue = 7u;
+    // Passes the value 5 through general purpose registers 0..5 into wparidCCSOffset, then checks a predicated pipe control did not write.
+    auto streamCpuPointer = taskStream->getSpace(0);
+    uint32_t totalBytesProgrammed = 0u;
+    uint32_t wparidValue = 5u;
+    WalkerPartition::programRegisterWithValue<FamilyType>(streamCpuPointer, generalPurposeRegister0, totalBytesProgrammed, wparidValue);
+    WalkerPartition::programMiLoadRegisterReg<FamilyType>(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister0, generalPurposeRegister1); // presumably (source, destination) - confirm against the helper
+    WalkerPartition::programMiLoadRegisterReg<FamilyType>(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister1, generalPurposeRegister2);
+    WalkerPartition::programMiLoadRegisterReg<FamilyType>(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister2, generalPurposeRegister3);
+    WalkerPartition::programMiLoadRegisterReg<FamilyType>(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister3, generalPurposeRegister4);
+    WalkerPartition::programMiLoadRegisterReg<FamilyType>(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister4, generalPurposeRegister5);
+    WalkerPartition::programMiLoadRegisterReg<FamilyType>(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister5, wparidCCSOffset);
+    WalkerPartition::programWparidMask<FamilyType>(streamCpuPointer, totalBytesProgrammed, 4u);
+    WalkerPartition::programWparidPredication<FamilyType>(streamCpuPointer, totalBytesProgrammed, true);
+    // the pipe control emitted next must not execute; first reserve the bytes programmed so far
+    taskStream->getSpace(totalBytesProgrammed);
+    PipeControlArgs args;
+    MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+        *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+        writeAddress, writeValue, device->getHardwareInfo(), args);
+    // switch predication back off after the pipe control
+    streamCpuPointer = taskStream->getSpace(0);
+    totalBytesProgrammed = 0u;
+    WalkerPartition::programWparidPredication<FamilyType>(streamCpuPointer, totalBytesProgrammed, false);
+    taskStream->getSpace(totalBytesProgrammed);
+    flushStream();
+    expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, sizeof(writeValue)); // the target must not hold the value 7
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenPredicationWhenItIsOnThenCommandMustNotBeExecuted) { // checks that, with WPARID predication on, pipe control writes only land for WPARID values covered by the programmed mask
+    auto streamCpuPointer = taskStream->getSpace(0);
+    uint32_t totalBytesProgrammed = 0u;
+    auto writeValue = 1u;
+    auto zeroValue = 0u;
+    auto addressShift = 8u; // byte distance between the write slots of consecutive iterations
+    auto writeAddress = helperSurface->getGpuAddress();
+
+    // program the WPARID mask for 16 partitions; the checks after the flush expect only WPARID 0-15 to get their write
+    WalkerPartition::programWparidMask<FamilyType>(streamCpuPointer, totalBytesProgrammed, 16u);
+    streamCpuPointer = taskStream->getSpace(totalBytesProgrammed);
+    // for every WPARID value in 0-19: set WPARID, turn predication on, emit one write, turn predication off
+    for (uint32_t wparid = 0u; wparid < 20; wparid++) {
+        totalBytesProgrammed = 0;
+        streamCpuPointer = taskStream->getSpace(0);
+        WalkerPartition::programRegisterWithValue<FamilyType>(streamCpuPointer, WalkerPartition::wparidCCSOffset, totalBytesProgrammed, wparid);
+        WalkerPartition::programWparidPredication<FamilyType>(streamCpuPointer, totalBytesProgrammed, true);
+        taskStream->getSpace(totalBytesProgrammed); // claim the stream space reported back through totalBytesProgrammed
+        // emit a pipe control with an immediate-data post-sync write to this iteration's own address and value
+        PipeControlArgs args;
+        MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+            *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+            writeAddress, writeValue, device->getHardwareInfo(), args);
+
+        // turn predication off again before the next iteration
+        streamCpuPointer = taskStream->getSpace(0);
+        totalBytesProgrammed = 0;
+        WalkerPartition::programWparidPredication<FamilyType>(streamCpuPointer, totalBytesProgrammed, false);
+        taskStream->getSpace(totalBytesProgrammed);
+
+        writeAddress += addressShift;
+        writeValue++;
+    }
+
+    flushStream();
+    writeAddress = helperSurface->getGpuAddress(); // restart from the first slot and replay the same address/value sequence
+    writeValue = 1u;
+    for (uint32_t wparid = 0u; wparid < 20; wparid++) {
+        if (wparid < 16) { // WPARID covered by the mask: the immediate value is expected in the slot
+            expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, 4u);
+        } else { // WPARID beyond the mask: the slot is expected to still read zero
+            expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &zeroValue, 4u);
+        }
+        writeAddress += addressShift;
+        writeValue++;
+    }
+}
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenPredicationWhenItIsOnThenPipeControlInWparidIsNotExecuted) { // same predication check as above, but every pipe control targets the base address with the workload partition id offset enabled
+    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
+
+    auto streamCpuPointer = taskStream->getSpace(0);
+    uint32_t totalBytesProgrammed = 0u;
+    auto writeValue = 1u;
+    auto zeroValue = 0u;
+    auto addressShift = 32u; // programmed into the address offset register below and used as the slot stride when verifying
+    auto writeAddress = helperSurface->getGpuAddress();
+
+    WalkerPartition::programRegisterWithValue<FamilyType>(streamCpuPointer, WalkerPartition::addressOffsetCCSOffset, totalBytesProgrammed, addressShift);
+    // program the WPARID mask for 8 partitions; the checks after the flush expect only WPARID 0-7 to get their write
+    WalkerPartition::programWparidMask<FamilyType>(streamCpuPointer, totalBytesProgrammed, 8u);
+    streamCpuPointer = taskStream->getSpace(totalBytesProgrammed);
+    // for every WPARID value in 0-12: set WPARID, turn predication on, emit one write, turn predication off
+    for (uint32_t wparid = 0u; wparid < 13; wparid++) {
+        totalBytesProgrammed = 0;
+        streamCpuPointer = taskStream->getSpace(0);
+        WalkerPartition::programRegisterWithValue<FamilyType>(streamCpuPointer, WalkerPartition::wparidCCSOffset, totalBytesProgrammed, wparid);
+        WalkerPartition::programWparidPredication<FamilyType>(streamCpuPointer, totalBytesProgrammed, true);
+        taskStream->getSpace(totalBytesProgrammed);
+
+        // emit a pipe control with an immediate-data post-sync write; writeAddress is deliberately the same base address in every iteration
+        void *pipeControlAddress = taskStream->getSpace(0); // remember where the pipe control starts so it can be patched below
+        PipeControlArgs args;
+        MemorySynchronizationCommands<FamilyType>::addPipeControlAndProgramPostSyncOperation(
+            *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
+            writeAddress, writeValue, device->getHardwareInfo(), args);
+
+        auto pipeControl = retrieveSyncPipeControl<FamilyType>(pipeControlAddress, device->getHardwareInfo());
+        ASSERT_NE(nullptr, pipeControl);
+        pipeControl->setWorkloadPartitionIdOffsetEnable(true); // NOTE(review): presumably makes the write land at base + WPARID * programmed address offset, which is what the verification loop assumes - confirm
+        // turn predication off again before the next iteration
+        streamCpuPointer = taskStream->getSpace(0);
+        totalBytesProgrammed = 0;
+        WalkerPartition::programWparidPredication<FamilyType>(streamCpuPointer, totalBytesProgrammed, false);
+        taskStream->getSpace(totalBytesProgrammed);
+
+        writeValue++;
+    }
+
+    flushStream();
+    writeAddress = helperSurface->getGpuAddress(); // verification walks the slots at a stride of addressShift bytes
+    writeValue = 1u;
+    for (uint32_t wparid = 0u; wparid < 13; wparid++) {
+        if (wparid < 8) { // WPARID covered by the mask: the immediate value is expected in the slot
+            expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &writeValue, 4u);
+        } else { // WPARID beyond the mask: the slot is expected to still read zero
+            expectMemory<FamilyType>(reinterpret_cast<void *>(writeAddress), &zeroValue, 4u);
+        }
+        writeAddress += addressShift;
+        writeValue++;
+    }
+}
+
+HWCMDTEST_P(IGFX_XE_HP_CORE, AubWalkerPartitionTest, whenPartitionsAreUsedWithVariousInputsThenHardwareProgrammingIsCorrect) {
+    // Enqueue kernels[5] with the fixture's dispatch parameters, a zero global offset and no wait list.
+    size_t zeroOffset[3] = {0, 0, 0};
+    cl_event *const noWaitList = nullptr;
+    const cl_uint noWaitListSize = 0;
+    cl_event kernelEvent;
+
+    const auto enqueueStatus = pCmdQ->enqueueKernel(kernels[5].get(),
+                                                    workingDimensions,
+                                                    zeroOffset,
+                                                    dispatchParamters.globalWorkSize,
+                                                    dispatchParamters.localWorkSize,
+                                                    noWaitListSize,
+                                                    noWaitList,
+                                                    &kernelEvent);
+    ASSERT_EQ(CL_SUCCESS, enqueueStatus);
+    pCmdQ->flush();
+
+    // The context-start address of the event's first timestamp packet node is what the partition validation inspects.
+    auto timestampNodes = castToObject<Event>(kernelEvent)->getTimestampPacketNodes();
+    const auto postSyncAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampNodes->peekNodes()[0]);
+
+    validatePartitionProgramming<FamilyType>(postSyncAddress, partitionCount);
+
+    clReleaseEvent(kernelEvent);
+}
+
+INSTANTIATE_TEST_CASE_P( // instantiates AubWalkerPartitionTest once for every combination of the four value lists below
+    AUBWPARID,
+    AubWalkerPartitionTest,
+    ::testing::Combine(
+        ::testing::ValuesIn(testPartitionCount),
+        ::testing::ValuesIn(testPartitionType),
+        ::testing::ValuesIn(DispatchParamtersForTests),
+        ::testing::ValuesIn(testWorkingDimensions)));
+
+using AubWparidTests = Test<AubWalkerPartitionFixture>;
+
+HWCMDTEST_F(IGFX_XE_HP_CORE, AubWparidTests, whenPartitionCountSetAndPartitionIdSpecifiedViaWPARIDThenProvideEqualNumberWalkers) {
+    // Fixed 3D dispatch: global size 30x39x5 split into local size 10x3x1.
+    const size_t requestedGlobalSize[3] = {30, 39, 5};
+    const size_t requestedLocalSize[3] = {10, 3, 1};
+    workingDimensions = 3;
+    for (auto dim = 0u; dim < 3u; dim++) {
+        dispatchParamters.globalWorkSize[dim] = requestedGlobalSize[dim];
+        dispatchParamters.localWorkSize[dim] = requestedLocalSize[dim];
+    }
+
+    // Enable walker partitioning through the debug flags: partition type 3, four partitions.
+    partitionType = 3;
+    const int32_t requestedPartitionCount = 4;
+    DebugManager.flags.EnableWalkerPartition.set(1u);
+    DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(requestedPartitionCount);
+    DebugManager.flags.ExperimentalSetWalkerPartitionType.set(partitionType);
+
+    // Enqueue kernels[5] with a zero global offset and no wait list; the event is kept to reach its timestamp packet.
+    size_t zeroOffset[3] = {0, 0, 0};
+    cl_event *const noWaitList = nullptr;
+    const cl_uint noWaitListSize = 0;
+    cl_event kernelEvent;
+    const auto enqueueStatus = pCmdQ->enqueueKernel(kernels[5].get(),
+                                                    workingDimensions,
+                                                    zeroOffset,
+                                                    dispatchParamters.globalWorkSize,
+                                                    dispatchParamters.localWorkSize,
+                                                    noWaitListSize,
+                                                    noWaitList,
+                                                    &kernelEvent);
+    ASSERT_EQ(CL_SUCCESS, enqueueStatus);
+
+    pCmdQ->flush();
+
+    // Validate against the context-start address of the event's first timestamp packet node.
+    auto timestampNodes = castToObject<Event>(kernelEvent)->getTimestampPacketNodes();
+    const auto postSyncAddress = TimestampPacketHelper::getContextStartGpuAddress(*timestampNodes->peekNodes()[0]);
+    validatePartitionProgramming<FamilyType>(postSyncAddress, requestedPartitionCount);
+
+    clReleaseEvent(kernelEvent);
+}
diff --git a/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp b/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp
new file mode 100644
index 0000000000..cf2944c02e
--- /dev/null
+++ b/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp
@@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2022 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#include "shared/source/helpers/register_offsets.h"
+#include "shared/test/common/helpers/dispatch_flags_helper.h"
+#include "shared/test/common/test_macros/test.h"
+
+#include "opencl/source/mem_obj/buffer.h"
+#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
+#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
+
+namespace NEO {
+enum class NewAluOpcodes : uint32_t { // MI_MATH ALU opcode encodings used by the tests in this file alongside the AluRegisters::OPCODE_* values
+    OPCODE_LOAD = 0x080,
+    OPCODE_LOAD0 = 0x081, // used in this file on SRCB before an ADD that only moves SRCA to ACCU; presumably loads zero - confirm
+    OPCODE_LOAD1 = 0x481,
+    OPCODE_LOADIND = 0x082,  // used in this file to load into a register from the memory address held in ACCU
+    OPCODE_STOREIND = 0x181, // used in this file to store a register to the memory address held in ACCU
+    OPCODE_SHL = 0x105, // used in this file to shift SRCA left by SRCB, result read back from ACCU
+    OPCODE_SHR = 0x106, // used in this file as the right-shift counterpart of OPCODE_SHL
+    OPCODE_SAR = 0x107, // NOTE(review): presumably arithmetic shift right - confirm against the hardware spec
+    OPCODE_FENCE = 0x001 // placed around indirect loads/stores in this file so that earlier reads and writes are completed
+};
+
+struct MiMath : public AUBFixture, public ::testing::Test { // AUB test fixture that owns a raw command stream plus helpers for hand-encoding register loads/stores and MI_MATH ALU sequences
+    void SetUp() override {
+        AUBFixture::SetUp(defaultHwInfo.get());
+
+        streamAllocation = this->device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), MemoryConstants::pageSize, GraphicsAllocation::AllocationType::COMMAND_BUFFER, device->getDeviceBitfield()}); // page-sized command buffer allocation backing taskStream
+        taskStream = std::make_unique<LinearStream>(streamAllocation);
+    }
+    void TearDown() override {
+        this->device->getMemoryManager()->freeGraphicsMemory(streamAllocation);
+        AUBFixture::TearDown();
+    }
+
+    void flushStream() { // submits everything recorded in taskStream via csr->flushTask and then flushes batched submissions
+        DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
+        dispatchFlags.guardCommandBufferWithPipeControl = true;
+
+        csr->flushTask(*taskStream, 0,
+                       csr->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
+                       csr->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
+                       csr->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
+                       0u, dispatchFlags, device->getDevice());
+
+        csr->flushBatchedSubmissions();
+    }
+    uint32_t getPartOfGPUAddress(uint64_t address, bool lowPart) { // returns the low 32 bits of address when lowPart is true, otherwise the high 32 bits
+        constexpr uint32_t shift = 32u;
+        constexpr uint32_t mask = 0xffffffff;
+        if (lowPart) {
+            return static_cast<uint32_t>(address & mask);
+        } else {
+            return static_cast<uint32_t>(address >> shift);
+        }
+    }
+    template <typename FamilyType>
+    void loadValueToRegister(int32_t value, int32_t reg) { // appends an MI_LOAD_REGISTER_IMM to taskStream that writes value to register offset reg (MMIO remap enabled)
+        using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
+        MI_LOAD_REGISTER_IMM cmd = FamilyType::cmdInitLoadRegisterImm;
+        cmd.setDataDword(value);
+        cmd.setRegisterOffset(reg);
+        cmd.setMmioRemapEnable(1);
+        auto buffer = taskStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM));
+        *static_cast<MI_LOAD_REGISTER_IMM *>(buffer) = cmd;
+    }
+    template <typename FamilyType>
+    void storeValueInRegisterToMemory(int64_t address, int32_t reg) { // appends an MI_STORE_REGISTER_MEM to taskStream that stores register reg to GPU address address (MMIO remap enabled)
+        using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM;
+        MI_STORE_REGISTER_MEM cmd2 = FamilyType::cmdInitStoreRegisterMem;
+        cmd2.setRegisterAddress(reg);
+        cmd2.setMemoryAddress(address);
+        cmd2.setMmioRemapEnable(1);
+        auto buffer2 = taskStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
+        *static_cast<MI_STORE_REGISTER_MEM *>(buffer2) = cmd2;
+    }
+    template <typename FamilyType>
+    void loadAddressToRegisters(uint32_t registerWithLowPart, uint32_t registerWithHighPart, uint32_t registerWithShift, uint64_t address) { // stages a 64-bit address in three registers for loadAddressToMiMathAccu
+        loadValueToRegister<FamilyType>(getPartOfGPUAddress(address, true), registerWithLowPart);   // low 32 bits of the address to registerWithLowPart
+        loadValueToRegister<FamilyType>(getPartOfGPUAddress(address, false), registerWithHighPart); // high 32 bits of the address to registerWithHighPart
+        loadValueToRegister<FamilyType>(32u, registerWithShift);                                    // shift amount used later to move the high part into place
+    }
+    template <typename FamilyType>
+    void loadAddressToMiMathAccu(uint32_t lowAddressRegister, uint32_t highAddressRegister, uint32_t shiftReg) { // appends numberOfOperationToLoadAddressToMiMathAccu ALU instructions (no MI_MATH header) that combine the staged address parts in ACCU; overwrites highAddressRegister
+        using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;
+        MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(taskStream->getSpace(numberOfOperationToLoadAddressToMiMathAccu * sizeof(MI_MATH_ALU_INST_INLINE)));
+        pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load high part of address from highAddressRegister to SRCA
+        pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCA);
+        pAluParam->DW0.BitField.Operand2 = highAddressRegister;
+        pAluParam++;
+        pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load the shift amount held in shiftReg to SRCB (to shift the high part into place)
+        pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+        pAluParam->DW0.BitField.Operand2 = shiftReg;
+        pAluParam++;
+        pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_SHL); // shift high part
+        pAluParam->DW0.BitField.Operand1 = 0;
+        pAluParam->DW0.BitField.Operand2 = 0;
+        pAluParam++;
+        pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // move the shifted result from ACCU back to highAddressRegister
+        pAluParam->DW0.BitField.Operand1 = highAddressRegister;
+        pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+        pAluParam++;
+        pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load highAddressRegister to SRCA
+        pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCA);
+        pAluParam->DW0.BitField.Operand2 = highAddressRegister;
+        pAluParam++;
+        pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load low part of address to SRCB
+        pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+        pAluParam->DW0.BitField.Operand2 = lowAddressRegister;
+        pAluParam++;
+        pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_OR); // join both parts of the address; the result is left in ACCU
+        pAluParam->DW0.BitField.Operand1 = 0;
+        pAluParam->DW0.BitField.Operand2 = 0;
+        pAluParam++;
+    }
+
+    static constexpr size_t bufferSize = MemoryConstants::pageSize; // byte size the tests pass to Buffer::create; the tests also use it as the element count of their uint64_t host arrays
+    const uint32_t numberOfOperationToLoadAddressToMiMathAccu = 7; // number of ALU instructions loadAddressToMiMathAccu appends; callers add it to their MI_MATH DwordLength
+    std::unique_ptr<LinearStream> taskStream; // command stream recorded by the tests and submitted by flushStream
+    GraphicsAllocation *streamAllocation = nullptr; // backing allocation of taskStream, freed in TearDown
+};
+
+using MatcherIsDg2OrPvc = IsWithinProducts<IGFX_DG2, IGFX_PVC>; // matcher passed to the HWTEST2_F MiMath tests; NOTE(review): presumably selects products from IGFX_DG2 through IGFX_PVC - confirm
+
+HWTEST2_F(MiMath, givenLoadIndirectFromMemoryWhenUseMiMathToSimpleOperationThenStoreStateOfRegisterInirectToMemory, MatcherIsDg2OrPvc) { // loads a value from the buffer with LOADIND, adds valueToAdd in the ALU and writes the sum back to the same address with STOREIND
+    using MI_MATH = typename FamilyType::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;
+    uint64_t bufferMemory[bufferSize] = {};
+    bufferMemory[0] = 1u;
+    cl_int retVal = CL_SUCCESS;
+
+    auto buffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                         bufferSize, bufferMemory, retVal));
+    ASSERT_NE(nullptr, buffer);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
+    csr->makeResident(*allocation);
+
+    uint32_t valueToAdd = 5u;
+    uint64_t valueAfterMiMathOperation = bufferMemory[0] + valueToAdd; // expected buffer content after the command stream has run
+
+    loadAddressToRegisters<FamilyType>(CS_GPR_R0, CS_GPR_R1, CS_GPR_R2, allocation->getGpuAddress()); // prepare registers to mi_math operation
+    loadValueToRegister<FamilyType>(valueToAdd, CS_GPR_R3);
+
+    auto pCmd = reinterpret_cast<uint32_t *>(taskStream->getSpace(sizeof(MI_MATH)));
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.Value = 0x0;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.DwordLength = numberOfOperationToLoadAddressToMiMathAccu + 13 - 1; // ALU instructions from loadAddressToMiMathAccu plus the 13 written below
+    loadAddressToMiMathAccu<FamilyType>(static_cast<uint32_t>(AluRegisters::R_0), static_cast<uint32_t>(AluRegisters::R_1), static_cast<uint32_t>(AluRegisters::R_2)); // GPU address of buffer load to ACCU register
+    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(taskStream->getSpace(13 * sizeof(MI_MATH_ALU_INST_INLINE)));
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_LOADIND); // load dword from memory address located in ACCU to R0
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_0);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // copy address from ACCU to R2
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_2);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // R0 (value loaded from memory) to SRCA
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCA);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_0);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // R3 to SRCB where is value of 'valueToAdd'
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_3);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_ADD); // do simple add on registers SRCA and SRCB
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // store the sum from ACCU to R1
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_1);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load address from R2 where is copy of address to SRCA
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCA);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_2);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_LOAD0); // presumably zeroes SRCB so that the ADD below leaves the address unchanged - confirm
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_ADD); // move address to ACCU
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_STOREIND); // store to memory from ACCU, value from register R1
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_1);
+    pAluParam++;
+
+    flushStream();
+
+    expectMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress()), &valueAfterMiMathOperation, sizeof(valueAfterMiMathOperation));
+}
+HWTEST2_F(MiMath, givenLoadIndirectFromMemoryWhenUseMiMathThenStoreIndirectToAnotherMemory, MatcherIsDg2OrPvc) { // loads a value from buffer with LOADIND and writes it to bufferB with STOREIND, all inside one MI_MATH command
+    using MI_MATH = typename FamilyType::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;
+    uint64_t bufferMemory[bufferSize] = {};
+    bufferMemory[0] = 1u;
+    uint64_t bufferBMemory[bufferSize] = {};
+    cl_int retVal = CL_SUCCESS;
+
+    auto buffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                         bufferSize, bufferMemory, retVal));
+    ASSERT_NE(nullptr, buffer);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    auto bufferB = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                          CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                          bufferSize, bufferBMemory, retVal));
+    ASSERT_NE(nullptr, bufferB); // check the buffer that was just created (previously re-checked 'buffer')
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    csr->makeResident(*buffer->getGraphicsAllocation(rootDeviceIndex));
+    csr->makeResident(*bufferB->getGraphicsAllocation(rootDeviceIndex));
+
+    loadAddressToRegisters<FamilyType>(CS_GPR_R0, CS_GPR_R1, CS_GPR_R2, buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());  // source address parts to R0/R1, shift amount to R2
+    loadAddressToRegisters<FamilyType>(CS_GPR_R3, CS_GPR_R4, CS_GPR_R2, bufferB->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()); // destination address parts to R3/R4, same shift amount in R2
+
+    auto pCmd = reinterpret_cast<uint32_t *>(taskStream->getSpace(sizeof(MI_MATH)));
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.Value = 0x0;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.DwordLength = numberOfOperationToLoadAddressToMiMathAccu * 2 + 6 - 1; // two address loads plus two groups of 3 ALU instructions written below
+
+    loadAddressToMiMathAccu<FamilyType>(static_cast<uint32_t>(AluRegisters::R_0), static_cast<uint32_t>(AluRegisters::R_1), static_cast<uint32_t>(AluRegisters::R_2)); // GPU address of buffer load to ACCU register
+
+    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(taskStream->getSpace(3 * sizeof(MI_MATH_ALU_INST_INLINE))); // exactly the 3 instructions written below, so no unwritten slot ends up inside the MI_MATH program
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_LOADIND); // load dword from memory address located in ACCU to R0
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_0);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+
+    loadAddressToMiMathAccu<FamilyType>(static_cast<uint32_t>(AluRegisters::R_3), static_cast<uint32_t>(AluRegisters::R_4), static_cast<uint32_t>(AluRegisters::R_2)); // GPU address of bufferB load to ACCU register
+
+    pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(taskStream->getSpace(3 * sizeof(MI_MATH_ALU_INST_INLINE))); // exactly the 3 instructions written below, so the trailing FENCE stays inside the claimed stream space
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_STOREIND); // store to memory from ACCU, value from register R0
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_0);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+
+    flushStream();
+
+    expectMemory<FamilyType>(reinterpret_cast<void *>(bufferB->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()), bufferMemory, sizeof(uint64_t)); // bufferB must now start with the first element of the source data
+}
+HWTEST2_F(MiMath, givenValueToMakeLeftLogicalShiftWhenUseMiMathThenShiftIsDoneProperly, MatcherIsDg2OrPvc) { // runs two SHL operations in one MI_MATH command and checks both results after storing them to the buffer
+    using MI_MATH = typename FamilyType::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;
+    uint64_t bufferMemory[bufferSize] = {};
+    bufferMemory[0] = 1u;
+    cl_int retVal = CL_SUCCESS;
+
+    auto buffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                         bufferSize, bufferMemory, retVal));
+    ASSERT_NE(nullptr, buffer);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    csr->makeResident(*buffer->getGraphicsAllocation(rootDeviceIndex));
+
+    uint32_t value = 1u;
+    uint32_t shift = 2u;
+    uint32_t notPowerOfTwoShift = 5u; // shift amount requested for the second SHL
+    uint32_t expectedUsedShift = 4u; // shift amount the test expects to be applied for that second SHL
+
+    loadValueToRegister<FamilyType>(value, CS_GPR_R0);
+    loadValueToRegister<FamilyType>(shift, CS_GPR_R1);
+    loadValueToRegister<FamilyType>(notPowerOfTwoShift, CS_GPR_R2);
+    auto pCmd = reinterpret_cast<uint32_t *>(taskStream->getSpace(sizeof(MI_MATH)));
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.Value = 0x0;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.DwordLength = 7 - 1; // the 7 ALU instructions written below
+
+    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(taskStream->getSpace(7 * sizeof(MI_MATH_ALU_INST_INLINE)));
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load value from R0 to SRCA
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCA);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_0);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load first shift amount from R1 to SRCB
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_1);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_SHL); // first left shift of SRCA by SRCB
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // store first shift result from ACCU to R1
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_1);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load second shift amount from R2 to SRCB
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_2);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_SHL); // second left shift, now with the amount from R2
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // store second shift result from ACCU to R2
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_2);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+
+    storeValueInRegisterToMemory<FamilyType>(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(), CS_GPR_R1); // first result to buffer offset 0
+    storeValueInRegisterToMemory<FamilyType>(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress() + 4, CS_GPR_R2); // second result to buffer offset 4
+    flushStream();
+
+    uint32_t firstShift = value << shift;
+    uint32_t secondShift = value << notPowerOfTwoShift;
+    uint32_t executeSecondShift = value << expectedUsedShift;
+
+    expectMemory<FamilyType>(reinterpret_cast<void *>(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()), &firstShift, sizeof(firstShift));
+    expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress() + 4), &secondShift, sizeof(secondShift)); // the requested shift by 5 must not be what was applied
+    expectMemory<FamilyType>(reinterpret_cast<void *>(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress() + 4), &executeSecondShift, sizeof(executeSecondShift)); // the result must match a shift by 4 instead
+}
+HWTEST2_F(MiMath, givenValueToMakeRightLogicalShiftWhenUseMiMathThenShiftIsDoneProperly, MatcherIsDg2OrPvc) { // programs one MI_MATH with two SHR sequences (shift counts 2 and 5) on value 32 and checks both results read back from the buffer
+    using MI_MATH = typename FamilyType::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;
+    uint64_t bufferMemory[bufferSize] = {};
+    bufferMemory[0] = 1u;
+    cl_int retVal = CL_SUCCESS;
+
+    auto buffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                         bufferSize, bufferMemory, retVal));
+    ASSERT_NE(nullptr, buffer);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
+    csr->makeResident(*allocation);
+
+    uint32_t value = 32u;             // operand that gets shifted (goes to R0)
+    uint32_t shift = 2u;              // first shift count (goes to R1)
+    uint32_t notPowerOfTwoShift = 5u; // second shift count (goes to R2); NOTE(review): presumably the hardware does not honor shift counts that are not a power of two - confirm against the MI_MATH documentation
+    uint32_t expectedUsedShift = 4u;  // shift count the test expects to be effectively applied in place of notPowerOfTwoShift
+
+    loadValueToRegister<FamilyType>(value, CS_GPR_R0);
+    loadValueToRegister<FamilyType>(shift, CS_GPR_R1);
+    loadValueToRegister<FamilyType>(notPowerOfTwoShift, CS_GPR_R2);
+    auto pCmd = reinterpret_cast<uint32_t *>(taskStream->getSpace(sizeof(MI_MATH))); // MI_MATH header; space for the ALU instructions is taken from the same stream right below
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.Value = 0x0;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.DwordLength = 7 - 1; // 7 matches the number of ALU instructions programmed below
+
+    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(taskStream->getSpace(7 * sizeof(MI_MATH_ALU_INST_INLINE)));
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load value from R0 to SRCA
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCA);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_0);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load first shift count from R1 to SRCB
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_1);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_SHR); // logical shift right; no operands are encoded, the result is picked up from ACCU by the following STORE
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // store ACCU (first shift result) to R1
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_1);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load second (not power of two) shift count from R2 to SRCB; SRCA is not reloaded
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_2);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_SHR); // second logical shift right, again without encoded operands
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // store ACCU (second shift result) to R2
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_2);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+
+    storeValueInRegisterToMemory<FamilyType>(allocation->getGpuAddress(), CS_GPR_R1);     // first result -> buffer offset 0
+    storeValueInRegisterToMemory<FamilyType>(allocation->getGpuAddress() + 4, CS_GPR_R2); // second result -> buffer offset 4
+    flushStream();
+
+    uint32_t firstShift = value >> shift;
+    uint32_t secondShift = value >> notPowerOfTwoShift;
+    uint32_t executeSecondShift = value >> expectedUsedShift;
+
+    expectMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress()), &firstShift, sizeof(firstShift));
+    expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress() + 4), &secondShift, sizeof(secondShift)); // the second result must NOT be value >> 5 ...
+    expectMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress() + 4), &executeSecondShift, sizeof(executeSecondShift)); // ... it must equal value >> expectedUsedShift
+}
+HWTEST2_F(MiMath, givenValueToMakeRightAritmeticShiftWhenUseMiMathThenShiftIsDoneProperly, MatcherIsDg2OrPvc) { // loads -32 from the buffer inside MI_MATH, runs SAR with shift counts 2 and 5 and checks both results read back from the buffer
+    using MI_MATH = typename FamilyType::MI_MATH;
+    using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE;
+
+    int64_t bufferMemory[bufferSize] = {};
+    bufferMemory[0] = -32; // negative operand; it reaches the buffer through CL_MEM_COPY_HOST_PTR below
+    cl_int retVal = CL_SUCCESS;
+
+    auto buffer = std::unique_ptr<Buffer>(Buffer::create(context,
+                                                         CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                                         bufferSize, bufferMemory, retVal));
+    ASSERT_NE(nullptr, buffer);
+    EXPECT_EQ(CL_SUCCESS, retVal);
+
+    auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
+    csr->makeResident(*allocation);
+
+    uint32_t shift = 2u;              // first shift count (goes to R4)
+    uint32_t notPowerOfTwoShift = 5u; // second shift count (goes to R5); NOTE(review): presumably the hardware does not honor shift counts that are not a power of two - confirm against the MI_MATH documentation
+    uint32_t expectedUsedShift = 4u;  // shift count the test expects to be effectively applied in place of notPowerOfTwoShift
+
+    loadAddressToRegisters<FamilyType>(CS_GPR_R0, CS_GPR_R1, CS_GPR_R2, allocation->getGpuAddress()); // prepare registers R0-R2 for the address computation done inside MI_MATH
+    loadValueToRegister<FamilyType>(shift, CS_GPR_R4);
+    loadValueToRegister<FamilyType>(notPowerOfTwoShift, CS_GPR_R5);
+
+    auto pCmd = reinterpret_cast<uint32_t *>(taskStream->getSpace(sizeof(MI_MATH))); // MI_MATH header; the ALU instructions are placed in the stream by the calls below
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.Value = 0x0;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
+    reinterpret_cast<MI_MATH *>(pCmd)->DW0.BitField.DwordLength = numberOfOperationToLoadAddressToMiMathAccu + 9 - 1; // address-loading operations plus the 9 ALU instructions programmed below
+    loadAddressToMiMathAccu<FamilyType>(static_cast<uint32_t>(AluRegisters::R_0), static_cast<uint32_t>(AluRegisters::R_1), static_cast<uint32_t>(AluRegisters::R_2)); // loads the GPU address of the buffer into the ACCU register
+    MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(taskStream->getSpace(9 * sizeof(MI_MATH_ALU_INST_INLINE)));
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_LOADIND); // indirect load: read the value at the address held in ACCU into R3
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_3);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load value from R3 to SRCA
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCA);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_3);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load first shift count from R4 to SRCB
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_4);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_SAR); // arithmetic shift right; no operands are encoded, the result is picked up from ACCU by the following STORE
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // store ACCU (first shift result) to R4
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_4);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_LOAD); // load second (not power of two) shift count from R5 to SRCB; SRCA is not reloaded
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_SRCB);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_5);
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(NewAluOpcodes::OPCODE_SAR); // second arithmetic shift right, again without encoded operands
+    pAluParam->DW0.BitField.Operand1 = 0;
+    pAluParam->DW0.BitField.Operand2 = 0;
+    pAluParam++;
+    pAluParam->DW0.BitField.ALUOpcode = static_cast<uint32_t>(AluRegisters::OPCODE_STORE); // store ACCU (second shift result) to R5
+    pAluParam->DW0.BitField.Operand1 = static_cast<uint32_t>(AluRegisters::R_5);
+    pAluParam->DW0.BitField.Operand2 = static_cast<uint32_t>(AluRegisters::R_ACCU);
+    pAluParam++;
+
+    storeValueInRegisterToMemory<FamilyType>(allocation->getGpuAddress(), CS_GPR_R4);     // first result -> buffer offset 0
+    storeValueInRegisterToMemory<FamilyType>(allocation->getGpuAddress() + 4, CS_GPR_R5); // second result -> buffer offset 4
+    flushStream();
+
+    int64_t firstShift = bufferMemory[0]; // expected values are built by repeated division by 2; presumably to avoid relying on >> of a negative signed value - exact here because -32 is divisible by 2 at every step
+    for (uint32_t i = 0; i < shift; i++) {
+        firstShift /= 2;
+    }
+    int64_t secondShift = bufferMemory[0];
+    for (uint32_t i = 0; i < notPowerOfTwoShift; i++) {
+        secondShift /= 2;
+    }
+    int64_t executeSecondShift = bufferMemory[0];
+    for (uint32_t i = 0; i < expectedUsedShift; i++) {
+        executeSecondShift /= 2;
+    }
+
+    expectMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress()), &firstShift, sizeof(uint32_t)); // only the first sizeof(uint32_t) bytes of each 64-bit expected value are compared
+    expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress() + 4), &secondShift, sizeof(uint32_t)); // the second result must NOT match a shift by 5 ...
+    expectMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress() + 4), &executeSecondShift, sizeof(uint32_t)); // ... it must match a shift by expectedUsedShift
+}
+} // namespace NEO