Add command queue aub tests

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2022-01-24 16:37:55 +00:00
committed by Compute-Runtime-Automation
parent 010186d0da
commit 43e147d84f
10 changed files with 2269 additions and 1 deletion

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2018-2021 Intel Corporation
# Copyright (C) 2018-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -27,6 +27,20 @@ target_sources(igdrcl_aub_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_copy_read_buffer_aub_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_copy_read_buffer_aub_tests.h
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_image_aub_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/single_tile_products_excludes.cpp
)
if(TESTS_XEHP_AND_LATER)
# AUB tests compiled only when XeHP-and-later product testing is enabled
target_sources(igdrcl_aub_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/aub_enqueue_resource_barrier_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_inline_data_local_id_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_multicontext_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_one_va_multi_physical_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_postsync_write_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_scratch_space_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compression_aub_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/multi_tile_buffers_aub_tests_xehp_and_later.cpp
)
endif()
add_subdirectories()

View File

@@ -0,0 +1,112 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/command_queue/resource_barrier.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/hello_world_fixture.h"
#include "opencl/test/unit_test/helpers/cmd_buffer_validator.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
#include "test_traits_common.h"
using namespace NEO;
using ResourceBarrierAubTest = Test<KernelAUBFixture<SimpleKernelFixture>>;
// Matcher restricting HWTEST2_F instantiation to products that both support
// the XE_HP command set and report l3ControlSupported in their test traits.
struct L3ControlSupportedMatcher {
    template <PRODUCT_FAMILY productFamily>
    static constexpr bool isMatched() {
        if constexpr (!HwMapper<productFamily>::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) {
            return false;
        } else {
            // Discarded branch above keeps this TestTraits lookup uninstantiated
            // for products without the command set.
            return TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::l3ControlSupported;
        }
    }
};
// Verifies that enqueueResourceBarrier over two buffers emits exactly one
// L3_CONTROL flush command and that both destination buffers end up with the
// source pattern once the queue is flushed.
HWTEST2_F(ResourceBarrierAubTest, givenAllocationsWhenEnqueueResourceBarrierCalledThenL3FlushCommandWasSubmitted, L3ControlSupportedMatcher) {
    using L3_CONTROL = typename FamilyType::L3_CONTROL;

    constexpr size_t bufferSize = MemoryConstants::pageSize;
    char bufferAMemory[bufferSize];
    char bufferBMemory[bufferSize];
    memset(bufferAMemory, 1, bufferSize);
    memset(bufferBMemory, 129, bufferSize);
    auto retVal = CL_INVALID_VALUE;

    // Source holds the pattern that must land in both destinations.
    auto srcBuffer = std::unique_ptr<Buffer>(Buffer::create(context,
                                                            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                                            bufferSize, bufferAMemory, retVal));
    ASSERT_NE(nullptr, srcBuffer);
    auto dstBuffer1 = std::unique_ptr<Buffer>(Buffer::create(context,
                                                             CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                             bufferSize, bufferBMemory, retVal));
    ASSERT_NE(nullptr, dstBuffer1);
    auto dstBuffer2 = std::unique_ptr<Buffer>(Buffer::create(context,
                                                             CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                             bufferSize, bufferBMemory, retVal));
    ASSERT_NE(nullptr, dstBuffer2);

    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer1.get(),
                                      0, 0,
                                      bufferSize, 0,
                                      nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal); // fix: first copy status was previously overwritten unchecked
    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer2.get(),
                                      0, 0,
                                      bufferSize, 0,
                                      nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Barrier over both destination buffers must produce an L3 flush.
    cl_resource_barrier_descriptor_intel descriptor{};
    cl_resource_barrier_descriptor_intel descriptor2{};
    descriptor.mem_object = dstBuffer1.get();
    descriptor2.mem_object = dstBuffer2.get();
    const cl_resource_barrier_descriptor_intel descriptors[] = {descriptor, descriptor2};
    BarrierCommand bCmd(pCmdQ, descriptors, 2);
    auto sizeUsed = pCmdQ->getCS(0).getUsed(); // parse only commands emitted after this point
    retVal = pCmdQ->enqueueResourceBarrier(&bCmd, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal); // fix: barrier enqueue status was never validated

    LinearStream &l3FlushCmdStream = pCmdQ->getCS(0);
    std::string err;
    // Expect exactly one L3_CONTROL among the commands emitted by the barrier.
    auto cmdBuffOk = expectCmdBuff<FamilyType>(l3FlushCmdStream, sizeUsed,
                                               std::vector<MatchCmd *>{
                                                   new MatchAnyCmd(AnyNumber),
                                                   new MatchHwCmd<FamilyType, L3_CONTROL>(1),
                                                   new MatchAnyCmd(AnyNumber),
                                               },
                                               &err);
    EXPECT_TRUE(cmdBuffOk) << err;

    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer2.get(),
                                      0, 0,
                                      bufferSize, 0,
                                      nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    pCmdQ->flush();

    expectMemory<FamilyType>(reinterpret_cast<void *>(dstBuffer1->getGraphicsAllocation(device->getRootDeviceIndex())->getGpuAddress()),
                             bufferAMemory, bufferSize);
    expectMemory<FamilyType>(reinterpret_cast<void *>(dstBuffer2->getGraphicsAllocation(device->getRootDeviceIndex())->getGpuAddress()),
                             bufferAMemory, bufferSize);
}

View File

@@ -0,0 +1,475 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/array_count.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/test/unit_test/aub_tests/command_stream/aub_command_stream_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
#include "opencl/test/unit_test/indirect_heap/indirect_heap_fixture.h"
using namespace NEO;
// Shared fixture for dispatch-thread-data AUB tests. Each slot in `variables`
// describes one kernel's destination buffer plus host-side shadow copies that
// the tests compare against with expectMemory.
struct AubDispatchThreadDataFixture : public KernelAUBFixture<SimpleKernelFixture> {
    struct TestVariables {
        Buffer *destBuffer = nullptr;
        void *destMemory = nullptr;
        size_t sizeUserMemory = 0;
        size_t sizeWrittenMemory = 0;
        size_t sizeRemainderMemory = 0;
        void *expectedMemory = nullptr;
        void *expectedRemainderMemory = nullptr;
        char *remainderDestMemory = nullptr;
        unsigned int scalarArg = 0;
        size_t typeSize = 0;
        size_t gwsSize = 0;
        size_t lwsSize = 0;
    };
    void SetUp() override {
        KernelAUBFixture<SimpleKernelFixture>::SetUp();
        variablesCount = arrayCount(variables);
        BufferDefaults::context = context;
        // Create a host-memory destination buffer for every slot a subclass
        // configured (sizeUserMemory != 0).
        for (auto &testVars : variables) {
            if (testVars.sizeUserMemory == 0) {
                continue;
            }
            testVars.destBuffer = Buffer::create(
                context,
                CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL,
                testVars.sizeUserMemory,
                nullptr,
                retVal);
            ASSERT_NE(nullptr, testVars.destBuffer);
            testVars.destMemory = reinterpret_cast<void *>(testVars.destBuffer->getCpuAddressForMapping());
        }
    }
    void TearDown() override {
        pCmdQ->flush();
        for (auto &testVars : variables) {
            // delete on nullptr is a no-op, so no guard is needed.
            delete testVars.destBuffer;
            testVars.destBuffer = nullptr;
            if (testVars.expectedMemory != nullptr) {
                alignedFree(testVars.expectedMemory);
                testVars.expectedMemory = nullptr;
            }
            if (testVars.expectedRemainderMemory != nullptr) {
                alignedFree(testVars.expectedRemainderMemory);
                testVars.expectedRemainderMemory = nullptr;
            }
        }
        BufferDefaults::context = nullptr;
        KernelAUBFixture<SimpleKernelFixture>::TearDown();
    }
    std::unique_ptr<DebugManagerStateRestore> debugRestorer;
    TestVariables variables[5] = {};
    size_t variablesCount;
    HardwareParse hwParser;
};
// Fixture enabling inline-data passing; configures kernel 3 (cross-thread data
// larger than one GRF) and kernel 4 (cross-thread data fitting one GRF).
struct InlineDataFixture : AubDispatchThreadDataFixture {
    void SetUp() override {
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        DebugManager.flags.EnablePassInlineData.set(true);
        initializeKernel3Variables();
        initializeKernel4Variables();
        AubDispatchThreadDataFixture::SetUp();
        setUpKernel3();
    }
    // Kernel 4: minimal 1x1 dispatch, no destination buffer.
    void initializeKernel4Variables() {
        kernelIds |= (1 << 4);
        auto &vars = variables[4];
        vars.gwsSize = 1;
        vars.lwsSize = 1;
    }
    // Kernel 3: 128 work items over a 4KB destination buffer.
    void initializeKernel3Variables() {
        kernelIds |= (1 << 3);
        auto &vars = variables[3];
        vars.sizeUserMemory = 4096;
        vars.typeSize = sizeof(unsigned int);
        vars.gwsSize = 128;
        vars.lwsSize = 32;
    }
    // Prepare kernel 3's argument and the expected/remainder shadow buffers:
    // written region becomes zeros, the rest must keep the 0xFE fill.
    void setUpKernel3() {
        auto &vars = variables[3];
        memset(vars.destMemory, 0xFE, vars.sizeUserMemory);
        kernels[3]->setArg(0, vars.destBuffer);
        vars.sizeWrittenMemory = vars.gwsSize * vars.typeSize;
        vars.expectedMemory = alignedMalloc(vars.sizeWrittenMemory, 4096);
        memset(vars.expectedMemory, 0, vars.sizeWrittenMemory);
        vars.sizeRemainderMemory = vars.sizeUserMemory - vars.sizeWrittenMemory;
        vars.expectedRemainderMemory = alignedMalloc(vars.sizeRemainderMemory, 4096);
        memcpy_s(vars.expectedRemainderMemory,
                 vars.sizeRemainderMemory,
                 vars.destMemory,
                 vars.sizeRemainderMemory);
        vars.remainderDestMemory = static_cast<char *>(vars.destMemory) + vars.sizeWrittenMemory;
    }
};
using XeHPAndLaterAubInlineDataTest = Test<InlineDataFixture>;
// Cross-thread data of kernel 4 fits a single GRF, so the walker must carry it
// entirely as inline data.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubInlineDataTest, givenCrossThreadFitIntoSingleGrfWhenInlineDataAllowedThenCopyAllCrossThreadIntoInline) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INLINE_DATA = typename FamilyType::INLINE_DATA;
    if (!HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*kernels[4])) {
        return;
    }

    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {variables[4].gwsSize, 1, 1};
    size_t localWorkSize[3] = {variables[4].lwsSize, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[4].get(), 1, globalWorkOffset,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);
    pCmdQ->flush();

    hwParser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    hwParser.findHardwareCommands<FamilyType>();
    EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*hwParser.itorWalker);
    EXPECT_EQ(1u, walker->getEmitInlineParameter());

    // Emit-local mask mirrors which local-id channels the kernel consumes.
    const auto &localId = kernels[4]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), kernels[4]->getCrossThreadData(), sizeof(INLINE_DATA)));
    // No expectMemory here: only a no-op kernel can have cross-thread data this
    // small, so the test validates COMPUTE_WALKER inline-data copying only.
}
// Kernel 3's cross-thread data exceeds one GRF: the first GRF goes inline in
// the walker, the remainder must be placed in the indirect object heap.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubInlineDataTest, givenCrossThreadSizeMoreThanSingleGrfWhenInlineDataAllowedThenCopyGrfCrossThreadToInline) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INLINE_DATA = typename FamilyType::INLINE_DATA;
    if (!HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*kernels[3])) {
        return;
    }

    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {variables[3].gwsSize, 1, 1};
    size_t localWorkSize[3] = {variables[3].lwsSize, 1, 1};
    IndirectHeap &ih = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 2048);
    auto retVal = pCmdQ->enqueueKernel(kernels[3].get(), 1, globalWorkOffset,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);
    pCmdQ->flush();

    hwParser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    hwParser.findHardwareCommands<FamilyType>();
    EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*hwParser.itorWalker);
    EXPECT_EQ(1u, walker->getEmitInlineParameter());

    const auto &localId = kernels[3]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());

    char *crossThreadData = kernels[3]->getCrossThreadData();
    size_t crossThreadDataSize = kernels[3]->getCrossThreadDataSize();
    constexpr size_t inlineSize = sizeof(INLINE_DATA);
    // First GRF is carried inline by the walker...
    EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadData, inlineSize));
    // ...and whatever is left lands at the base of the indirect heap.
    EXPECT_EQ(0, memcmp(ih.getCpuBase(), crossThreadData + inlineSize, crossThreadDataSize - inlineSize));

    expectMemory<FamilyType>(variables[3].destMemory, variables[3].expectedMemory, variables[3].sizeWrittenMemory);
    expectMemory<FamilyType>(variables[3].remainderDestMemory, variables[3].expectedRemainderMemory, variables[3].sizeRemainderMemory);
}
// Fixture enabling HW-generated local ids; kernel 2 writes a scalar argument
// into a 4KB destination buffer across 256 work items.
struct HwLocalIdsFixture : AubDispatchThreadDataFixture {
    void SetUp() override {
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        DebugManager.flags.EnableHwGenerationLocalIds.set(1);
        initializeKernel2Variables();
        AubDispatchThreadDataFixture::SetUp();
        // Inline data can only be enabled after SetUp, once kernel metadata is loaded.
        if (kernels[2]->getKernelInfo().kernelDescriptor.kernelAttributes.flags.passInlineData) {
            DebugManager.flags.EnablePassInlineData.set(true);
        }
        setUpKernel2();
    }
    void initializeKernel2Variables() {
        kernelIds |= (1 << 2);
        auto &vars = variables[2];
        vars.sizeUserMemory = 4096;
        vars.scalarArg = 0xAA;
        vars.typeSize = sizeof(unsigned int);
        vars.gwsSize = 256;
        vars.lwsSize = 32;
    }
    // Bind kernel 2's arguments and build the expected/remainder shadow
    // buffers: written region filled with scalarArg, the rest stays 0xFE.
    void setUpKernel2() {
        auto &vars = variables[2];
        memset(vars.destMemory, 0xFE, vars.sizeUserMemory);
        kernels[2]->setArg(0, sizeof(vars.scalarArg), &vars.scalarArg);
        kernels[2]->setArg(1, vars.destBuffer);
        vars.sizeWrittenMemory = vars.gwsSize * vars.typeSize;
        vars.expectedMemory = alignedMalloc(vars.sizeWrittenMemory, 4096);
        auto expectedData = static_cast<unsigned int *>(vars.expectedMemory);
        for (size_t i = 0; i < vars.gwsSize; i++) {
            expectedData[i] = vars.scalarArg;
        }
        vars.sizeRemainderMemory = vars.sizeUserMemory - vars.sizeWrittenMemory;
        vars.expectedRemainderMemory = alignedMalloc(vars.sizeRemainderMemory, 4096);
        memcpy_s(vars.expectedRemainderMemory,
                 vars.sizeRemainderMemory,
                 vars.destMemory,
                 vars.sizeRemainderMemory);
        vars.remainderDestMemory = static_cast<char *>(vars.destMemory) + vars.sizeWrittenMemory;
    }
};
using XeHPAndLaterAubHwLocalIdsTest = Test<HwLocalIdsFixture>;
// Pow2 GWS/LWS: walker must request HW local-id generation and its kernel
// start pointer must skip the per-thread-data load prologue.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubHwLocalIdsTest, WhenEnqueueDimensionsArePow2ThenSetEmitLocalIdsAndGenerateLocalIdsFields) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {variables[2].gwsSize, 1, 1};
    size_t localWorkSize[3] = {variables[2].lwsSize, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[2].get(), 1, globalWorkOffset,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    parser.findHardwareCommands<FamilyType>();
    EXPECT_NE(parser.itorWalker, parser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*parser.itorWalker);

    const auto &localId = kernels[2]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(1u, walker->getGenerateLocalId());

    // With HW-generated ids the ISA prologue that loads per-thread data is
    // skipped, so the start pointer is offset by skipPerThreadDataLoad.
    auto kernelAllocationGpuAddr = kernels[2]->getKernelInfo().kernelAllocation->getGpuAddressToPatch();
    auto skipOffset = kernels[2]->getKernelInfo().kernelDescriptor.entryPoints.skipPerThreadDataLoad;
    uint64_t kernelStartPointer = kernelAllocationGpuAddr + skipOffset;
    INTERFACE_DESCRIPTOR_DATA &idd = walker->getInterfaceDescriptor();
    EXPECT_EQ(static_cast<uint32_t>(kernelStartPointer), idd.getKernelStartPointer());

    pCmdQ->flush();
    expectMemory<FamilyType>(variables[2].destMemory, variables[2].expectedMemory, variables[2].sizeWrittenMemory);
    expectMemory<FamilyType>(variables[2].remainderDestMemory, variables[2].expectedRemainderMemory, variables[2].sizeRemainderMemory);
}
// Non-pow2 LWS (200) with a compatible walk order: HW local-id generation must
// still be used and the output data must verify.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubHwLocalIdsTest, givenNonPowOf2LocalWorkSizeButCompatibleWorkOrderWhenLocalIdsAreUsedThenDataVerifiesCorrectly) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    size_t globalWorkSize[3] = {200, 1, 1};
    size_t localWorkSize[3] = {200, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[2].get(), 1, nullptr,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    parser.findHardwareCommands<FamilyType>();
    EXPECT_NE(parser.itorWalker, parser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*parser.itorWalker);

    const auto &localId = kernels[2]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(1u, walker->getGenerateLocalId());
    EXPECT_EQ(4u, walker->getWalkOrder());

    pCmdQ->flush();
    // Only the written region (gws * typeSize) is verified here.
    expectMemory<FamilyType>(variables[2].destMemory, variables[2].expectedMemory, globalWorkSize[0] * variables[2].typeSize);
}
// Fixture pairing HW-generated local ids with a kernel that uses subgroups
// (kernel id 9). The destination buffer is zeroed here; expectedMemory is
// allocated here but filled by the test itself before comparison.
struct HwLocalIdsWithSubGroups : AubDispatchThreadDataFixture {
void SetUp() override {
debugRestorer = std::make_unique<DebugManagerStateRestore>();
DebugManager.flags.EnableHwGenerationLocalIds.set(1);
kernelIds |= (1 << 9);
variables[0].sizeUserMemory = 16 * KB;
AubDispatchThreadDataFixture::SetUp();
memset(variables[0].destMemory, 0, variables[0].sizeUserMemory);
variables[0].expectedMemory = alignedMalloc(variables[0].sizeUserMemory, 4096);
kernels[9]->setArg(0, variables[0].destBuffer);
}
};
using XeHPAndLaterAubHwLocalIdsWithSubgroupsTest = Test<HwLocalIdsWithSubGroups>;
// Subgroup kernel with HW-generated local ids: each work item writes its local
// id, so the destination must contain the sequence 0..lws-1.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubHwLocalIdsWithSubgroupsTest, givenKernelUsingSubgroupsWhenLocalIdsAreGeneratedByHwThenValuesAreCorrect) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    size_t globalWorkSize[3] = {200, 1, 1};
    size_t localWorkSize[3] = {200, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[9].get(), 1, nullptr,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    parser.findHardwareCommands<FamilyType>();
    EXPECT_NE(parser.itorWalker, parser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*parser.itorWalker);

    const auto &localId = kernels[9]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(1u, walker->getGenerateLocalId());
    EXPECT_EQ(4u, walker->getWalkOrder());

    pCmdQ->finish();

    // Build the expected sequence of local ids 0..199 on the host side.
    auto expectedIds = reinterpret_cast<uint32_t *>(variables[0].expectedMemory);
    for (uint32_t workItem = 0; workItem < localWorkSize[0]; workItem++) {
        expectedIds[workItem] = workItem;
    }
    expectMemory<FamilyType>(variables[0].destMemory, variables[0].expectedMemory, localWorkSize[0] * sizeof(uint32_t));
}

View File

@@ -0,0 +1,620 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_container/walker_partition_xehp_and_later.h"
#include "shared/source/command_stream/aub_command_stream_receiver_hw.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/helpers/cl_memory_properties_helpers.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/test/unit_test/aub_tests/fixtures/multicontext_aub_fixture.h"
#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
using namespace NEO;
// Harness templated on tile count and the set of enabled command streamers.
// The MulticontextAubFixture base provides tileDevices and a commandQueues
// matrix indexed [tile][engine]; each helper submits work on every pair.
template <uint32_t numberOfTiles, MulticontextAubFixture::EnabledCommandStreamers enabledCommandStreamers>
struct MultitileMulticontextTests : public MulticontextAubFixture, public ::testing::Test {
void SetUp() override {
MulticontextAubFixture::SetUp(numberOfTiles, enabledCommandStreamers, false);
}
void TearDown() override {
MulticontextAubFixture::TearDown();
}
// Writes a known pattern through every queue into a context-wide buffer and a
// tile-local buffer, then verifies both with per-(tile, engine) expectMemory.
template <typename FamilyType>
void runAubTest() {
cl_int retVal = CL_SUCCESS;
const uint32_t bufferSize = 64 * KB;
uint8_t writePattern[bufferSize];
uint8_t initPattern[bufferSize];
std::fill(writePattern, writePattern + sizeof(writePattern), 1);
std::fill(initPattern, initPattern + sizeof(initPattern), 0);
std::vector<std::vector<std::unique_ptr<Buffer>>> regularBuffers;
std::vector<std::vector<std::unique_ptr<Buffer>>> tileOnlyBuffers;
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
regularBuffers.resize(tileDevices.size());
tileOnlyBuffers.resize(tileDevices.size());
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
// CPU copy is allowed only for the initial fill; it is disabled again below
// so the enqueueWriteBuffer calls go through the GPU path under test.
DebugManager.flags.DoCpuCopyOnWriteBuffer.set(true);
auto memoryProperties = ClMemoryPropertiesHelper::createMemoryProperties(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0, 0,
&context->getDevice(0)->getDevice());
auto regularBuffer = Buffer::create(
context.get(), memoryProperties, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0, bufferSize, initPattern, retVal);
// Tile-only buffer is associated with this tile's nearest generic subdevice.
auto tileOnlyProperties = ClMemoryPropertiesHelper::createMemoryProperties(
flags, 0, 0, context->getDevice(0)->getDevice().getNearestGenericSubDevice(tile));
auto tileOnlyBuffer = Buffer::create(context.get(), tileOnlyProperties, flags, 0, bufferSize, initPattern, retVal);
DebugManager.flags.DoCpuCopyOnWriteBuffer.set(false);
regularBuffer->forceDisallowCPUCopy = true;
tileOnlyBuffer->forceDisallowCPUCopy = true;
regularBuffers[tile].push_back(std::unique_ptr<Buffer>(regularBuffer));
tileOnlyBuffers[tile].push_back(std::unique_ptr<Buffer>(tileOnlyBuffer));
commandQueues[tile][tileEngine]->enqueueWriteBuffer(regularBuffer, CL_FALSE, 0, bufferSize, writePattern, nullptr, 0, nullptr, nullptr);
commandQueues[tile][tileEngine]->enqueueWriteBuffer(tileOnlyBuffer, CL_FALSE, 0, bufferSize, writePattern, nullptr, 0, nullptr, nullptr);
commandQueues[tile][tileEngine]->flush();
}
}
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
// Wait for the simulated CSR before checking memory contents.
getSimulatedCsr<FamilyType>(tile, tileEngine)->pollForCompletion();
auto regularBufferGpuAddress = static_cast<uintptr_t>(regularBuffers[tile][tileEngine]->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
auto tileOnlyBufferGpuAddress = static_cast<uintptr_t>(tileOnlyBuffers[tile][tileEngine]->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
expectMemory<FamilyType>(reinterpret_cast<void *>(regularBufferGpuAddress), writePattern, bufferSize, tile, tileEngine);
expectMemory<FamilyType>(reinterpret_cast<void *>(tileOnlyBufferGpuAddress), writePattern, bufferSize, tile, tileEngine);
}
}
}
// Writes a 5x5 RGBA float image through every queue, reads it back, and
// verifies the read-back row by row against the source pattern.
template <typename FamilyType>
void runAubWriteImageTest() {
if (!tileDevices[0]->getSharedDeviceInfo().imageSupport) {
GTEST_SKIP();
}
cl_int retVal = CL_SUCCESS;
auto testWidth = 5u;
auto testHeight = 5u;
auto testDepth = 1u;
auto numPixels = testWidth * testHeight * testDepth;
cl_image_format imageFormat;
imageFormat.image_channel_data_type = CL_FLOAT;
imageFormat.image_channel_order = CL_RGBA;
cl_mem_flags flags = 0;
auto surfaceFormat = Image::getSurfaceFormatFromTable(flags, &imageFormat, context->getDevice(0)->getHardwareInfo().capabilityTable.supportsOcl21Features);
cl_image_desc imageDesc;
imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
imageDesc.image_width = testWidth;
imageDesc.image_height = testHeight;
imageDesc.image_depth = testDepth;
imageDesc.image_array_size = 1;
imageDesc.image_row_pitch = 0;
imageDesc.image_slice_pitch = 0;
imageDesc.num_mip_levels = 0;
imageDesc.num_samples = 0;
imageDesc.mem_object = NULL;
auto perChannelDataSize = 4u;
auto numChannels = 4u;
auto elementSize = perChannelDataSize * numChannels;
// Source pattern: byte i holds value i (mod 256).
auto srcMemory = (uint8_t *)alignedMalloc(elementSize * numPixels, MemoryConstants::pageSize);
for (size_t i = 0; i < numPixels * elementSize; ++i) {
auto origValue = static_cast<uint8_t>(i);
memcpy(srcMemory + i, &origValue, sizeof(origValue));
}
size_t origin[3] = {0, 0, 0};
const size_t region[3] = {testWidth, testHeight, testDepth};
size_t inputRowPitch = testWidth * elementSize;
size_t inputSlicePitch = inputRowPitch * testHeight;
std::vector<std::vector<std::unique_ptr<Image>>> images;
images.resize(tileDevices.size());
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
Image *dstImage = Image::create(
context.get(),
ClMemoryPropertiesHelper::createMemoryProperties(flags, 0, 0, &context->getDevice(0)->getDevice()),
flags,
0,
surfaceFormat,
&imageDesc,
nullptr,
retVal);
// NOTE(review): an early return from this ASSERT would leak srcMemory
// (alignedFree is only reached at the end) — acceptable for a failing
// test, but worth confirming.
ASSERT_NE(nullptr, dstImage);
memset(dstImage->getCpuAddress(), 0xFF, dstImage->getSize());
retVal = commandQueues[tile][tileEngine]->enqueueWriteImage(
dstImage,
CL_FALSE,
origin,
region,
inputRowPitch,
inputSlicePitch,
srcMemory,
nullptr,
0,
nullptr,
nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
images[tile].push_back(std::unique_ptr<Image>(dstImage));
}
}
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
commandQueues[tile][tileEngine]->flush();
}
}
std::unique_ptr<uint8_t[]> dstMemory;
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
dstMemory.reset(new uint8_t[images[tile][tileEngine]->getSize()]);
memset(dstMemory.get(), 0xFF, images[tile][tileEngine]->getSize());
commandQueues[tile][tileEngine]->enqueueReadImage(
images[tile][tileEngine].get(), CL_FALSE, origin, region, 0, 0, dstMemory.get(), nullptr, 0, nullptr, nullptr);
commandQueues[tile][tileEngine]->flush();
auto rowPitch = images[tile][tileEngine]->getHostPtrRowPitch();
auto slicePitch = images[tile][tileEngine]->getHostPtrSlicePitch();
auto pSrcMemory = srcMemory;
auto pDstMemory = dstMemory.get();
// Compare row by row; dst may have padding between rows/slices.
for (size_t z = 0; z < testDepth; ++z) {
for (size_t y = 0; y < testHeight; ++y) {
expectMemory<FamilyType>(pDstMemory, pSrcMemory, testWidth * elementSize, tile, tileEngine);
pSrcMemory = ptrOffset(pSrcMemory, testWidth * elementSize);
pDstMemory = ptrOffset(pDstMemory, rowPitch);
}
pDstMemory = ptrOffset(pDstMemory, slicePitch - (rowPitch * (testHeight > 0 ? testHeight : 1)));
}
}
}
alignedFree(srcMemory);
}
};
// 4 Tiles
// Same submission/verification flow (runAubTest) with progressively fewer
// enabled command streamers per tile: All -> Dual -> Single.
using FourTilesAllContextsTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using FourTilesDualContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesDualContextTest, HEAVY_givenFourTilesAndDualContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using FourTilesSingleContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
// Fixture for walker-partition enqueues on a four-tile, single-context setup:
// enables walker partitioning, builds kernels 5 and 8, and creates a 16KB
// zero-filled buffer shared by the tests via clBuffer.
// NOTE(review): EnableWalkerPartition is set without a DebugManagerStateRestore
// in this struct (only the Dynamic* subclass holds one) — confirm the flag is
// restored elsewhere between tests.
struct EnqueueWithWalkerPartitionFourTilesTests : public FourTilesSingleContextTest, SimpleKernelFixture {
void SetUp() override {
DebugManager.flags.EnableWalkerPartition.set(1u);
kernelIds |= (1 << 5);
kernelIds |= (1 << 8);
FourTilesSingleContextTest::SetUp();
SimpleKernelFixture::SetUp(rootDevice, context.get());
rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver;
// Root CSR must span all four tiles for partitioned dispatch.
EXPECT_EQ(4u, rootCsr->getOsContext().getNumSupportedDevices());
engineControlForFusedQueue = {rootCsr, &rootCsr->getOsContext()};
bufferSize = 16 * MemoryConstants::kiloByte;
auto destMemory = std::make_unique<uint8_t[]>(bufferSize);
memset(destMemory.get(), 0x0, bufferSize);
cl_int retVal = CL_SUCCESS;
// CL_MEM_COPY_HOST_PTR copies destMemory, so the local allocation may go out of scope.
buffer.reset(Buffer::create(multiTileDefaultContext.get(), CL_MEM_COPY_HOST_PTR, bufferSize, destMemory.get(), retVal));
clBuffer = buffer.get();
}
void TearDown() override {
SimpleKernelFixture::TearDown();
FourTilesSingleContextTest::TearDown();
}
// Convenience accessor for AUB expectMemory against the buffer's GPU VA.
void *getGpuAddress(Buffer &buffer) {
return reinterpret_cast<void *>(buffer.getGraphicsAllocation(this->rootDeviceIndex)->getGpuAddress());
}
uint32_t bufferSize = 0;
std::unique_ptr<Buffer> buffer;
cl_mem clBuffer;
EngineControl engineControlForFusedQueue = {};
CommandStreamReceiver *rootCsr = nullptr;
};
// Variant forcing dynamic (non-static) partitioning; the restore member
// reverts all DebugManager flags touched during the test.
struct DynamicWalkerPartitionFourTilesTests : EnqueueWithWalkerPartitionFourTilesTests {
void SetUp() override {
DebugManager.flags.EnableStaticPartitioning.set(0);
EnqueueWithWalkerPartitionFourTilesTests::SetUp();
}
DebugManagerStateRestore restore{};
};
// Dispatches kernel 5 (uses an atomic) with dynamic walker partitioning across
// four tiles, then checks the cross-tile synchronization semaphore and the
// per-workgroup output data.
HWCMDTEST_F(IGFX_XE_HP_CORE, DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithAtomicThenOutputDataIsValid) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
// Queue ownership transfers to commandQueues[0][0] via reset().
auto mockCommandQueue = new MockCommandQueueHw<FamilyType>(multiTileDefaultContext.get(), rootDevice, nullptr);
commandQueues[0][0].reset(mockCommandQueue);
constexpr size_t globalWorkOffset[] = {0, 0, 0};
constexpr size_t gwsSize[] = {512, 1, 1};
constexpr size_t lwsSize[] = {32, 1, 1};
constexpr cl_uint workingDimensions = 1;
cl_int retVal = CL_SUCCESS;
kernels[5]->setArg(0, sizeof(cl_mem), &clBuffer);
retVal = mockCommandQueue->enqueueKernel(kernels[5].get(), workingDimensions, globalWorkOffset, gwsSize, lwsSize, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
mockCommandQueue->flush();
HardwareParse hwParser;
auto &cmdStream = mockCommandQueue->getCS(0);
hwParser.parseCommands<FamilyType>(cmdStream, 0);
// Scan backwards for the final tile-sync semaphore (skipping helper waits);
// it must wait for all 4 tiles when a stalling pipe control is required.
bool lastSemaphoreFound = false;
uint64_t tileAtomicGpuAddress = 0;
for (auto it = hwParser.cmdList.rbegin(); it != hwParser.cmdList.rend(); it++) {
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*it);
if (semaphoreCmd) {
if (UnitTestHelper<FamilyType>::isAdditionalMiSemaphoreWait(*semaphoreCmd)) {
continue;
}
EXPECT_EQ(4u, semaphoreCmd->getSemaphoreDataDword());
tileAtomicGpuAddress = semaphoreCmd->getSemaphoreGraphicsAddress();
lastSemaphoreFound = true;
break;
}
}
if (ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired()) {
EXPECT_TRUE(lastSemaphoreFound);
EXPECT_NE(0u, tileAtomicGpuAddress);
} else {
EXPECT_FALSE(lastSemaphoreFound);
EXPECT_EQ(0u, tileAtomicGpuAddress);
}
// Kernel writes total work-item count at offset 0.
expectMemory<FamilyType>(getGpuAddress(*buffer), &gwsSize[workingDimensions - 1], sizeof(uint32_t), 0, 0);
uint32_t expectedAtomicValue = 4;
if (ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired()) {
expectMemory<FamilyType>(reinterpret_cast<void *>(tileAtomicGpuAddress), &expectedAtomicValue, sizeof(uint32_t), 0, 0);
}
// Each workgroup counter (starting at offset 4) must equal its local size.
constexpr uint32_t workgroupCount = static_cast<uint32_t>(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]);
auto groupSpecificWorkCounts = ptrOffset(getGpuAddress(*buffer), 4);
std::array<uint32_t, workgroupCount> workgroupCounts;
std::fill(workgroupCounts.begin(), workgroupCounts.end(), static_cast<uint32_t>(lwsSize[workingDimensions - 1]));
expectMemory<FamilyType>(groupSpecificWorkCounts, &workgroupCounts[0], workgroupCounts.size() * sizeof(uint32_t), 0, 0);
}
// Dynamic walker partitioning with a kernel that uses no atomics (kernels[8]):
// the kernel receives kernelIncrementCounter as its second argument and every
// workgroup is expected to write that value into its buffer slot, so the whole
// output buffer must contain kernelIncrementCounter after the enqueue.
// Fix: removed the unused local alias MI_SEMAPHORE_WAIT — unlike the atomic
// variant above, this test never parses the command stream.
HWCMDTEST_F(IGFX_XE_HP_CORE, DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithoutAtomicThenOutputDataIsValid) {
    auto mockCommandQueue = new MockCommandQueueHw<FamilyType>(multiTileDefaultContext.get(), rootDevice, nullptr);
    commandQueues[0][0].reset(mockCommandQueue); // queue container takes ownership

    constexpr size_t globalWorkOffset[3] = {0, 0, 0};
    constexpr size_t gwsSize[3] = {1024, 1, 1};
    constexpr size_t lwsSize[3] = {32, 1, 1};
    constexpr cl_uint workingDimensions = 1;
    cl_uint kernelIncrementCounter = 1024;
    cl_int retVal = CL_SUCCESS;

    kernels[8]->setArg(0, sizeof(cl_mem), &clBuffer);
    kernels[8]->setArg(1, kernelIncrementCounter);
    retVal = mockCommandQueue->enqueueKernel(kernels[8].get(), workingDimensions, globalWorkOffset, gwsSize, lwsSize, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    mockCommandQueue->flush();

    // 1024 / 32 = 32 workgroups; every slot must equal kernelIncrementCounter.
    constexpr uint32_t workgroupCount = static_cast<uint32_t>(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]);
    std::array<uint32_t, workgroupCount> workgroupCounts;
    std::fill(workgroupCounts.begin(), workgroupCounts.end(), kernelIncrementCounter);
    expectMemory<FamilyType>(getGpuAddress(*buffer), &workgroupCounts[0], workgroupCounts.size() * sizeof(uint32_t), 0, 0);
}
// Four-tile fixture variant that forces static walker partitioning and enables
// blitter support. Also provides helpers to build, submit and verify a
// hand-crafted command stream through the root (multi-tile) CSR.
struct StaticWalkerPartitionFourTilesTests : EnqueueWithWalkerPartitionFourTilesTests {
void SetUp() override {
// Must precede base SetUp so CSR creation observes the flags.
DebugManager.flags.EnableStaticPartitioning.set(1);
DebugManager.flags.EnableBlitterOperationsSupport.set(1);
EnqueueWithWalkerPartitionFourTilesTests::SetUp();
}
// Allocates a page-sized COMMAND_BUFFER allocation and wraps it in a
// LinearStream. Caller must release it via destroyTaskStream().
std::unique_ptr<LinearStream> createTaskStream() {
const AllocationProperties commandStreamAllocationProperties{rootDevice->getRootDeviceIndex(),
true,
MemoryConstants::pageSize,
GraphicsAllocation::AllocationType::COMMAND_BUFFER,
true,
false,
rootDevice->getDeviceBitfield()};
GraphicsAllocation *streamAllocation = rootDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
return std::make_unique<LinearStream>(streamAllocation);
}
// Frees the allocation backing a stream created by createTaskStream().
void destroyTaskStream(LinearStream &stream) {
rootDevice->getMemoryManager()->freeGraphicsMemory(stream.getGraphicsAllocation());
}
// Submits the stream through the root CSR with default dispatch flags
// (plus a guarding pipe control) and flushes any batched submissions.
void flushTaskStream(LinearStream &stream) {
DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
dispatchFlags.guardCommandBufferWithPipeControl = true;
rootCsr->flushTask(stream, 0,
rootCsr->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
rootCsr->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
rootCsr->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
0u, dispatchFlags, rootDevice->getDevice());
rootCsr->flushBatchedSubmissions();
}
// Compares AUB memory through the root CSR's simulated stream.
template <typename FamilyType>
void expectMemoryOnRootCsr(void *gfxAddress, const void *srcAddress, size_t length) {
auto csr = static_cast<AUBCommandStreamReceiverHw<FamilyType> *>(rootCsr);
csr->expectMemoryEqual(gfxAddress, srcAddress, length);
}
DebugManagerStateRestore restore{}; // restores debug flags at teardown
};
// Static walker partitioning across four tiles with kernels[8]: every
// workgroup writes the kernelIncrementCounter argument into its output slot,
// so the whole buffer must hold that value. Verification goes through
// expectMemoryOnRootCsr, i.e. the root CSR's AUB stream.
// Fix: removed the unused local alias MI_SEMAPHORE_WAIT — this test performs
// no command-stream parsing.
HWCMDTEST_F(IGFX_XE_HP_CORE, StaticWalkerPartitionFourTilesTests, givenFourTilesWhenStaticWalkerPartitionIsEnabledForKernelThenOutputDataIsValid) {
    auto mockCommandQueue = new MockCommandQueueHw<FamilyType>(multiTileDefaultContext.get(), rootDevice, nullptr);
    commandQueues[0][0].reset(mockCommandQueue); // queue container takes ownership

    constexpr size_t globalWorkOffset[3] = {0, 0, 0};
    constexpr size_t gwsSize[3] = {1024, 1, 1};
    constexpr size_t lwsSize[3] = {32, 1, 1};
    constexpr cl_uint workingDimensions = 1;
    cl_uint kernelIncrementCounter = 1024;
    cl_int retVal = CL_SUCCESS;

    kernels[8]->setArg(0, sizeof(cl_mem), &clBuffer);
    kernels[8]->setArg(1, kernelIncrementCounter);
    retVal = mockCommandQueue->enqueueKernel(kernels[8].get(), workingDimensions, globalWorkOffset, gwsSize, lwsSize, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    mockCommandQueue->flush();

    // 1024 / 32 = 32 workgroups; every slot must equal kernelIncrementCounter.
    constexpr uint32_t workgroupCount = static_cast<uint32_t>(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]);
    std::array<uint32_t, workgroupCount> workgroupCounts;
    std::fill(workgroupCounts.begin(), workgroupCounts.end(), kernelIncrementCounter);
    expectMemoryOnRootCsr<FamilyType>(getGpuAddress(*buffer), &workgroupCounts[0], workgroupCounts.size() * sizeof(uint32_t));
}
// Builds a statically-partitioned walker command buffer by hand with
// synchronizeBeforeExecution enabled, submits it through the root CSR, and
// checks that both the pre- and post-walker sync atomics reached 4
// (one increment per tile on a four-tile device bitfield).
HWCMDTEST_F(IGFX_XE_HP_CORE, StaticWalkerPartitionFourTilesTests, givenPreWalkerSyncWhenStaticWalkerPartitionIsThenAtomicsAreIncrementedCorrectly) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto taskStream = createTaskStream();
auto taskStreamCpu = taskStream->getSpace(0); // current CPU write pointer, nothing reserved yet
auto taskStreamGpu = taskStream->getGraphicsAllocation()->getGpuAddress();
uint32_t totalBytesProgrammed = 0u;
WALKER_TYPE walkerCmd = FamilyType::cmdInitGpgpuWalker;
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X);
walkerCmd.getInterfaceDescriptor().setNumberOfThreadsInGpgpuThreadGroup(1u);
// Walker-partition arguments: one partition per tile, static partitioning,
// with cross-tile synchronization both before and after the walker.
WalkerPartition::WalkerPartitionArgs testArgs = {};
testArgs.initializeWparidRegister = true;
testArgs.crossTileAtomicSynchronization = true;
testArgs.emitPipeControlStall = true;
testArgs.tileCount = static_cast<uint32_t>(rootDevice->getDeviceBitfield().count());
testArgs.partitionCount = testArgs.tileCount;
testArgs.synchronizeBeforeExecution = true;
testArgs.secondaryBatchBuffer = false;
testArgs.emitSelfCleanup = false;
testArgs.staticPartitioning = true;
testArgs.workPartitionAllocationGpuVa = rootCsr->getWorkPartitionAllocationGpuAddress();
WalkerPartition::constructStaticallyPartitionedCommandBuffer<FamilyType>(
taskStreamCpu,
taskStreamGpu,
&walkerCmd,
totalBytesProgrammed,
testArgs,
*defaultHwInfo);
taskStream->getSpace(totalBytesProgrammed); // commit the bytes written above
flushTaskStream(*taskStream);
// The control section (sync counters) lives at a computed offset in the stream.
const auto controlSectionAddress = taskStreamGpu + WalkerPartition::computeStaticPartitioningControlSectionOffset<FamilyType>(testArgs);
const auto preWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
const auto postWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
// Both counters must equal the tile count (4) after execution.
uint32_t expectedValue = 0x4;
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(preWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(postWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
destroyTaskStream(*taskStream);
}
// Same hand-built static-partitioning scenario as above, but with
// synchronizeBeforeExecution disabled: the pre-walker counter must remain 0
// while the post-walker counter still reaches 4.
HWCMDTEST_F(IGFX_XE_HP_CORE, StaticWalkerPartitionFourTilesTests, whenNoPreWalkerSyncThenAtomicsAreIncrementedCorrectly) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto taskStream = createTaskStream();
auto taskStreamCpu = taskStream->getSpace(0); // current CPU write pointer, nothing reserved yet
auto taskStreamGpu = taskStream->getGraphicsAllocation()->getGpuAddress();
uint32_t totalBytesProgrammed = 0u;
WALKER_TYPE walkerCmd = FamilyType::cmdInitGpgpuWalker;
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X);
walkerCmd.getInterfaceDescriptor().setNumberOfThreadsInGpgpuThreadGroup(1u);
// Identical to the pre-walker-sync test except synchronizeBeforeExecution.
WalkerPartition::WalkerPartitionArgs testArgs = {};
testArgs.initializeWparidRegister = true;
testArgs.crossTileAtomicSynchronization = true;
testArgs.emitPipeControlStall = true;
testArgs.tileCount = static_cast<uint32_t>(rootDevice->getDeviceBitfield().count());
testArgs.partitionCount = testArgs.tileCount;
testArgs.synchronizeBeforeExecution = false;
testArgs.secondaryBatchBuffer = false;
testArgs.emitSelfCleanup = false;
testArgs.staticPartitioning = true;
testArgs.workPartitionAllocationGpuVa = rootCsr->getWorkPartitionAllocationGpuAddress();
WalkerPartition::constructStaticallyPartitionedCommandBuffer<FamilyType>(
taskStreamCpu,
taskStreamGpu,
&walkerCmd,
totalBytesProgrammed,
testArgs,
*defaultHwInfo);
taskStream->getSpace(totalBytesProgrammed); // commit the bytes written above
flushTaskStream(*taskStream);
const auto controlSectionAddress = taskStreamGpu + WalkerPartition::computeStaticPartitioningControlSectionOffset<FamilyType>(testArgs);
const auto preWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
const auto postWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
// No pre-walker sync was requested, so its counter stays untouched (0)...
uint32_t expectedValue = 0x0;
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(preWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
// ...while the post-walker counter still accumulates one increment per tile.
expectedValue = 0x4;
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(postWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
destroyTaskStream(*taskStream);
}
// 2 Tiles
// Submission smoke tests: run the shared runAubTest scenario on a two-tile
// configuration with All / Dual / Single command streamers enabled per tile.
using TwoTilesAllContextsTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesAllContextsTest, HEAVY_givenTwoTilesAndAllContextsWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using TwoTilesDualContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using TwoTilesSingleContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
// 1 Tile
// Same submission scenario on a single-tile configuration.
using SingleTileAllContextsTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileAllContextsTest, GENERATEONLY_givenSingleTileAndAllContextsWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using SingleTileDualContextTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileDualContextTest, givenSingleTileAndDualContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
// Writes the two halves of one buffer from two different contexts (queues),
// completing the second submission first, then verifies each half through the
// context that wrote it.
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileDualContextTest, givenSingleAllocationWhenUpdatedFromDifferentContextThenDataIsValid) {
    cl_int retVal = CL_SUCCESS;
    const uint32_t totalSize = 256;
    const uint32_t halfSize = totalSize / 2;

    uint8_t zeroedMemory[totalSize];
    uint8_t firstHalfPattern[halfSize];
    uint8_t secondHalfPattern[halfSize];
    std::fill(std::begin(zeroedMemory), std::end(zeroedMemory), 0);
    std::fill(std::begin(firstHalfPattern), std::end(firstHalfPattern), 1);
    std::fill(std::begin(secondHalfPattern), std::end(secondHalfPattern), 2);

    std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, totalSize, zeroedMemory, retVal));
    buffer->forceDisallowCPUCopy = true; // make sure the GPU path is exercised

    auto simulatedCsr0 = getSimulatedCsr<FamilyType>(0, 0);
    auto simulatedCsr1 = getSimulatedCsr<FamilyType>(0, 1);
    simulatedCsr0->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
    simulatedCsr1->overrideDispatchPolicy(DispatchMode::BatchedDispatch);

    commandQueues[0][0]->enqueueWriteBuffer(buffer.get(), CL_FALSE, 0, halfSize, firstHalfPattern, nullptr, 0, nullptr, nullptr);
    commandQueues[0][1]->enqueueWriteBuffer(buffer.get(), CL_FALSE, halfSize, halfSize, secondHalfPattern, nullptr, 0, nullptr, nullptr);

    // Finish the second queue first to make sure the residency flow is correct.
    commandQueues[0][1]->finish();
    commandQueues[0][0]->finish();

    auto gpuPtr = reinterpret_cast<void *>(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
    expectMemory<FamilyType>(gpuPtr, firstHalfPattern, halfSize, 0, 0);
    expectMemory<FamilyType>(ptrOffset(gpuPtr, halfSize), secondHalfPattern, halfSize, 0, 1);
}
// 1 Tile
// Image-write variants on a single tile. Note: these aliases re-declare the
// identical aliases defined earlier in this file, which is legal in C++.
using SingleTileDualContextTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileDualContextTest, givenSingleTileAndDualContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using SingleTileAllContextsTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileAllContextsTest, HEAVY_givenSingleTileAndAllContextsWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
// 2 Tiles
// Image-write variants on a two-tile configuration (aliases re-declared
// identically to the earlier definitions, which is legal in C++).
using TwoTilesSingleContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using TwoTilesDualContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using TwoTilesAllContextsTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesAllContextsTest, GENERATEONLY_givenTwoTilesAndAllContextsWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
// 4 Tiles
// Image-write variants on a four-tile configuration.
using FourTilesSingleContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using FourTilesDualContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesDualContextTest, GENERATEONLY_givenFourTilesAndDualContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using FourTilesAllContextsTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}

View File

@@ -0,0 +1,133 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/test/common/test_macros/test.h"
#include "shared/test/unit_test/tests_configuration.h"
#include "shared/test/unit_test/utilities/base_object_utils.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/multicontext_aub_fixture.h"
using namespace NEO;
// Fixture for tests that exercise one GPU virtual address backed by four
// per-tile physical storages (memory banks), using a single command streamer
// per tile.
struct OneVAFourPhysicalStoragesTest : public MulticontextAubFixture, public ::testing::Test {
static const uint32_t numTiles = 4;
void SetUp() override {
MulticontextAubFixture::SetUp(numTiles, MulticontextAubFixture::EnabledCommandStreamers::Single, false);
}
void TearDown() override {
MulticontextAubFixture::TearDown();
}
};
// Writes a distinct pattern directly into each tile's physical bank (via the
// simulated hardware context) behind one shared GPU VA, then reads the buffer
// back from each tile's queue and expects each read to return that tile's
// bank contents.
HWCMDTEST_F(IGFX_XE_HP_CORE, OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueReadBufferThenReadFromCorrectBank) {
if (is32bit) {
return; // scenario is only meaningful on 64-bit builds
}
cl_int retVal = CL_OUT_OF_HOST_MEMORY;
const uint32_t bufferSize = MemoryConstants::pageSize64k;
uint8_t *memoryToWrite[numTiles];
uint8_t *memoryToRead[numTiles];
auto buffer = clUniquePtr<Buffer>(Buffer::create(context.get(), {}, bufferSize, nullptr, retVal));
EXPECT_EQ(CL_SUCCESS, retVal);
buffer->forceDisallowCPUCopy = true; // force the GPU copy path
auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
EXPECT_EQ(MemoryPool::LocalMemory, allocation->getMemoryPool());
auto gpuAddress = allocation->getGpuAddress();
// Detach the allocation from automatic bank handling so the per-bank
// writeMemory2 calls below fully control the physical contents.
allocation->storageInfo.cloningOfPageTables = false;
allocation->storageInfo.memoryBanks = 0;
allocation->setAubWritable(false, static_cast<uint32_t>(maxNBitValue(numTiles)));
for (uint32_t tile = 0; tile < numTiles; tile++) {
memoryToWrite[tile] = reinterpret_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
std::fill(memoryToWrite[tile], ptrOffset(memoryToWrite[tile], bufferSize), tile + 1);
// Write pattern (tile + 1) straight into this tile's bank (1u << tile).
auto hardwareContext = getSimulatedCsr<FamilyType>(tile, 0)->hardwareContextController->hardwareContexts[0].get();
hardwareContext->writeMemory2({gpuAddress, memoryToWrite[tile], bufferSize, (1u << tile), AubMemDump::DataTypeHintValues::TraceNotype, MemoryConstants::pageSize64k});
}
for (uint32_t tile = 0; tile < numTiles; tile++) {
memoryToRead[tile] = reinterpret_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
commandQueues[tile][0]->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, bufferSize, memoryToRead[tile], nullptr, 0, nullptr, nullptr);
commandQueues[tile][0]->flush();
}
// Each tile's read must match the pattern previously written to its bank.
for (uint32_t tile = 0; tile < numTiles; tile++) {
expectMemory<FamilyType>(memoryToRead[tile], memoryToWrite[tile], bufferSize, tile, 0);
alignedFree(memoryToWrite[tile]);
alignedFree(memoryToRead[tile]);
}
}
// Enqueues a distinct write pattern from each tile's queue into the same GPU
// VA and verifies that each tile's physical bank ended up holding the pattern
// written through that tile.
HWCMDTEST_F(IGFX_XE_HP_CORE, OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank) {
    if (is32bit) {
        return; // scenario is only meaningful on 64-bit builds
    }
    cl_int retVal = CL_OUT_OF_HOST_MEMORY;
    const uint32_t bufferSize = MemoryConstants::pageSize64k;
    uint8_t *hostPatterns[numTiles];

    auto buffer = clUniquePtr<Buffer>(Buffer::create(context.get(), {}, bufferSize, nullptr, retVal));
    EXPECT_EQ(CL_SUCCESS, retVal);
    buffer->forceDisallowCPUCopy = true; // force the GPU copy path

    auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
    EXPECT_EQ(MemoryPool::LocalMemory, allocation->getMemoryPool());
    auto gpuAddress = allocation->getGpuAddress();
    allocation->storageInfo.cloningOfPageTables = false;
    allocation->storageInfo.memoryBanks = 0;

    for (uint32_t bank = 0; bank < numTiles; bank++) {
        hostPatterns[bank] = static_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
        std::fill(hostPatterns[bank], ptrOffset(hostPatterns[bank], bufferSize), bank + 1);
        allocation->setAubWritable(true, 0xffffffff); // re-arm AUB dump before each write
        commandQueues[bank][0]->enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, hostPatterns[bank], nullptr, 0, nullptr, nullptr);
    }
    for (uint32_t bank = 0; bank < numTiles; bank++) {
        expectMemory<FamilyType>(reinterpret_cast<void *>(gpuAddress), hostPatterns[bank], bufferSize, bank, 0);
        alignedFree(hostPatterns[bank]);
    }
}
// A buffer of numTiles * 64KB is expected to be "coloured" across all banks
// (one 64KB chunk per bank). One write from a single queue must land
// correctly, and every bank must observe the full coloured contents.
HWCMDTEST_F(IGFX_XE_HP_CORE, OneVAFourPhysicalStoragesTest, givenColouredBufferWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank) {
if (is32bit) {
return; // scenario is only meaningful on 64-bit builds
}
cl_int retVal = CL_OUT_OF_HOST_MEMORY;
const uint32_t bufferSize = numTiles * MemoryConstants::pageSize64k;
const auto allTilesValue = maxNBitValue(numTiles);
uint8_t *memoryToWrite = reinterpret_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
auto buffer = clUniquePtr<Buffer>(Buffer::create(context.get(), {}, bufferSize, nullptr, retVal));
EXPECT_EQ(CL_SUCCESS, retVal);
buffer->forceDisallowCPUCopy = true; // force the GPU copy path
auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
EXPECT_EQ(MemoryPool::LocalMemory, allocation->getMemoryPool());
// Sanity: the allocation spans all banks with page tables visible/cloned everywhere.
EXPECT_EQ(allTilesValue, allocation->storageInfo.memoryBanks.to_ullong());
EXPECT_EQ(allTilesValue, allocation->storageInfo.pageTablesVisibility.to_ullong());
EXPECT_TRUE(allocation->storageInfo.cloningOfPageTables);
// Fill chunk i (64KB) with value i + 1 so each coloured chunk is distinct.
for (uint32_t tile = 0; tile < numTiles; tile++) {
std::fill(ptrOffset(memoryToWrite, tile * MemoryConstants::pageSize64k), ptrOffset(memoryToWrite, (tile + 1) * MemoryConstants::pageSize64k), tile + 1);
}
commandQueues[0][0]->enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, memoryToWrite, nullptr, 0, nullptr, nullptr);
auto gpuAddress = allocation->getGpuAddress();
// Every bank must see the complete coloured pattern, chunk by chunk.
for (uint32_t tile = 0; tile < numTiles; tile++) {
for (uint32_t offset = 0; offset < bufferSize; offset += MemoryConstants::pageSize64k) {
expectMemory<FamilyType>(reinterpret_cast<void *>(gpuAddress + offset), ptrOffset(memoryToWrite, offset), MemoryConstants::pageSize64k, tile, 0);
}
}
alignedFree(memoryToWrite);
}

View File

@@ -0,0 +1,152 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/utilities/tag_allocator.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/hello_world_fixture.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
using namespace NEO;
// Fixture for post-sync (timestamp packet) write tests: enables timestamp
// packets via debug flag before the HelloWorld fixture creates the CSR.
struct PostSyncWriteXeHPTests : public HelloWorldFixture<AUBHelloWorldFixtureFactory>, public ::testing::Test {
void SetUp() override {
DebugManager.flags.EnableTimestampPacket.set(true);
HelloWorldFixture<AUBHelloWorldFixtureFactory>::SetUp();
// Sanity: the flag must have taken effect on the CSR.
EXPECT_TRUE(pCommandStreamReceiver->peekTimestampPacketWriteEnabled());
};
void TearDown() override {
HelloWorldFixture<AUBHelloWorldFixtureFactory>::TearDown();
}
DebugManagerStateRestore restore; // restores debug flags at teardown
cl_int retVal = CL_SUCCESS;
};
// With timestamp packets enabled, a blocking write must both land the data in
// the buffer and overwrite the timestamp packet's initial {1,1,1,1} contents.
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenTimestampWriteEnabledWhenEnqueueingThenWritePostsyncOperation) {
    MockCommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, nullptr);

    const uint32_t bufferSize = 4;
    std::unique_ptr<Buffer> buffer(Buffer::create(pContext, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
    auto allocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex());
    memset(allocation->getUnderlyingBuffer(), 0, allocation->getUnderlyingBufferSize());
    buffer->forceDisallowCPUCopy = true; // make sure the GPU copy path runs

    uint8_t inputData[bufferSize] = {1, 2, 3, 4};
    cmdQ.enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, inputData, nullptr, 0, nullptr, nullptr);

    // The data itself must have been written...
    expectMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress()), inputData, bufferSize);

    // ...and the GPU must have updated the timestamp packet away from its
    // initial value (all four fields start at 1).
    typename FamilyType::TimestampPacketType initialTimestamps[4] = {1, 1, 1, 1};
    auto tagGpuAddress = reinterpret_cast<void *>(cmdQ.timestampPacketContainer->peekNodes().at(0)->getGpuAddress());
    expectMemoryNotEqual<FamilyType>(tagGpuAddress, initialTimestamps, 4 * sizeof(typename FamilyType::TimestampPacketType));
}
// With UseImmDataWriteModeOnPostSyncOperation enabled, the post-sync writes
// use immediate data, so the timestamp packet contents become exactly
// predictable; the expected layout depends on the packet field width.
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenDebugVariableEnabledWhenEnqueueingThenWritePostsyncOperationInImmWriteMode) {
DebugManager.flags.UseImmDataWriteModeOnPostSyncOperation.set(true);
MockCommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, nullptr);
const uint32_t bufferSize = 4;
std::unique_ptr<Buffer> buffer(Buffer::create(pContext, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
auto graphicsAllocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex());
memset(graphicsAllocation->getUnderlyingBuffer(), 0, graphicsAllocation->getUnderlyingBufferSize());
buffer->forceDisallowCPUCopy = true; // make sure the GPU copy path runs
uint8_t writeData[bufferSize] = {1, 2, 3, 4};
cmdQ.enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, writeData, nullptr, 0, nullptr, nullptr);
expectMemory<FamilyType>(reinterpret_cast<void *>(graphicsAllocation->getGpuAddress()), writeData, bufferSize);
auto tagGpuAddress = reinterpret_cast<void *>(cmdQ.timestampPacketContainer->peekNodes().at(0)->getGpuAddress());
constexpr auto timestampPacketTypeSize = sizeof(typename FamilyType::TimestampPacketType);
if constexpr (timestampPacketTypeSize == 4u) {
// 32-bit packet fields: imm-mode writes produce {1, 1, 2, 2}.
typename FamilyType::TimestampPacketType expectedTimestampValues[4] = {1, 1, 2, 2};
expectMemory<FamilyType>(tagGpuAddress, expectedTimestampValues, 4 * timestampPacketTypeSize);
} else {
// 64-bit packet fields: the same bytes reinterpret as {1, 1, 0x2'0000'0002, 1}.
typename FamilyType::TimestampPacketType expectedTimestampValues[4] = {1, 1, 0x2'0000'0002u, 1};
expectMemory<FamilyType>(tagGpuAddress, expectedTimestampValues, 4 * timestampPacketTypeSize);
}
}
// Two batched write enqueues on one buffer: the second enqueue depends on the
// first; after the blocking second write, the buffer must hold the second
// pattern and all four end timestamps (context + global, for both packet
// nodes) must have been overwritten by the GPU (initial value is 1).
// Fix: writePattern2 was filled with 1 (same as writePattern1, copy-paste),
// which made the final data check unable to prove the second write landed;
// it is now filled with 2 so the two patterns are distinguishable.
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenTwoBatchedEnqueuesWhenDependencyIsResolvedThenDecrementCounterOnGpu) {
    MockContext context(pCmdQ->getDevice().getSpecializedDevice<ClDevice>());
    pCommandStreamReceiver->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
    const size_t bufferSize = 1024;
    auto retVal = CL_SUCCESS;
    uint8_t initialMemory[bufferSize] = {};
    uint8_t writePattern1[bufferSize];
    uint8_t writePattern2[bufferSize];
    std::fill(writePattern1, writePattern1 + sizeof(writePattern1), 1);
    std::fill(writePattern2, writePattern2 + sizeof(writePattern2), 2); // distinct from writePattern1
    auto buffer = std::unique_ptr<Buffer>(Buffer::create(&context, CL_MEM_COPY_HOST_PTR, bufferSize, initialMemory, retVal));
    //make sure that GPU copy is used
    buffer->forceDisallowCPUCopy = true;
    cl_event outEvent1, outEvent2;
    pCmdQ->enqueueWriteBuffer(buffer.get(), CL_FALSE, 0, bufferSize, writePattern1, nullptr, 0, nullptr, &outEvent1);
    auto node1 = castToObject<Event>(outEvent1)->getTimestampPacketNodes()->peekNodes().at(0);
    node1->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation()->setAubWritable(true, 0xffffffff); // allow to write again after Buffer::create
    pCmdQ->enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, writePattern2, nullptr, 0, nullptr, &outEvent2);
    auto node2 = castToObject<Event>(outEvent2)->getTimestampPacketNodes()->peekNodes().at(0);
    // Blocking second write completed: buffer must now hold writePattern2.
    expectMemory<FamilyType>(reinterpret_cast<void *>(buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), writePattern2, bufferSize);
    // All four end timestamps must differ from the initial value 1.
    typename FamilyType::TimestampPacketType expectedEndTimestamp = 1;
    auto endTimestampAddress1 = TimestampPacketHelper::getContextEndGpuAddress(*node1);
    auto endTimestampAddress2 = TimestampPacketHelper::getGlobalEndGpuAddress(*node1);
    auto endTimestampAddress3 = TimestampPacketHelper::getContextEndGpuAddress(*node2);
    auto endTimestampAddress4 = TimestampPacketHelper::getGlobalEndGpuAddress(*node2);
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress1), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress2), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress3), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress4), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    clReleaseEvent(outEvent1);
    clReleaseEvent(outEvent2);
}
// A write whose offset/size is not walker-aligned is split into two walkers;
// both timestamp packet nodes must have their context and global end
// timestamps overwritten (i.e. no longer equal to the initial value 1).
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenMultipleWalkersWhenEnqueueingThenWriteAllTimestamps) {
    MockContext context(pCmdQ->getDevice().getSpecializedDevice<ClDevice>());
    const size_t bufferSize = 70;
    const size_t writeSize = bufferSize - 2;
    uint8_t writeData[writeSize] = {};
    cl_int retVal = CL_SUCCESS;
    cl_event outEvent;

    auto buffer = std::unique_ptr<Buffer>(Buffer::create(&context, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
    buffer->forceDisallowCPUCopy = true; // make sure the GPU copy path runs

    // Offset 1 with a 68-byte write forces the split into two walkers.
    pCmdQ->enqueueWriteBuffer(buffer.get(), CL_TRUE, 1, writeSize, writeData, nullptr, 0, nullptr, &outEvent);

    auto &timestampNodes = castToObject<Event>(outEvent)->getTimestampPacketNodes()->peekNodes();
    EXPECT_EQ(2u, timestampNodes.size());

    typename FamilyType::TimestampPacketType initialTimestamp = 1;
    const uint64_t endAddresses[] = {
        TimestampPacketHelper::getContextEndGpuAddress(*timestampNodes[0]),
        TimestampPacketHelper::getGlobalEndGpuAddress(*timestampNodes[0]),
        TimestampPacketHelper::getContextEndGpuAddress(*timestampNodes[1]),
        TimestampPacketHelper::getGlobalEndGpuAddress(*timestampNodes[1])};
    for (auto endAddress : endAddresses) {
        expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endAddress), &initialTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    }
    clReleaseEvent(outEvent);
}

View File

@@ -0,0 +1,327 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/array_count.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_scratch_space_controller_xehp_and_later.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/test/unit_test/aub_tests/command_stream/aub_command_stream_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h"
#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
#include "opencl/test/unit_test/indirect_heap/indirect_heap_fixture.h"
using namespace NEO;
// Fixture for the scratch-space-for-private-memory test: selects kernel 6,
// prepares src/dst SVM buffers and precomputes the expected output the kernel
// is supposed to produce.
struct Gen12AubScratchSpaceForPrivateFixture : public KernelAUBFixture<SimpleKernelFixture> {
void SetUp() override {
debugRestorer = std::make_unique<DebugManagerStateRestore>();
// Enable kernel index 6 in the fixture's kernel mask before base SetUp builds it.
kernelIdx = 6;
kernelIds |= (1 << kernelIdx);
KernelAUBFixture<SimpleKernelFixture>::SetUp();
arraySize = 32;
vectorSize = 2;
typeSize = sizeof(uint32_t);
gwsSize = arraySize;
lwsSize = 32;
maxIterations1 = static_cast<uint32_t>(arraySize);
maxIterations2 = static_cast<uint32_t>(arraySize);
scalar = 0x4;
expectedMemorySize = arraySize * vectorSize * typeSize;
// Source buffer: alternating valEven/valOdd per element (even/odd index).
srcBuffer = alignedMalloc(expectedMemorySize, 0x1000);
ASSERT_NE(nullptr, srcBuffer);
auto srcBufferUint = static_cast<uint32_t *>(srcBuffer);
uint32_t valOdd = 0x1;
uint32_t valEven = 0x3;
for (uint32_t i = 0; i < arraySize * vectorSize; ++i) {
if (i % 2) {
srcBufferUint[i] = valOdd;
} else {
srcBufferUint[i] = valEven;
}
}
// Precompute the sums the kernel is expected to produce; the odd lanes
// presumably also accumulate the scalar argument — see the
// simple_spill_fill kernel source for the exact formula.
uint32_t sumOdd = 0;
uint32_t sumEven = 0;
for (uint32_t i = 0; i < arraySize; ++i) {
sumOdd += ((i + scalar) + valOdd);
sumEven += (i + valEven);
}
dstBuffer = alignedMalloc(expectedMemorySize, 0x1000);
ASSERT_NE(nullptr, dstBuffer);
memset(dstBuffer, 0, expectedMemorySize);
// Expected output: sumOdd in odd lanes, sumEven in even lanes.
expectedMemory = alignedMalloc(expectedMemorySize, 0x1000);
ASSERT_NE(nullptr, expectedMemory);
auto expectedMemoryUint = static_cast<uint32_t *>(expectedMemory);
for (uint32_t i = 0; i < arraySize * vectorSize; ++i) {
if (i % 2) {
expectedMemoryUint[i] = sumOdd;
} else {
expectedMemoryUint[i] = sumEven;
}
}
// Bind SVM pointers and scalar arguments to the kernel under test.
kernels[kernelIdx]->setArgSvm(0, expectedMemorySize, dstBuffer, nullptr, 0u);
dstAllocation = createHostPtrAllocationFromSvmPtr(dstBuffer, expectedMemorySize);
kernels[kernelIdx]->setArgSvm(1, expectedMemorySize, srcBuffer, nullptr, 0u);
srcAllocation = createHostPtrAllocationFromSvmPtr(srcBuffer, expectedMemorySize);
kernels[kernelIdx]->setArg(2, sizeof(uint32_t), &scalar);
kernels[kernelIdx]->setArg(3, sizeof(uint32_t), &maxIterations1);
kernels[kernelIdx]->setArg(4, sizeof(uint32_t), &maxIterations2);
}
void TearDown() override {
// Flush before freeing the host allocations the queue may still reference.
pCmdQ->flush();
if (expectedMemory) {
alignedFree(expectedMemory);
expectedMemory = nullptr;
}
if (srcBuffer) {
alignedFree(srcBuffer);
srcBuffer = nullptr;
}
if (dstBuffer) {
alignedFree(dstBuffer);
dstBuffer = nullptr;
}
KernelAUBFixture<SimpleKernelFixture>::TearDown();
}
std::unique_ptr<DebugManagerStateRestore> debugRestorer;
size_t arraySize;   // elements per vector lane
size_t vectorSize;  // lanes per element (uint2)
size_t typeSize;    // bytes per scalar element
size_t gwsSize;
size_t lwsSize;
uint32_t kernelIdx; // index of the kernel under test in the fixture's kernel set
void *expectedMemory = nullptr; // host-side golden output
size_t expectedMemorySize = 0;
void *srcBuffer = nullptr; // SVM input
void *dstBuffer = nullptr; // SVM output
GraphicsAllocation *srcAllocation;
GraphicsAllocation *dstAllocation;
uint32_t scalar;
uint32_t maxIterations1;
uint32_t maxIterations2;
};
using Gen12AubScratchSpaceForPrivateTest = Test<Gen12AubScratchSpaceForPrivateFixture>;

// Dispatches the private-scratch kernel over a 1D NDRange and verifies the
// AUB-captured destination buffer against the precomputed host expectation.
HWCMDTEST_F(IGFX_XE_HP_CORE, Gen12AubScratchSpaceForPrivateTest, WhenKernelUsesScratchSpaceForPrivateThenExpectCorrectResults) {
    const cl_uint dimensions = 1;
    const size_t offset[3] = {0, 0, 0};
    const size_t gws[3] = {gwsSize, 1, 1};
    const size_t lws[3] = {lwsSize, 1, 1};

    // No wait list and no output event: submission order alone is sufficient here.
    auto status = pCmdQ->enqueueKernel(kernels[kernelIdx].get(),
                                       dimensions,
                                       offset,
                                       gws,
                                       lws,
                                       0,
                                       nullptr,
                                       nullptr);
    ASSERT_EQ(CL_SUCCESS, status);

    pCmdQ->flush();
    expectMemory<FamilyType>(dstBuffer, expectedMemory, expectedMemorySize);
}
// Builds the "simple_spill_fill_kernel" program and creates its "spill_test"
// kernel; used by the spill/fill scratch-space tests below.
class DefaultGrfKernelFixture : public ProgramFixture {
  public:
    using ProgramFixture::SetUp;

  protected:
    void SetUp(ClDevice *device, Context *context) {
        ProgramFixture::SetUp();
        std::string programName("simple_spill_fill_kernel");
        CreateProgramFromBinary(
            context,
            context->getDevices(),
            programName);
        ASSERT_NE(nullptr, pProgram);
        retVal = pProgram->build(
            pProgram->getDevices(),
            nullptr,
            false);
        ASSERT_EQ(CL_SUCCESS, retVal);
        // Create the kernel under test; a creation failure lands in retVal.
        kernel.reset(Kernel::create<MockKernel>(
            pProgram,
            pProgram->getKernelInfoForKernel("spill_test"),
            *device,
            &retVal));
    }

    void TearDown() override {
        // Release the kernel before the program is torn down by the base fixture.
        if (kernel) {
            kernel.reset(nullptr);
        }
        ProgramFixture::TearDown();
    }

    cl_int retVal = CL_SUCCESS;
    std::unique_ptr<Kernel> kernel; // the "spill_test" kernel instance
};
// Fixture for the spill/fill scratch-space AUB test: allocates in/out/offset
// SVM buffers for the "spill_test" kernel and precomputes the expected output.
struct Gen12AubScratchSpaceForSpillFillFixture : public KernelAUBFixture<DefaultGrfKernelFixture> {
    void SetUp() override {
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        KernelAUBFixture<DefaultGrfKernelFixture>::SetUp();
        // 1D dispatch geometry: one work-item per array element, one work-group.
        arraySize = 32;
        typeSize = sizeof(cl_int);
        gwsSize = arraySize;
        lwsSize = 32;
        // Buffer sizes match the kernel's access pattern -- TODO confirm the
        // "(2n + 1) * 4 - 4" / "128 * n" shapes against the CL source.
        expectedMemorySize = (arraySize * 2 + 1) * typeSize - 4;
        inMemorySize = expectedMemorySize;
        outMemorySize = expectedMemorySize;
        offsetMemorySize = 128 * arraySize;
        srcBuffer = alignedMalloc(inMemorySize, 0x1000);
        ASSERT_NE(nullptr, srcBuffer);
        memset(srcBuffer, 0, inMemorySize);
        outBuffer = alignedMalloc(outMemorySize, 0x1000);
        ASSERT_NE(nullptr, outBuffer);
        memset(outBuffer, 0, outMemorySize);
        expectedMemory = alignedMalloc(expectedMemorySize, 0x1000);
        ASSERT_NE(nullptr, expectedMemory);
        memset(expectedMemory, 0, expectedMemorySize);
        offsetBuffer = alignedMalloc(offsetMemorySize, 0x1000);
        // Fixed: previously re-asserted expectedMemory here (copy-paste), so a
        // failed offsetBuffer allocation would crash in the memset below.
        ASSERT_NE(nullptr, offsetBuffer);
        memset(offsetBuffer, 0, offsetMemorySize);
        // Input: every element is 2; expected output alternates two constants
        // per element pair -- values mirror the kernel's arithmetic, TODO confirm.
        auto srcBufferInt = static_cast<cl_int *>(srcBuffer);
        auto expectedMemoryInt = static_cast<cl_int *>(expectedMemory);
        const int expectedVal1 = 16256;
        const int expectedVal2 = 512;
        for (uint32_t i = 0; i < arraySize; ++i) {
            srcBufferInt[i] = 2;
            expectedMemoryInt[i * 2] = expectedVal1;
            expectedMemoryInt[i * 2 + 1] = expectedVal2;
        }
        // Sanity: the kernel must actually require scratch (spill/fill) and be
        // compiled for 128 GRFs, otherwise this test exercises nothing.
        auto &kernelInfo = kernel->getKernelInfo();
        EXPECT_NE(0u, kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0]);
        EXPECT_EQ(128u, kernelInfo.kernelDescriptor.kernelAttributes.numGrfRequired);
        // Bind SVM arguments; explicit host-ptr allocations let the AUB stream
        // resolve and dump the buffers.
        kernel->setArgSvm(0, inMemorySize, srcBuffer, nullptr, 0u);
        inAllocation = createHostPtrAllocationFromSvmPtr(srcBuffer, inMemorySize);
        kernel->setArgSvm(1, outMemorySize, outBuffer, nullptr, 0u);
        outAllocation = createHostPtrAllocationFromSvmPtr(outBuffer, outMemorySize);
        kernel->setArgSvm(2, offsetMemorySize, offsetBuffer, nullptr, 0u);
        offsetAllocation = createHostPtrAllocationFromSvmPtr(offsetBuffer, offsetMemorySize);
    }
    void TearDown() override {
        // Flush any outstanding GPU work before freeing the host allocations.
        pCmdQ->flush();
        if (expectedMemory) {
            alignedFree(expectedMemory);
            expectedMemory = nullptr;
        }
        if (srcBuffer) {
            alignedFree(srcBuffer);
            srcBuffer = nullptr;
        }
        if (outBuffer) {
            alignedFree(outBuffer);
            outBuffer = nullptr;
        }
        if (offsetBuffer) {
            alignedFree(offsetBuffer);
            offsetBuffer = nullptr;
        }
        KernelAUBFixture<DefaultGrfKernelFixture>::TearDown();
    }
    std::unique_ptr<DebugManagerStateRestore> debugRestorer;
    size_t arraySize;  // number of work-items
    size_t vectorSize; // unused by this fixture; kept for layout parity with the private-scratch fixture
    size_t typeSize;   // element size in bytes
    size_t gwsSize;    // 1D global work size
    size_t lwsSize;    // 1D local work size
    void *expectedMemory = nullptr; // host-side reference output
    size_t expectedMemorySize = 0;
    size_t inMemorySize = 0;
    size_t outMemorySize = 0;
    size_t offsetMemorySize = 0;
    void *srcBuffer = nullptr;
    void *outBuffer = nullptr;
    void *offsetBuffer = nullptr;
    GraphicsAllocation *inAllocation;
    GraphicsAllocation *outAllocation;
    GraphicsAllocation *offsetAllocation;
};
using Gen12AubScratchSpaceForSpillFillTest = Test<Gen12AubScratchSpaceForSpillFillFixture>;

// Runs the spill/fill kernel with surface-state scratch space enabled and
// validates the AUB-captured output buffer against the host expectation.
HWCMDTEST_F(IGFX_XE_HP_CORE, Gen12AubScratchSpaceForSpillFillTest, givenSurfaceStateScratchSpaceEnabledWhenKernelUsesScratchForSpillFillThenExpectCorrectResults) {
    const cl_uint dimensions = 1;
    const size_t offset[3] = {0, 0, 0};
    const size_t gws[3] = {gwsSize, 1, 1};
    const size_t lws[3] = {lwsSize, 1, 1};

    // No wait list and no output event: submission order alone is sufficient here.
    auto status = pCmdQ->enqueueKernel(kernel.get(),
                                       dimensions,
                                       offset,
                                       gws,
                                       lws,
                                       0,
                                       nullptr,
                                       nullptr);
    ASSERT_EQ(CL_SUCCESS, status);

    // Full finish (not just flush) before checking the output memory.
    pCmdQ->finish();
    expectMemory<FamilyType>(outBuffer, expectedMemory, expectedMemorySize);
}

View File

@@ -0,0 +1,325 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/gmm_helper/gmm.h"
#include "shared/source/gmm_helper/resource_info.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/common/test_macros/test_checks_shared.h"
#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/helpers/cl_memory_properties_helpers.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/platform/platform.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "test_traits_common.h"
using namespace NEO;
// Engine-parameterized fixture for compression AUB tests. useLocalMemory
// selects whether allocations are placed in device-local or system memory;
// the GTest parameter selects the engine (e.g. RCS/CCS) used for submission.
template <bool useLocalMemory = true>
struct CompressionXeHPAndLater : public AUBFixture,
                                 public ::testing::Test,
                                 public ::testing::WithParamInterface<uint32_t /*EngineType*/> {
    void SetUp() override {
        REQUIRE_64BIT_OR_SKIP();
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        DebugManager.flags.RenderCompressedBuffersEnabled.set(true);
        DebugManager.flags.RenderCompressedImagesEnabled.set(true);
        DebugManager.flags.EnableLocalMemory.set(useLocalMemory);
        // NodeOrdinal routes submissions to the parameterized engine.
        DebugManager.flags.NodeOrdinal.set(GetParam());
        auto &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily);
        auto expectedEngine = static_cast<aub_stream::EngineType>(GetParam());
        // Skip if the requested engine is not exposed by this device config.
        bool engineSupported = false;
        for (auto &engine : hwHelper.getGpgpuEngineInstances(*defaultHwInfo)) {
            if (engine.first == expectedEngine) {
                engineSupported = true;
                break;
            }
        }
        if (!engineSupported) {
            GTEST_SKIP();
        }
        AUBFixture::SetUp(defaultHwInfo.get());
        // Flat physical CCS is required for these compression tests; local-memory
        // variants additionally require the local-memory feature.
        auto &ftrTable = device->getHardwareInfo().featureTable;
        if ((!ftrTable.flags.ftrFlatPhysCCS) ||
            (!ftrTable.flags.ftrLocalMemory && useLocalMemory)) {
            GTEST_SKIP();
        }
        context->contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
    }
    void TearDown() override {
        AUBFixture::TearDown();
    }
    std::unique_ptr<DebugManagerStateRestore> debugRestorer;
    cl_int retVal = CL_SUCCESS;
    // Test bodies are member templates so the HWTEST2_P stubs below can
    // forward FamilyType from the macro into the implementation.
    template <typename FamilyType>
    void givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect();
    template <typename FamilyType>
    void givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect();
    template <typename FamilyType>
    void givenCompressedImageWhenReadingThenResultsAreCorrect();
};
// Writes a known pattern into a compressed buffer, copies it into an
// uncompressed one, then checks that the compressed backing store does NOT
// match the raw pattern (i.e. it is actually compressed) while the
// uncompressed copy does.
template <bool testLocalMemory>
template <typename FamilyType>
void CompressionXeHPAndLater<testLocalMemory>::givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect() {
    constexpr size_t bufferSize = 2048;
    uint8_t referencePattern[bufferSize];
    std::fill(referencePattern, referencePattern + sizeof(referencePattern), 1);

    device->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);

    std::unique_ptr<Buffer> compressedBuffer(Buffer::create(context, CL_MEM_READ_WRITE | CL_MEM_COMPRESSED_HINT_INTEL, bufferSize, nullptr, retVal));
    auto compressedAllocation = compressedBuffer->getGraphicsAllocation(device->getRootDeviceIndex());
    memset(compressedAllocation->getUnderlyingBuffer(), 0, bufferSize);

    // Sanity: the allocation must really be compression-enabled and placed in
    // the memory pool selected by the fixture's template parameter.
    EXPECT_NE(nullptr, compressedAllocation->getDefaultGmm()->gmmResourceInfo->peekHandle());
    EXPECT_TRUE(compressedAllocation->getDefaultGmm()->isCompressionEnabled);
    const auto expectedPool = testLocalMemory ? MemoryPool::LocalMemory : MemoryPool::System4KBPages;
    EXPECT_EQ(expectedPool, compressedAllocation->getMemoryPool());

    std::unique_ptr<Buffer> notCompressedBuffer(Buffer::create(context, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
    auto plainAllocation = notCompressedBuffer->getGraphicsAllocation(device->getRootDeviceIndex());
    plainAllocation->setAllocationType(GraphicsAllocation::AllocationType::BUFFER);
    if (auto gmm = plainAllocation->getDefaultGmm()) {
        gmm->isCompressionEnabled = false;
    }
    memset(plainAllocation->getUnderlyingBuffer(), 0, bufferSize);

    pCmdQ->enqueueWriteBuffer(compressedBuffer.get(), CL_FALSE, 0, bufferSize, referencePattern, nullptr, 0, nullptr, nullptr);
    pCmdQ->enqueueCopyBuffer(compressedBuffer.get(), notCompressedBuffer.get(), 0, 0, bufferSize, 0, nullptr, nullptr);
    pCmdQ->finish();

    // Compressed storage must differ from the raw pattern; the plain copy must match it.
    expectNotEqualMemory<FamilyType>(AUBFixture::getGpuPointer(compressedAllocation),
                                     referencePattern, bufferSize);
    expectMemory<FamilyType>(AUBFixture::getGpuPointer(plainAllocation),
                             referencePattern, bufferSize);
}
// Creates a 2D image on top of a compressed buffer, reads it back, and checks
// that the read data matches while the buffer's backing store stays compressed.
template <bool testLocalMemory>
template <typename FamilyType>
void CompressionXeHPAndLater<testLocalMemory>::givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect() {
    const size_t imageWidth = 16;
    const size_t imageHeight = 16;
    const size_t bufferSize = 64 * KB;
    uint8_t writePattern[bufferSize];
    std::fill(writePattern, writePattern + sizeof(writePattern), 1);
    device->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
    auto compressedBuffer = std::unique_ptr<Buffer>(Buffer::create(context, CL_MEM_COPY_HOST_PTR | CL_MEM_COMPRESSED_HINT_INTEL, bufferSize, writePattern, retVal));
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Now create the image2D on top of the compressed buffer.
    cl_image_desc imageDescriptor = {};
    imageDescriptor.mem_object = compressedBuffer.get();
    // NOTE(review): height is assigned from imageWidth and width from
    // imageHeight; harmless while both are 16, but the assignments look
    // swapped -- confirm intent before changing either dimension.
    imageDescriptor.image_height = imageWidth;
    imageDescriptor.image_width = imageHeight;
    imageDescriptor.image_type = CL_MEM_OBJECT_IMAGE2D;
    cl_image_format imageFormat = {};
    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;
    imageFormat.image_channel_order = CL_RGBA;
    auto clCompressedImage = clCreateImage(context, CL_MEM_READ_WRITE, &imageFormat, &imageDescriptor, nullptr, &retVal);
    auto compressedImage = castToObject<Image>(clCompressedImage);
    EXPECT_EQ(CL_SUCCESS, retVal);
    const size_t perChannelDataSize = sizeof(cl_uint);
    const size_t numChannels = 4;
    const auto imageSize = imageWidth * imageHeight * perChannelDataSize * numChannels;
    cl_uint destMemory[imageSize / sizeof(cl_uint)] = {0};
    const size_t origin[] = {0, 0, 0};
    const size_t region[] = {imageWidth, imageHeight, 1};
    // Read the whole image back (non-blocking; flushed below).
    retVal = pCmdQ->enqueueReadImage(
        compressedImage,
        CL_FALSE,
        origin,
        region,
        0,
        0,
        destMemory,
        nullptr,
        0,
        nullptr,
        nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = pCmdQ->flush();
    EXPECT_EQ(CL_SUCCESS, retVal);
    expectMemory<FamilyType>(destMemory, writePattern, imageSize);
    // Make sure our objects are in fact compressed.
    auto graphicsAllocation = compressedBuffer->getGraphicsAllocation(device->getRootDeviceIndex());
    EXPECT_NE(nullptr, graphicsAllocation->getDefaultGmm());
    EXPECT_TRUE(graphicsAllocation->getDefaultGmm()->isCompressionEnabled);
    EXPECT_TRUE(compressedImage->getGraphicsAllocation(device->getRootDeviceIndex())->getDefaultGmm()->isCompressionEnabled);
    // The raw backing store must differ from the pattern, proving compression happened.
    expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(graphicsAllocation->getGpuAddress()), writePattern, bufferSize);
    clReleaseMemObject(clCompressedImage);
}
// Creates a compressed host-ptr image, reads it back into a temporary
// allocation, and verifies the read data matches the source while the image's
// backing store stays compressed.
template <bool testLocalMemory>
template <typename FamilyType>
void CompressionXeHPAndLater<testLocalMemory>::givenCompressedImageWhenReadingThenResultsAreCorrect() {
    const size_t imageWidth = 8;
    const size_t imageHeight = 4;
    const size_t perChannelDataSize = sizeof(cl_float);
    const size_t numChannels = 4;
    const auto imageSize = imageWidth * imageHeight * perChannelDataSize * numChannels;
    const auto rowSize = imageSize / imageHeight;
    cl_float srcMemory[imageSize / sizeof(cl_float)] = {0};
    const cl_float row[rowSize] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                                   1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                                   1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                                   1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
    // NOTE(review): each memcpy copies rowSize bytes (imageWidth * 4 floats)
    // but `pixel` advances by only imageWidth floats, so consecutive copies
    // overlap; the 8-float repeating pattern makes the overlapping writes
    // identical and the tail of srcMemory stays zero -- confirm this is the
    // intended fill rather than `pixel += imageWidth * numChannels`.
    cl_float *pixel = srcMemory;
    for (uint32_t height = 0; height < imageHeight; height++) {
        memcpy(pixel, row, rowSize);
        pixel += imageWidth;
    }
    cl_float destMemory[imageSize / sizeof(cl_float)] = {0};
    cl_image_format imageFormat;
    cl_image_desc imageDesc;
    imageFormat.image_channel_data_type = CL_FLOAT;
    imageFormat.image_channel_order = CL_RGBA;
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
    imageDesc.image_width = imageWidth;
    imageDesc.image_height = imageHeight;
    imageDesc.image_depth = 1;
    imageDesc.image_array_size = 1;
    imageDesc.image_row_pitch = 0;
    imageDesc.image_slice_pitch = 0;
    imageDesc.num_mip_levels = 0;
    imageDesc.num_samples = 0;
    imageDesc.mem_object = NULL;
    // Pre-create a host-ptr allocation for destMemory and hand it to the CSR's
    // temporary-allocation storage so its GPU address can be located below.
    auto allocation = csr->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), false, imageSize}, destMemory);
    csr->makeResidentHostPtrAllocation(allocation);
    csr->getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), TEMPORARY_ALLOCATION);
    cl_mem_flags flags = CL_MEM_USE_HOST_PTR;
    auto surfaceFormat = Image::getSurfaceFormatFromTable(flags, &imageFormat, context->getDevice(0)->getHardwareInfo().capabilityTable.supportsOcl21Features);
    auto retVal = CL_INVALID_VALUE;
    std::unique_ptr<Image> srcImage(Image::create(
        context,
        ClMemoryPropertiesHelper::createMemoryProperties(flags, 0, 0, &context->getDevice(0)->getDevice()),
        flags,
        0,
        surfaceFormat,
        &imageDesc,
        srcMemory,
        retVal));
    ASSERT_NE(nullptr, srcImage);
    cl_bool blockingRead = CL_FALSE;
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;
    const size_t origin[] = {0, 0, 0};
    const size_t region[] = {imageWidth, imageHeight, 1};
    retVal = pCmdQ->enqueueReadImage(
        srcImage.get(),
        blockingRead,
        origin,
        region,
        0,
        0,
        destMemory,
        nullptr,
        numEventsInWaitList,
        eventWaitList,
        event);
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Find the temporary allocation backing destMemory to obtain its GPU address.
    allocation = csr->getTemporaryAllocations().peekHead();
    while (allocation && allocation->getUnderlyingBuffer() != destMemory) {
        allocation = allocation->next;
    }
    auto pDestGpuAddress = reinterpret_cast<void *>(allocation->getGpuAddress());
    pCmdQ->flush();
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Read-back must match the source; the image's backing store must not
    // (proving it is stored compressed).
    expectMemory<FamilyType>(pDestGpuAddress, srcMemory, imageSize);
    expectNotEqualMemory<FamilyType>(AUBFixture::getGpuPointer(srcImage->getGraphicsAllocation(rootDeviceIndex)), srcMemory, imageSize);
}
// HWTEST2 matcher: enables a test only on XE_HP-and-later products whose
// TestTraits report local-memory compression support in AUB captures.
struct CompressionLocalAubsSupportedMatcher {
    template <PRODUCT_FAMILY productFamily>
    static constexpr bool isMatched() {
        if constexpr (!HwMapper<productFamily>::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) {
            return false;
        } else {
            // Trait lookup only compiles under the guard above.
            return TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::localMemCompressionAubsSupported;
        }
    }
};
// HWTEST2 matcher: enables a test only on XE_HP-and-later products whose
// TestTraits report system-memory compression support in AUB captures.
struct CompressionSystemAubsSupportedMatcher {
    template <PRODUCT_FAMILY productFamily>
    static constexpr bool isMatched() {
        if constexpr (!HwMapper<productFamily>::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) {
            return false;
        } else {
            // Trait lookup only compiles under the guard above.
            return TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::systemMemCompressionAubsSupported;
        }
    }
};
// Local-memory variants: each HWTEST2_P stub forwards FamilyType into the
// fixture's member-template test body.
using CompressionLocalXeHPAndLater = CompressionXeHPAndLater<true>;
HWTEST2_P(CompressionLocalXeHPAndLater, givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect, CompressionLocalAubsSupportedMatcher) {
    givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect<FamilyType>();
}
HWTEST2_P(CompressionLocalXeHPAndLater, givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect, CompressionLocalAubsSupportedMatcher) {
    givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect<FamilyType>();
}
HWTEST2_P(CompressionLocalXeHPAndLater, givenCompressedImageWhenReadingThenResultsAreCorrect, CompressionLocalAubsSupportedMatcher) {
    givenCompressedImageWhenReadingThenResultsAreCorrect<FamilyType>();
}
// Run each test on both the render (RCS) and compute (CCS) engines.
INSTANTIATE_TEST_CASE_P(,
                        CompressionLocalXeHPAndLater,
                        ::testing::Values(aub_stream::ENGINE_RCS,
                                          aub_stream::ENGINE_CCS));
// System-memory variants. The GENERATEONLY_ prefix presumably marks tests that
// only generate AUB output without simulator validation -- confirm against the
// test-runner conventions.
using CompressionSystemXeHPAndLater = CompressionXeHPAndLater<false>;
HWTEST2_P(CompressionSystemXeHPAndLater, GENERATEONLY_givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect, CompressionSystemAubsSupportedMatcher) {
    givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect<FamilyType>();
}
HWTEST2_P(CompressionSystemXeHPAndLater, GENERATEONLY_givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect, CompressionSystemAubsSupportedMatcher) {
    givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect<FamilyType>();
}
HWTEST2_P(CompressionSystemXeHPAndLater, givenCompressedImageWhenReadingThenResultsAreCorrect, CompressionSystemAubsSupportedMatcher) {
    givenCompressedImageWhenReadingThenResultsAreCorrect<FamilyType>();
}
// Run each test on both the render (RCS) and compute (CCS) engines.
INSTANTIATE_TEST_CASE_P(,
                        CompressionSystemXeHPAndLater,
                        ::testing::Values(aub_stream::ENGINE_RCS,
                                          aub_stream::ENGINE_CCS));

View File

@@ -0,0 +1,78 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/constants.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/helpers/cl_memory_properties_helpers.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/multicontext_aub_fixture.h"
#include <array>
// Two-tile fixture (single command streamer per tile) for cross-tile buffer
// copy tests.
struct MultiTileBuffersXeHPAndLater : public MulticontextAubFixture, public ::testing::Test {
    static constexpr uint32_t numTiles = 2;
    void SetUp() override {
        MulticontextAubFixture::SetUp(numTiles, EnabledCommandStreamers::Single, false);
    }
    void TearDown() override {
        MulticontextAubFixture::TearDown();
    }
};
// Allocates one buffer per tile (by overriding the memory-properties device),
// copies between them, and validates both initial contents and the copy result.
HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileBuffersXeHPAndLater, givenTwoBuffersAllocatedOnDifferentTilesWhenCopiedThenDataValidates) {
    if constexpr (is64bit) {
        constexpr size_t bufferSize = 64 * 1024u;
        char bufferTile0Memory[bufferSize] = {};
        char bufferTile1Memory[bufferSize] = {};
        // Both host buffers hold the same repeating 0..254 pattern.
        for (auto index = 0u; index < bufferSize; index++) {
            bufferTile0Memory[index] = index % 255;
            bufferTile1Memory[index] = index % 255;
        }
        auto retVal = CL_INVALID_VALUE;
        cl_mem_flags flags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
        // Overriding pDevice steers each allocation to a specific sub-device;
        // indices 1 and 2 are presumably the two tile sub-devices -- confirm
        // against the multicontext fixture's device layout.
        MemoryProperties memoryProperties =
            ClMemoryPropertiesHelper::createMemoryProperties(flags, 0, 0, &context->getDevice(0)->getDevice());
        memoryProperties.pDevice = &context->getDevice(1)->getDevice();
        auto srcBuffer = std::unique_ptr<Buffer>(Buffer::create(context.get(), memoryProperties, flags, 0, bufferSize, bufferTile0Memory, retVal));
        ASSERT_NE(nullptr, srcBuffer);
        flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
        memoryProperties.pDevice = &context->getDevice(2)->getDevice();
        auto dstBuffer = std::unique_ptr<Buffer>(Buffer::create(context.get(), memoryProperties, flags, 0, bufferSize, bufferTile1Memory, retVal));
        ASSERT_NE(nullptr, dstBuffer);
        auto cmdQ = commandQueues[0][0].get();
        // Both buffers must hold their host-initialized contents before the copy.
        expectMemory<FamilyType>(AUBFixture::getGpuPointer(srcBuffer->getGraphicsAllocation(rootDeviceIndex)), bufferTile0Memory, bufferSize, 0, 0);
        expectMemory<FamilyType>(AUBFixture::getGpuPointer(dstBuffer->getGraphicsAllocation(rootDeviceIndex)), bufferTile1Memory, bufferSize, 0, 0);
        cl_uint numEventsInWaitList = 0;
        cl_event *eventWaitList = nullptr;
        cl_event *event = nullptr;
        retVal = cmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer.get(),
                                         0, 0,
                                         bufferSize, numEventsInWaitList,
                                         eventWaitList, event);
        EXPECT_EQ(CL_SUCCESS, retVal);
        cmdQ->flush();
        // After the copy, the destination must match the source tile's data.
        expectMemory<FamilyType>(AUBFixture::getGpuPointer(dstBuffer->getGraphicsAllocation(rootDeviceIndex)), bufferTile0Memory, bufferSize, 0, 0);
    }
}

View File

@@ -0,0 +1,32 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/test/common/test_macros/test.h"
// Exclusion list: the multi-tile / multicontext AUB tests below are disabled
// on IGFX_XE_HPG_CORE products (single-tile configurations).
HWTEST_EXCLUDE_PRODUCT(FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesDualContextTest, HEAVY_givenFourTilesAndDualContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithAtomicThenOutputDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithoutAtomicThenOutputDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesAllContextsTest, HEAVY_givenTwoTilesAndAllContextsWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesAllContextsTest, GENERATEONLY_givenTwoTilesAndAllContextsWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesDualContextTest, GENERATEONLY_givenFourTilesAndDualContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueReadBufferThenReadFromCorrectBank, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(OneVAFourPhysicalStoragesTest, givenColouredBufferWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(MultiTileBuffersXeHPAndLater, givenTwoBuffersAllocatedOnDifferentTilesWhenCopiedThenDataValidates, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(StaticWalkerPartitionFourTilesTests, givenFourTilesWhenStaticWalkerPartitionIsEnabledForKernelThenOutputDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(StaticWalkerPartitionFourTilesTests, givenPreWalkerSyncWhenStaticWalkerPartitionIsThenAtomicsAreIncrementedCorrectly, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(StaticWalkerPartitionFourTilesTests, whenNoPreWalkerSyncThenAtomicsAreIncrementedCorrectly, IGFX_XE_HPG_CORE);
// Single-tile multicontext tests are also excluded on XE_HPG.
HWTEST_EXCLUDE_PRODUCT(SingleTileAllContextsTest, HEAVY_givenSingleTileAndAllContextsWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(SingleTileAllContextsTest, GENERATEONLY_givenSingleTileAndAllContextsWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);