Add command queue aub tests

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2022-01-24 16:37:55 +00:00
committed by Compute-Runtime-Automation
parent 010186d0da
commit 43e147d84f
10 changed files with 2269 additions and 1 deletion

View File

@@ -1,5 +1,5 @@
#
# Copyright (C) 2018-2021 Intel Corporation
# Copyright (C) 2018-2022 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
@@ -27,6 +27,20 @@ target_sources(igdrcl_aub_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_copy_read_buffer_aub_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_copy_read_buffer_aub_tests.h
${CMAKE_CURRENT_SOURCE_DIR}/enqueue_write_image_aub_tests.cpp
${CMAKE_CURRENT_SOURCE_DIR}/single_tile_products_excludes.cpp
)
if(TESTS_XEHP_AND_LATER)
# AUB tests compiled only when XeHP-and-later product testing is enabled
target_sources(igdrcl_aub_tests PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/aub_enqueue_resource_barrier_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_inline_data_local_id_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_multicontext_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_one_va_multi_physical_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_postsync_write_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/aub_scratch_space_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/compression_aub_tests_xehp_and_later.cpp
${CMAKE_CURRENT_SOURCE_DIR}/multi_tile_buffers_aub_tests_xehp_and_later.cpp
)
endif()
add_subdirectories()

View File

@@ -0,0 +1,112 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/command_queue/resource_barrier.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/hello_world_fixture.h"
#include "opencl/test/unit_test/helpers/cmd_buffer_validator.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
#include "test_traits_common.h"
using namespace NEO;
using ResourceBarrierAubTest = Test<KernelAUBFixture<SimpleKernelFixture>>;
// Matcher restricting HWTEST2_F instantiation to products that both support
// the XE_HP command set and report l3ControlSupported in their test traits.
struct L3ControlSupportedMatcher {
    template <PRODUCT_FAMILY productFamily>
    static constexpr bool isMatched() {
        if constexpr (!HwMapper<productFamily>::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) {
            return false;
        } else {
            // Discarded branch above keeps this TestTraits lookup uninstantiated
            // for products without the command set.
            return TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::l3ControlSupported;
        }
    }
};
// Verifies that enqueueResourceBarrier over two buffers emits exactly one
// L3_CONTROL flush command and that both destination buffers end up with the
// source pattern once the queue is flushed.
HWTEST2_F(ResourceBarrierAubTest, givenAllocationsWhenEnqueueResourceBarrierCalledThenL3FlushCommandWasSubmitted, L3ControlSupportedMatcher) {
    using L3_CONTROL = typename FamilyType::L3_CONTROL;

    constexpr size_t bufferSize = MemoryConstants::pageSize;
    char bufferAMemory[bufferSize];
    char bufferBMemory[bufferSize];
    memset(bufferAMemory, 1, bufferSize);
    memset(bufferBMemory, 129, bufferSize);
    auto retVal = CL_INVALID_VALUE;

    // Source holds the pattern that must land in both destinations.
    auto srcBuffer = std::unique_ptr<Buffer>(Buffer::create(context,
                                                            CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                                            bufferSize, bufferAMemory, retVal));
    ASSERT_NE(nullptr, srcBuffer);
    auto dstBuffer1 = std::unique_ptr<Buffer>(Buffer::create(context,
                                                             CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                             bufferSize, bufferBMemory, retVal));
    ASSERT_NE(nullptr, dstBuffer1);
    auto dstBuffer2 = std::unique_ptr<Buffer>(Buffer::create(context,
                                                             CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                                             bufferSize, bufferBMemory, retVal));
    ASSERT_NE(nullptr, dstBuffer2);

    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer1.get(),
                                      0, 0,
                                      bufferSize, 0,
                                      nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal); // fix: first copy status was previously overwritten unchecked
    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer2.get(),
                                      0, 0,
                                      bufferSize, 0,
                                      nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Barrier over both destination buffers must produce an L3 flush.
    cl_resource_barrier_descriptor_intel descriptor{};
    cl_resource_barrier_descriptor_intel descriptor2{};
    descriptor.mem_object = dstBuffer1.get();
    descriptor2.mem_object = dstBuffer2.get();
    const cl_resource_barrier_descriptor_intel descriptors[] = {descriptor, descriptor2};
    BarrierCommand bCmd(pCmdQ, descriptors, 2);
    auto sizeUsed = pCmdQ->getCS(0).getUsed(); // parse only commands emitted after this point
    retVal = pCmdQ->enqueueResourceBarrier(&bCmd, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal); // fix: barrier enqueue status was never validated

    LinearStream &l3FlushCmdStream = pCmdQ->getCS(0);
    std::string err;
    // Expect exactly one L3_CONTROL among the commands emitted by the barrier.
    auto cmdBuffOk = expectCmdBuff<FamilyType>(l3FlushCmdStream, sizeUsed,
                                               std::vector<MatchCmd *>{
                                                   new MatchAnyCmd(AnyNumber),
                                                   new MatchHwCmd<FamilyType, L3_CONTROL>(1),
                                                   new MatchAnyCmd(AnyNumber),
                                               },
                                               &err);
    EXPECT_TRUE(cmdBuffOk) << err;

    retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer2.get(),
                                      0, 0,
                                      bufferSize, 0,
                                      nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    pCmdQ->flush();

    expectMemory<FamilyType>(reinterpret_cast<void *>(dstBuffer1->getGraphicsAllocation(device->getRootDeviceIndex())->getGpuAddress()),
                             bufferAMemory, bufferSize);
    expectMemory<FamilyType>(reinterpret_cast<void *>(dstBuffer2->getGraphicsAllocation(device->getRootDeviceIndex())->getGpuAddress()),
                             bufferAMemory, bufferSize);
}

View File

@@ -0,0 +1,475 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/array_count.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/test/unit_test/aub_tests/command_stream/aub_command_stream_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
#include "opencl/test/unit_test/indirect_heap/indirect_heap_fixture.h"
using namespace NEO;
// Shared fixture for dispatch-thread-data AUB tests. Each slot in `variables`
// describes one kernel's destination buffer plus host-side shadow copies that
// the tests compare against with expectMemory.
struct AubDispatchThreadDataFixture : public KernelAUBFixture<SimpleKernelFixture> {
    struct TestVariables {
        Buffer *destBuffer = nullptr;
        void *destMemory = nullptr;
        size_t sizeUserMemory = 0;
        size_t sizeWrittenMemory = 0;
        size_t sizeRemainderMemory = 0;
        void *expectedMemory = nullptr;
        void *expectedRemainderMemory = nullptr;
        char *remainderDestMemory = nullptr;
        unsigned int scalarArg = 0;
        size_t typeSize = 0;
        size_t gwsSize = 0;
        size_t lwsSize = 0;
    };
    void SetUp() override {
        KernelAUBFixture<SimpleKernelFixture>::SetUp();
        variablesCount = arrayCount(variables);
        BufferDefaults::context = context;
        // Create a host-memory destination buffer for every slot a subclass
        // configured (sizeUserMemory != 0).
        for (auto &testVars : variables) {
            if (testVars.sizeUserMemory == 0) {
                continue;
            }
            testVars.destBuffer = Buffer::create(
                context,
                CL_MEM_READ_WRITE | CL_MEM_FORCE_HOST_MEMORY_INTEL,
                testVars.sizeUserMemory,
                nullptr,
                retVal);
            ASSERT_NE(nullptr, testVars.destBuffer);
            testVars.destMemory = reinterpret_cast<void *>(testVars.destBuffer->getCpuAddressForMapping());
        }
    }
    void TearDown() override {
        pCmdQ->flush();
        for (auto &testVars : variables) {
            // delete on nullptr is a no-op, so no guard is needed.
            delete testVars.destBuffer;
            testVars.destBuffer = nullptr;
            if (testVars.expectedMemory != nullptr) {
                alignedFree(testVars.expectedMemory);
                testVars.expectedMemory = nullptr;
            }
            if (testVars.expectedRemainderMemory != nullptr) {
                alignedFree(testVars.expectedRemainderMemory);
                testVars.expectedRemainderMemory = nullptr;
            }
        }
        BufferDefaults::context = nullptr;
        KernelAUBFixture<SimpleKernelFixture>::TearDown();
    }
    std::unique_ptr<DebugManagerStateRestore> debugRestorer;
    TestVariables variables[5] = {};
    size_t variablesCount;
    HardwareParse hwParser;
};
// Fixture enabling inline-data passing; configures kernel 3 (cross-thread data
// larger than one GRF) and kernel 4 (cross-thread data fitting one GRF).
struct InlineDataFixture : AubDispatchThreadDataFixture {
    void SetUp() override {
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        DebugManager.flags.EnablePassInlineData.set(true);
        initializeKernel3Variables();
        initializeKernel4Variables();
        AubDispatchThreadDataFixture::SetUp();
        setUpKernel3();
    }
    // Kernel 4: minimal 1x1 dispatch, no destination buffer.
    void initializeKernel4Variables() {
        kernelIds |= (1 << 4);
        auto &vars = variables[4];
        vars.gwsSize = 1;
        vars.lwsSize = 1;
    }
    // Kernel 3: 128 work items over a 4KB destination buffer.
    void initializeKernel3Variables() {
        kernelIds |= (1 << 3);
        auto &vars = variables[3];
        vars.sizeUserMemory = 4096;
        vars.typeSize = sizeof(unsigned int);
        vars.gwsSize = 128;
        vars.lwsSize = 32;
    }
    // Prepare kernel 3's argument and the expected/remainder shadow buffers:
    // written region becomes zeros, the rest must keep the 0xFE fill.
    void setUpKernel3() {
        auto &vars = variables[3];
        memset(vars.destMemory, 0xFE, vars.sizeUserMemory);
        kernels[3]->setArg(0, vars.destBuffer);
        vars.sizeWrittenMemory = vars.gwsSize * vars.typeSize;
        vars.expectedMemory = alignedMalloc(vars.sizeWrittenMemory, 4096);
        memset(vars.expectedMemory, 0, vars.sizeWrittenMemory);
        vars.sizeRemainderMemory = vars.sizeUserMemory - vars.sizeWrittenMemory;
        vars.expectedRemainderMemory = alignedMalloc(vars.sizeRemainderMemory, 4096);
        memcpy_s(vars.expectedRemainderMemory,
                 vars.sizeRemainderMemory,
                 vars.destMemory,
                 vars.sizeRemainderMemory);
        vars.remainderDestMemory = static_cast<char *>(vars.destMemory) + vars.sizeWrittenMemory;
    }
};
using XeHPAndLaterAubInlineDataTest = Test<InlineDataFixture>;
// Cross-thread data of kernel 4 fits a single GRF, so the walker must carry it
// entirely as inline data.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubInlineDataTest, givenCrossThreadFitIntoSingleGrfWhenInlineDataAllowedThenCopyAllCrossThreadIntoInline) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INLINE_DATA = typename FamilyType::INLINE_DATA;
    if (!HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*kernels[4])) {
        return;
    }

    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {variables[4].gwsSize, 1, 1};
    size_t localWorkSize[3] = {variables[4].lwsSize, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[4].get(), 1, globalWorkOffset,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);
    pCmdQ->flush();

    hwParser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    hwParser.findHardwareCommands<FamilyType>();
    EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*hwParser.itorWalker);
    EXPECT_EQ(1u, walker->getEmitInlineParameter());

    // Emit-local mask mirrors which local-id channels the kernel consumes.
    const auto &localId = kernels[4]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), kernels[4]->getCrossThreadData(), sizeof(INLINE_DATA)));
    // No expectMemory here: only a no-op kernel can have cross-thread data this
    // small, so the test validates COMPUTE_WALKER inline-data copying only.
}
// Kernel 3's cross-thread data exceeds one GRF: the first GRF goes inline in
// the walker, the remainder must be placed in the indirect object heap.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubInlineDataTest, givenCrossThreadSizeMoreThanSingleGrfWhenInlineDataAllowedThenCopyGrfCrossThreadToInline) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INLINE_DATA = typename FamilyType::INLINE_DATA;
    if (!HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*kernels[3])) {
        return;
    }

    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {variables[3].gwsSize, 1, 1};
    size_t localWorkSize[3] = {variables[3].lwsSize, 1, 1};
    IndirectHeap &ih = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 2048);
    auto retVal = pCmdQ->enqueueKernel(kernels[3].get(), 1, globalWorkOffset,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);
    pCmdQ->flush();

    hwParser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    hwParser.findHardwareCommands<FamilyType>();
    EXPECT_NE(hwParser.itorWalker, hwParser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*hwParser.itorWalker);
    EXPECT_EQ(1u, walker->getEmitInlineParameter());

    const auto &localId = kernels[3]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());

    char *crossThreadData = kernels[3]->getCrossThreadData();
    size_t crossThreadDataSize = kernels[3]->getCrossThreadDataSize();
    constexpr size_t inlineSize = sizeof(INLINE_DATA);
    // First GRF is carried inline by the walker...
    EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadData, inlineSize));
    // ...and whatever is left lands at the base of the indirect heap.
    EXPECT_EQ(0, memcmp(ih.getCpuBase(), crossThreadData + inlineSize, crossThreadDataSize - inlineSize));

    expectMemory<FamilyType>(variables[3].destMemory, variables[3].expectedMemory, variables[3].sizeWrittenMemory);
    expectMemory<FamilyType>(variables[3].remainderDestMemory, variables[3].expectedRemainderMemory, variables[3].sizeRemainderMemory);
}
// Fixture enabling HW-generated local ids; kernel 2 writes a scalar argument
// into a 4KB destination buffer across 256 work items.
struct HwLocalIdsFixture : AubDispatchThreadDataFixture {
    void SetUp() override {
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        DebugManager.flags.EnableHwGenerationLocalIds.set(1);
        initializeKernel2Variables();
        AubDispatchThreadDataFixture::SetUp();
        // Inline data can only be enabled after SetUp, once kernel metadata is loaded.
        if (kernels[2]->getKernelInfo().kernelDescriptor.kernelAttributes.flags.passInlineData) {
            DebugManager.flags.EnablePassInlineData.set(true);
        }
        setUpKernel2();
    }
    void initializeKernel2Variables() {
        kernelIds |= (1 << 2);
        auto &vars = variables[2];
        vars.sizeUserMemory = 4096;
        vars.scalarArg = 0xAA;
        vars.typeSize = sizeof(unsigned int);
        vars.gwsSize = 256;
        vars.lwsSize = 32;
    }
    // Bind kernel 2's arguments and build the expected/remainder shadow
    // buffers: written region filled with scalarArg, the rest stays 0xFE.
    void setUpKernel2() {
        auto &vars = variables[2];
        memset(vars.destMemory, 0xFE, vars.sizeUserMemory);
        kernels[2]->setArg(0, sizeof(vars.scalarArg), &vars.scalarArg);
        kernels[2]->setArg(1, vars.destBuffer);
        vars.sizeWrittenMemory = vars.gwsSize * vars.typeSize;
        vars.expectedMemory = alignedMalloc(vars.sizeWrittenMemory, 4096);
        auto expectedData = static_cast<unsigned int *>(vars.expectedMemory);
        for (size_t i = 0; i < vars.gwsSize; i++) {
            expectedData[i] = vars.scalarArg;
        }
        vars.sizeRemainderMemory = vars.sizeUserMemory - vars.sizeWrittenMemory;
        vars.expectedRemainderMemory = alignedMalloc(vars.sizeRemainderMemory, 4096);
        memcpy_s(vars.expectedRemainderMemory,
                 vars.sizeRemainderMemory,
                 vars.destMemory,
                 vars.sizeRemainderMemory);
        vars.remainderDestMemory = static_cast<char *>(vars.destMemory) + vars.sizeWrittenMemory;
    }
};
using XeHPAndLaterAubHwLocalIdsTest = Test<HwLocalIdsFixture>;
// Pow2 GWS/LWS: walker must request HW local-id generation and its kernel
// start pointer must skip the per-thread-data load prologue.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubHwLocalIdsTest, WhenEnqueueDimensionsArePow2ThenSetEmitLocalIdsAndGenerateLocalIdsFields) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {variables[2].gwsSize, 1, 1};
    size_t localWorkSize[3] = {variables[2].lwsSize, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[2].get(), 1, globalWorkOffset,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    parser.findHardwareCommands<FamilyType>();
    EXPECT_NE(parser.itorWalker, parser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*parser.itorWalker);

    const auto &localId = kernels[2]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(1u, walker->getGenerateLocalId());

    // With HW-generated ids the ISA prologue that loads per-thread data is
    // skipped, so the start pointer is offset by skipPerThreadDataLoad.
    auto kernelAllocationGpuAddr = kernels[2]->getKernelInfo().kernelAllocation->getGpuAddressToPatch();
    auto skipOffset = kernels[2]->getKernelInfo().kernelDescriptor.entryPoints.skipPerThreadDataLoad;
    uint64_t kernelStartPointer = kernelAllocationGpuAddr + skipOffset;
    INTERFACE_DESCRIPTOR_DATA &idd = walker->getInterfaceDescriptor();
    EXPECT_EQ(static_cast<uint32_t>(kernelStartPointer), idd.getKernelStartPointer());

    pCmdQ->flush();
    expectMemory<FamilyType>(variables[2].destMemory, variables[2].expectedMemory, variables[2].sizeWrittenMemory);
    expectMemory<FamilyType>(variables[2].remainderDestMemory, variables[2].expectedRemainderMemory, variables[2].sizeRemainderMemory);
}
// Non-pow2 LWS (200) with a compatible walk order: HW local-id generation must
// still be used and the output data must verify.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubHwLocalIdsTest, givenNonPowOf2LocalWorkSizeButCompatibleWorkOrderWhenLocalIdsAreUsedThenDataVerifiesCorrectly) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    size_t globalWorkSize[3] = {200, 1, 1};
    size_t localWorkSize[3] = {200, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[2].get(), 1, nullptr,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    parser.findHardwareCommands<FamilyType>();
    EXPECT_NE(parser.itorWalker, parser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*parser.itorWalker);

    const auto &localId = kernels[2]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(1u, walker->getGenerateLocalId());
    EXPECT_EQ(4u, walker->getWalkOrder());

    pCmdQ->flush();
    // Only the written region (gws * typeSize) is verified here.
    expectMemory<FamilyType>(variables[2].destMemory, variables[2].expectedMemory, globalWorkSize[0] * variables[2].typeSize);
}
// Fixture pairing HW-generated local ids with a kernel that uses subgroups
// (kernel id 9). The destination buffer is zeroed here; expectedMemory is
// allocated here but filled by the test itself before comparison.
struct HwLocalIdsWithSubGroups : AubDispatchThreadDataFixture {
void SetUp() override {
debugRestorer = std::make_unique<DebugManagerStateRestore>();
DebugManager.flags.EnableHwGenerationLocalIds.set(1);
kernelIds |= (1 << 9);
variables[0].sizeUserMemory = 16 * KB;
AubDispatchThreadDataFixture::SetUp();
memset(variables[0].destMemory, 0, variables[0].sizeUserMemory);
variables[0].expectedMemory = alignedMalloc(variables[0].sizeUserMemory, 4096);
kernels[9]->setArg(0, variables[0].destBuffer);
}
};
using XeHPAndLaterAubHwLocalIdsWithSubgroupsTest = Test<HwLocalIdsWithSubGroups>;
// Subgroup kernel with HW-generated local ids: each work item writes its local
// id, so the destination must contain the sequence 0..lws-1.
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterAubHwLocalIdsWithSubgroupsTest, givenKernelUsingSubgroupsWhenLocalIdsAreGeneratedByHwThenValuesAreCorrect) {
    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    size_t globalWorkSize[3] = {200, 1, 1};
    size_t localWorkSize[3] = {200, 1, 1};
    auto retVal = pCmdQ->enqueueKernel(kernels[9].get(), 1, nullptr,
                                       globalWorkSize, localWorkSize, 0, nullptr, nullptr);
    ASSERT_EQ(CL_SUCCESS, retVal);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(pCmdQ->getCS(0), 0);
    parser.findHardwareCommands<FamilyType>();
    EXPECT_NE(parser.itorWalker, parser.cmdList.end());
    auto walker = genCmdCast<WALKER_TYPE *>(*parser.itorWalker);

    const auto &localId = kernels[9]->getKernelInfo().kernelDescriptor.kernelAttributes.localId;
    uint32_t expectedEmitLocal = 0;
    for (uint32_t dim = 0; dim < 3; dim++) {
        if (localId[dim]) {
            expectedEmitLocal |= (1u << dim);
        }
    }
    EXPECT_EQ(expectedEmitLocal, walker->getEmitLocalId());
    EXPECT_EQ(1u, walker->getGenerateLocalId());
    EXPECT_EQ(4u, walker->getWalkOrder());

    pCmdQ->finish();

    // Build the expected sequence of local ids 0..199 on the host side.
    auto expectedIds = reinterpret_cast<uint32_t *>(variables[0].expectedMemory);
    for (uint32_t workItem = 0; workItem < localWorkSize[0]; workItem++) {
        expectedIds[workItem] = workItem;
    }
    expectMemory<FamilyType>(variables[0].destMemory, variables[0].expectedMemory, localWorkSize[0] * sizeof(uint32_t));
}

View File

@@ -0,0 +1,620 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_container/walker_partition_xehp_and_later.h"
#include "shared/source/command_stream/aub_command_stream_receiver_hw.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/helpers/cl_memory_properties_helpers.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/test/unit_test/aub_tests/fixtures/multicontext_aub_fixture.h"
#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
using namespace NEO;
// Harness templated on tile count and the set of enabled command streamers.
// The MulticontextAubFixture base provides tileDevices and a commandQueues
// matrix indexed [tile][engine]; each helper submits work on every pair.
template <uint32_t numberOfTiles, MulticontextAubFixture::EnabledCommandStreamers enabledCommandStreamers>
struct MultitileMulticontextTests : public MulticontextAubFixture, public ::testing::Test {
void SetUp() override {
MulticontextAubFixture::SetUp(numberOfTiles, enabledCommandStreamers, false);
}
void TearDown() override {
MulticontextAubFixture::TearDown();
}
// Writes a known pattern through every queue into a context-wide buffer and a
// tile-local buffer, then verifies both with per-(tile, engine) expectMemory.
template <typename FamilyType>
void runAubTest() {
cl_int retVal = CL_SUCCESS;
const uint32_t bufferSize = 64 * KB;
uint8_t writePattern[bufferSize];
uint8_t initPattern[bufferSize];
std::fill(writePattern, writePattern + sizeof(writePattern), 1);
std::fill(initPattern, initPattern + sizeof(initPattern), 0);
std::vector<std::vector<std::unique_ptr<Buffer>>> regularBuffers;
std::vector<std::vector<std::unique_ptr<Buffer>>> tileOnlyBuffers;
cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
regularBuffers.resize(tileDevices.size());
tileOnlyBuffers.resize(tileDevices.size());
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
// CPU copy is allowed only for the initial fill; it is disabled again below
// so the enqueueWriteBuffer calls go through the GPU path under test.
DebugManager.flags.DoCpuCopyOnWriteBuffer.set(true);
auto memoryProperties = ClMemoryPropertiesHelper::createMemoryProperties(CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0, 0,
&context->getDevice(0)->getDevice());
auto regularBuffer = Buffer::create(
context.get(), memoryProperties, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 0, bufferSize, initPattern, retVal);
// Tile-only buffer is associated with this tile's nearest generic subdevice.
auto tileOnlyProperties = ClMemoryPropertiesHelper::createMemoryProperties(
flags, 0, 0, context->getDevice(0)->getDevice().getNearestGenericSubDevice(tile));
auto tileOnlyBuffer = Buffer::create(context.get(), tileOnlyProperties, flags, 0, bufferSize, initPattern, retVal);
DebugManager.flags.DoCpuCopyOnWriteBuffer.set(false);
regularBuffer->forceDisallowCPUCopy = true;
tileOnlyBuffer->forceDisallowCPUCopy = true;
regularBuffers[tile].push_back(std::unique_ptr<Buffer>(regularBuffer));
tileOnlyBuffers[tile].push_back(std::unique_ptr<Buffer>(tileOnlyBuffer));
commandQueues[tile][tileEngine]->enqueueWriteBuffer(regularBuffer, CL_FALSE, 0, bufferSize, writePattern, nullptr, 0, nullptr, nullptr);
commandQueues[tile][tileEngine]->enqueueWriteBuffer(tileOnlyBuffer, CL_FALSE, 0, bufferSize, writePattern, nullptr, 0, nullptr, nullptr);
commandQueues[tile][tileEngine]->flush();
}
}
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
// Wait for the simulated CSR before checking memory contents.
getSimulatedCsr<FamilyType>(tile, tileEngine)->pollForCompletion();
auto regularBufferGpuAddress = static_cast<uintptr_t>(regularBuffers[tile][tileEngine]->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
auto tileOnlyBufferGpuAddress = static_cast<uintptr_t>(tileOnlyBuffers[tile][tileEngine]->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
expectMemory<FamilyType>(reinterpret_cast<void *>(regularBufferGpuAddress), writePattern, bufferSize, tile, tileEngine);
expectMemory<FamilyType>(reinterpret_cast<void *>(tileOnlyBufferGpuAddress), writePattern, bufferSize, tile, tileEngine);
}
}
}
// Writes a 5x5 RGBA float image through every queue, reads it back, and
// verifies the read-back row by row against the source pattern.
template <typename FamilyType>
void runAubWriteImageTest() {
if (!tileDevices[0]->getSharedDeviceInfo().imageSupport) {
GTEST_SKIP();
}
cl_int retVal = CL_SUCCESS;
auto testWidth = 5u;
auto testHeight = 5u;
auto testDepth = 1u;
auto numPixels = testWidth * testHeight * testDepth;
cl_image_format imageFormat;
imageFormat.image_channel_data_type = CL_FLOAT;
imageFormat.image_channel_order = CL_RGBA;
cl_mem_flags flags = 0;
auto surfaceFormat = Image::getSurfaceFormatFromTable(flags, &imageFormat, context->getDevice(0)->getHardwareInfo().capabilityTable.supportsOcl21Features);
cl_image_desc imageDesc;
imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
imageDesc.image_width = testWidth;
imageDesc.image_height = testHeight;
imageDesc.image_depth = testDepth;
imageDesc.image_array_size = 1;
imageDesc.image_row_pitch = 0;
imageDesc.image_slice_pitch = 0;
imageDesc.num_mip_levels = 0;
imageDesc.num_samples = 0;
imageDesc.mem_object = NULL;
auto perChannelDataSize = 4u;
auto numChannels = 4u;
auto elementSize = perChannelDataSize * numChannels;
// Source pattern: byte i holds value i (mod 256).
auto srcMemory = (uint8_t *)alignedMalloc(elementSize * numPixels, MemoryConstants::pageSize);
for (size_t i = 0; i < numPixels * elementSize; ++i) {
auto origValue = static_cast<uint8_t>(i);
memcpy(srcMemory + i, &origValue, sizeof(origValue));
}
size_t origin[3] = {0, 0, 0};
const size_t region[3] = {testWidth, testHeight, testDepth};
size_t inputRowPitch = testWidth * elementSize;
size_t inputSlicePitch = inputRowPitch * testHeight;
std::vector<std::vector<std::unique_ptr<Image>>> images;
images.resize(tileDevices.size());
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
Image *dstImage = Image::create(
context.get(),
ClMemoryPropertiesHelper::createMemoryProperties(flags, 0, 0, &context->getDevice(0)->getDevice()),
flags,
0,
surfaceFormat,
&imageDesc,
nullptr,
retVal);
// NOTE(review): an early return from this ASSERT would leak srcMemory
// (alignedFree is only reached at the end) — acceptable for a failing
// test, but worth confirming.
ASSERT_NE(nullptr, dstImage);
memset(dstImage->getCpuAddress(), 0xFF, dstImage->getSize());
retVal = commandQueues[tile][tileEngine]->enqueueWriteImage(
dstImage,
CL_FALSE,
origin,
region,
inputRowPitch,
inputSlicePitch,
srcMemory,
nullptr,
0,
nullptr,
nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
images[tile].push_back(std::unique_ptr<Image>(dstImage));
}
}
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
commandQueues[tile][tileEngine]->flush();
}
}
std::unique_ptr<uint8_t[]> dstMemory;
for (uint32_t tile = 0; tile < tileDevices.size(); tile++) {
for (uint32_t tileEngine = 0; tileEngine < commandQueues[tile].size(); tileEngine++) {
dstMemory.reset(new uint8_t[images[tile][tileEngine]->getSize()]);
memset(dstMemory.get(), 0xFF, images[tile][tileEngine]->getSize());
commandQueues[tile][tileEngine]->enqueueReadImage(
images[tile][tileEngine].get(), CL_FALSE, origin, region, 0, 0, dstMemory.get(), nullptr, 0, nullptr, nullptr);
commandQueues[tile][tileEngine]->flush();
auto rowPitch = images[tile][tileEngine]->getHostPtrRowPitch();
auto slicePitch = images[tile][tileEngine]->getHostPtrSlicePitch();
auto pSrcMemory = srcMemory;
auto pDstMemory = dstMemory.get();
// Compare row by row; dst may have padding between rows/slices.
for (size_t z = 0; z < testDepth; ++z) {
for (size_t y = 0; y < testHeight; ++y) {
expectMemory<FamilyType>(pDstMemory, pSrcMemory, testWidth * elementSize, tile, tileEngine);
pSrcMemory = ptrOffset(pSrcMemory, testWidth * elementSize);
pDstMemory = ptrOffset(pDstMemory, rowPitch);
}
pDstMemory = ptrOffset(pDstMemory, slicePitch - (rowPitch * (testHeight > 0 ? testHeight : 1)));
}
}
}
alignedFree(srcMemory);
}
};
// 4 Tiles
// Same submission/verification flow (runAubTest) with progressively fewer
// enabled command streamers per tile: All -> Dual -> Single.
using FourTilesAllContextsTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using FourTilesDualContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesDualContextTest, HEAVY_givenFourTilesAndDualContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using FourTilesSingleContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
// Fixture for walker-partition enqueues on a four-tile, single-context setup:
// enables walker partitioning, builds kernels 5 and 8, and creates a 16KB
// zero-filled buffer shared by the tests via clBuffer.
// NOTE(review): EnableWalkerPartition is set without a DebugManagerStateRestore
// in this struct (only the Dynamic* subclass holds one) — confirm the flag is
// restored elsewhere between tests.
struct EnqueueWithWalkerPartitionFourTilesTests : public FourTilesSingleContextTest, SimpleKernelFixture {
void SetUp() override {
DebugManager.flags.EnableWalkerPartition.set(1u);
kernelIds |= (1 << 5);
kernelIds |= (1 << 8);
FourTilesSingleContextTest::SetUp();
SimpleKernelFixture::SetUp(rootDevice, context.get());
rootCsr = rootDevice->getDefaultEngine().commandStreamReceiver;
// Root CSR must span all four tiles for partitioned dispatch.
EXPECT_EQ(4u, rootCsr->getOsContext().getNumSupportedDevices());
engineControlForFusedQueue = {rootCsr, &rootCsr->getOsContext()};
bufferSize = 16 * MemoryConstants::kiloByte;
auto destMemory = std::make_unique<uint8_t[]>(bufferSize);
memset(destMemory.get(), 0x0, bufferSize);
cl_int retVal = CL_SUCCESS;
// CL_MEM_COPY_HOST_PTR copies destMemory, so the local allocation may go out of scope.
buffer.reset(Buffer::create(multiTileDefaultContext.get(), CL_MEM_COPY_HOST_PTR, bufferSize, destMemory.get(), retVal));
clBuffer = buffer.get();
}
void TearDown() override {
SimpleKernelFixture::TearDown();
FourTilesSingleContextTest::TearDown();
}
// Convenience accessor for AUB expectMemory against the buffer's GPU VA.
void *getGpuAddress(Buffer &buffer) {
return reinterpret_cast<void *>(buffer.getGraphicsAllocation(this->rootDeviceIndex)->getGpuAddress());
}
uint32_t bufferSize = 0;
std::unique_ptr<Buffer> buffer;
cl_mem clBuffer;
EngineControl engineControlForFusedQueue = {};
CommandStreamReceiver *rootCsr = nullptr;
};
// Variant forcing dynamic (non-static) partitioning; the restore member
// reverts all DebugManager flags touched during the test.
struct DynamicWalkerPartitionFourTilesTests : EnqueueWithWalkerPartitionFourTilesTests {
void SetUp() override {
DebugManager.flags.EnableStaticPartitioning.set(0);
EnqueueWithWalkerPartitionFourTilesTests::SetUp();
}
DebugManagerStateRestore restore{};
};
// Dispatches kernel 5 (uses an atomic) with dynamic walker partitioning across
// four tiles, then checks the cross-tile synchronization semaphore and the
// per-workgroup output data.
HWCMDTEST_F(IGFX_XE_HP_CORE, DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithAtomicThenOutputDataIsValid) {
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
// Queue ownership transfers to commandQueues[0][0] via reset().
auto mockCommandQueue = new MockCommandQueueHw<FamilyType>(multiTileDefaultContext.get(), rootDevice, nullptr);
commandQueues[0][0].reset(mockCommandQueue);
constexpr size_t globalWorkOffset[] = {0, 0, 0};
constexpr size_t gwsSize[] = {512, 1, 1};
constexpr size_t lwsSize[] = {32, 1, 1};
constexpr cl_uint workingDimensions = 1;
cl_int retVal = CL_SUCCESS;
kernels[5]->setArg(0, sizeof(cl_mem), &clBuffer);
retVal = mockCommandQueue->enqueueKernel(kernels[5].get(), workingDimensions, globalWorkOffset, gwsSize, lwsSize, 0, nullptr, nullptr);
EXPECT_EQ(CL_SUCCESS, retVal);
mockCommandQueue->flush();
HardwareParse hwParser;
auto &cmdStream = mockCommandQueue->getCS(0);
hwParser.parseCommands<FamilyType>(cmdStream, 0);
// Scan backwards for the final tile-sync semaphore (skipping helper waits);
// it must wait for all 4 tiles when a stalling pipe control is required.
bool lastSemaphoreFound = false;
uint64_t tileAtomicGpuAddress = 0;
for (auto it = hwParser.cmdList.rbegin(); it != hwParser.cmdList.rend(); it++) {
auto semaphoreCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(*it);
if (semaphoreCmd) {
if (UnitTestHelper<FamilyType>::isAdditionalMiSemaphoreWait(*semaphoreCmd)) {
continue;
}
EXPECT_EQ(4u, semaphoreCmd->getSemaphoreDataDword());
tileAtomicGpuAddress = semaphoreCmd->getSemaphoreGraphicsAddress();
lastSemaphoreFound = true;
break;
}
}
if (ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired()) {
EXPECT_TRUE(lastSemaphoreFound);
EXPECT_NE(0u, tileAtomicGpuAddress);
} else {
EXPECT_FALSE(lastSemaphoreFound);
EXPECT_EQ(0u, tileAtomicGpuAddress);
}
// Kernel writes total work-item count at offset 0.
expectMemory<FamilyType>(getGpuAddress(*buffer), &gwsSize[workingDimensions - 1], sizeof(uint32_t), 0, 0);
uint32_t expectedAtomicValue = 4;
if (ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired()) {
expectMemory<FamilyType>(reinterpret_cast<void *>(tileAtomicGpuAddress), &expectedAtomicValue, sizeof(uint32_t), 0, 0);
}
// Each workgroup counter (starting at offset 4) must equal its local size.
constexpr uint32_t workgroupCount = static_cast<uint32_t>(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]);
auto groupSpecificWorkCounts = ptrOffset(getGpuAddress(*buffer), 4);
std::array<uint32_t, workgroupCount> workgroupCounts;
std::fill(workgroupCounts.begin(), workgroupCounts.end(), static_cast<uint32_t>(lwsSize[workingDimensions - 1]));
expectMemory<FamilyType>(groupSpecificWorkCounts, &workgroupCounts[0], workgroupCounts.size() * sizeof(uint32_t), 0, 0);
}
// Dynamic walker partitioning with a kernel that uses no atomics (kernels[8]):
// the kernel receives kernelIncrementCounter as its second argument and every
// workgroup is expected to write that value into its buffer slot, so the whole
// output buffer must contain kernelIncrementCounter after the enqueue.
// Fix: removed the unused local alias MI_SEMAPHORE_WAIT — unlike the atomic
// variant above, this test never parses the command stream.
HWCMDTEST_F(IGFX_XE_HP_CORE, DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithoutAtomicThenOutputDataIsValid) {
    auto mockCommandQueue = new MockCommandQueueHw<FamilyType>(multiTileDefaultContext.get(), rootDevice, nullptr);
    commandQueues[0][0].reset(mockCommandQueue); // queue container takes ownership

    constexpr size_t globalWorkOffset[3] = {0, 0, 0};
    constexpr size_t gwsSize[3] = {1024, 1, 1};
    constexpr size_t lwsSize[3] = {32, 1, 1};
    constexpr cl_uint workingDimensions = 1;
    cl_uint kernelIncrementCounter = 1024;
    cl_int retVal = CL_SUCCESS;

    kernels[8]->setArg(0, sizeof(cl_mem), &clBuffer);
    kernels[8]->setArg(1, kernelIncrementCounter);
    retVal = mockCommandQueue->enqueueKernel(kernels[8].get(), workingDimensions, globalWorkOffset, gwsSize, lwsSize, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    mockCommandQueue->flush();

    // 1024 / 32 = 32 workgroups; every slot must equal kernelIncrementCounter.
    constexpr uint32_t workgroupCount = static_cast<uint32_t>(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]);
    std::array<uint32_t, workgroupCount> workgroupCounts;
    std::fill(workgroupCounts.begin(), workgroupCounts.end(), kernelIncrementCounter);
    expectMemory<FamilyType>(getGpuAddress(*buffer), &workgroupCounts[0], workgroupCounts.size() * sizeof(uint32_t), 0, 0);
}
// Four-tile fixture variant that forces static walker partitioning and enables
// blitter support. Also provides helpers to build, submit and verify a
// hand-crafted command stream through the root (multi-tile) CSR.
struct StaticWalkerPartitionFourTilesTests : EnqueueWithWalkerPartitionFourTilesTests {
void SetUp() override {
// Must precede base SetUp so CSR creation observes the flags.
DebugManager.flags.EnableStaticPartitioning.set(1);
DebugManager.flags.EnableBlitterOperationsSupport.set(1);
EnqueueWithWalkerPartitionFourTilesTests::SetUp();
}
// Allocates a page-sized COMMAND_BUFFER allocation and wraps it in a
// LinearStream. Caller must release it via destroyTaskStream().
std::unique_ptr<LinearStream> createTaskStream() {
const AllocationProperties commandStreamAllocationProperties{rootDevice->getRootDeviceIndex(),
true,
MemoryConstants::pageSize,
GraphicsAllocation::AllocationType::COMMAND_BUFFER,
true,
false,
rootDevice->getDeviceBitfield()};
GraphicsAllocation *streamAllocation = rootDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(commandStreamAllocationProperties);
return std::make_unique<LinearStream>(streamAllocation);
}
// Frees the allocation backing a stream created by createTaskStream().
void destroyTaskStream(LinearStream &stream) {
rootDevice->getMemoryManager()->freeGraphicsMemory(stream.getGraphicsAllocation());
}
// Submits the stream through the root CSR with default dispatch flags
// (plus a guarding pipe control) and flushes any batched submissions.
void flushTaskStream(LinearStream &stream) {
DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
dispatchFlags.guardCommandBufferWithPipeControl = true;
rootCsr->flushTask(stream, 0,
rootCsr->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u),
rootCsr->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u),
rootCsr->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
0u, dispatchFlags, rootDevice->getDevice());
rootCsr->flushBatchedSubmissions();
}
// Compares AUB memory through the root CSR's simulated stream.
template <typename FamilyType>
void expectMemoryOnRootCsr(void *gfxAddress, const void *srcAddress, size_t length) {
auto csr = static_cast<AUBCommandStreamReceiverHw<FamilyType> *>(rootCsr);
csr->expectMemoryEqual(gfxAddress, srcAddress, length);
}
DebugManagerStateRestore restore{}; // restores debug flags at teardown
};
// Static walker partitioning across four tiles with kernels[8]: every
// workgroup writes the kernelIncrementCounter argument into its output slot,
// so the whole buffer must hold that value. Verification goes through
// expectMemoryOnRootCsr, i.e. the root CSR's AUB stream.
// Fix: removed the unused local alias MI_SEMAPHORE_WAIT — this test performs
// no command-stream parsing.
HWCMDTEST_F(IGFX_XE_HP_CORE, StaticWalkerPartitionFourTilesTests, givenFourTilesWhenStaticWalkerPartitionIsEnabledForKernelThenOutputDataIsValid) {
    auto mockCommandQueue = new MockCommandQueueHw<FamilyType>(multiTileDefaultContext.get(), rootDevice, nullptr);
    commandQueues[0][0].reset(mockCommandQueue); // queue container takes ownership

    constexpr size_t globalWorkOffset[3] = {0, 0, 0};
    constexpr size_t gwsSize[3] = {1024, 1, 1};
    constexpr size_t lwsSize[3] = {32, 1, 1};
    constexpr cl_uint workingDimensions = 1;
    cl_uint kernelIncrementCounter = 1024;
    cl_int retVal = CL_SUCCESS;

    kernels[8]->setArg(0, sizeof(cl_mem), &clBuffer);
    kernels[8]->setArg(1, kernelIncrementCounter);
    retVal = mockCommandQueue->enqueueKernel(kernels[8].get(), workingDimensions, globalWorkOffset, gwsSize, lwsSize, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    mockCommandQueue->flush();

    // 1024 / 32 = 32 workgroups; every slot must equal kernelIncrementCounter.
    constexpr uint32_t workgroupCount = static_cast<uint32_t>(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]);
    std::array<uint32_t, workgroupCount> workgroupCounts;
    std::fill(workgroupCounts.begin(), workgroupCounts.end(), kernelIncrementCounter);
    expectMemoryOnRootCsr<FamilyType>(getGpuAddress(*buffer), &workgroupCounts[0], workgroupCounts.size() * sizeof(uint32_t));
}
// Builds a statically-partitioned walker command buffer by hand with
// synchronizeBeforeExecution enabled, submits it through the root CSR, and
// checks that both the pre- and post-walker sync atomics reached 4
// (one increment per tile on a four-tile device bitfield).
HWCMDTEST_F(IGFX_XE_HP_CORE, StaticWalkerPartitionFourTilesTests, givenPreWalkerSyncWhenStaticWalkerPartitionIsThenAtomicsAreIncrementedCorrectly) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto taskStream = createTaskStream();
auto taskStreamCpu = taskStream->getSpace(0); // current CPU write pointer, nothing reserved yet
auto taskStreamGpu = taskStream->getGraphicsAllocation()->getGpuAddress();
uint32_t totalBytesProgrammed = 0u;
WALKER_TYPE walkerCmd = FamilyType::cmdInitGpgpuWalker;
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X);
walkerCmd.getInterfaceDescriptor().setNumberOfThreadsInGpgpuThreadGroup(1u);
// Walker-partition arguments: one partition per tile, static partitioning,
// with cross-tile synchronization both before and after the walker.
WalkerPartition::WalkerPartitionArgs testArgs = {};
testArgs.initializeWparidRegister = true;
testArgs.crossTileAtomicSynchronization = true;
testArgs.emitPipeControlStall = true;
testArgs.tileCount = static_cast<uint32_t>(rootDevice->getDeviceBitfield().count());
testArgs.partitionCount = testArgs.tileCount;
testArgs.synchronizeBeforeExecution = true;
testArgs.secondaryBatchBuffer = false;
testArgs.emitSelfCleanup = false;
testArgs.staticPartitioning = true;
testArgs.workPartitionAllocationGpuVa = rootCsr->getWorkPartitionAllocationGpuAddress();
WalkerPartition::constructStaticallyPartitionedCommandBuffer<FamilyType>(
taskStreamCpu,
taskStreamGpu,
&walkerCmd,
totalBytesProgrammed,
testArgs,
*defaultHwInfo);
taskStream->getSpace(totalBytesProgrammed); // commit the bytes written above
flushTaskStream(*taskStream);
// The control section (sync counters) lives at a computed offset in the stream.
const auto controlSectionAddress = taskStreamGpu + WalkerPartition::computeStaticPartitioningControlSectionOffset<FamilyType>(testArgs);
const auto preWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
const auto postWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
// Both counters must equal the tile count (4) after execution.
uint32_t expectedValue = 0x4;
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(preWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(postWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
destroyTaskStream(*taskStream);
}
// Same hand-built static-partitioning scenario as above, but with
// synchronizeBeforeExecution disabled: the pre-walker counter must remain 0
// while the post-walker counter still reaches 4.
HWCMDTEST_F(IGFX_XE_HP_CORE, StaticWalkerPartitionFourTilesTests, whenNoPreWalkerSyncThenAtomicsAreIncrementedCorrectly) {
using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
auto taskStream = createTaskStream();
auto taskStreamCpu = taskStream->getSpace(0); // current CPU write pointer, nothing reserved yet
auto taskStreamGpu = taskStream->getGraphicsAllocation()->getGpuAddress();
uint32_t totalBytesProgrammed = 0u;
WALKER_TYPE walkerCmd = FamilyType::cmdInitGpgpuWalker;
walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X);
walkerCmd.getInterfaceDescriptor().setNumberOfThreadsInGpgpuThreadGroup(1u);
// Identical to the pre-walker-sync test except synchronizeBeforeExecution.
WalkerPartition::WalkerPartitionArgs testArgs = {};
testArgs.initializeWparidRegister = true;
testArgs.crossTileAtomicSynchronization = true;
testArgs.emitPipeControlStall = true;
testArgs.tileCount = static_cast<uint32_t>(rootDevice->getDeviceBitfield().count());
testArgs.partitionCount = testArgs.tileCount;
testArgs.synchronizeBeforeExecution = false;
testArgs.secondaryBatchBuffer = false;
testArgs.emitSelfCleanup = false;
testArgs.staticPartitioning = true;
testArgs.workPartitionAllocationGpuVa = rootCsr->getWorkPartitionAllocationGpuAddress();
WalkerPartition::constructStaticallyPartitionedCommandBuffer<FamilyType>(
taskStreamCpu,
taskStreamGpu,
&walkerCmd,
totalBytesProgrammed,
testArgs,
*defaultHwInfo);
taskStream->getSpace(totalBytesProgrammed); // commit the bytes written above
flushTaskStream(*taskStream);
const auto controlSectionAddress = taskStreamGpu + WalkerPartition::computeStaticPartitioningControlSectionOffset<FamilyType>(testArgs);
const auto preWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeBeforeWalkerCounter);
const auto postWalkerSyncAddress = controlSectionAddress + offsetof(WalkerPartition::StaticPartitioningControlSection, synchronizeAfterWalkerCounter);
// No pre-walker sync was requested, so its counter stays untouched (0)...
uint32_t expectedValue = 0x0;
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(preWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
// ...while the post-walker counter still accumulates one increment per tile.
expectedValue = 0x4;
expectMemoryOnRootCsr<FamilyType>(reinterpret_cast<void *>(postWalkerSyncAddress), &expectedValue, sizeof(expectedValue));
destroyTaskStream(*taskStream);
}
// 2 Tiles
// Submission smoke tests: run the shared runAubTest scenario on a two-tile
// configuration with All / Dual / Single command streamers enabled per tile.
using TwoTilesAllContextsTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesAllContextsTest, HEAVY_givenTwoTilesAndAllContextsWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using TwoTilesDualContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using TwoTilesSingleContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
// 1 Tile
// Same submission scenario on a single-tile configuration.
using SingleTileAllContextsTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileAllContextsTest, GENERATEONLY_givenSingleTileAndAllContextsWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
using SingleTileDualContextTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileDualContextTest, givenSingleTileAndDualContextWhenSubmittingThenDataIsValid) {
runAubTest<FamilyType>();
}
// Writes the two halves of one buffer from two different contexts (queues),
// completing the second submission first, then verifies each half through the
// context that wrote it.
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileDualContextTest, givenSingleAllocationWhenUpdatedFromDifferentContextThenDataIsValid) {
    cl_int retVal = CL_SUCCESS;
    const uint32_t totalSize = 256;
    const uint32_t halfSize = totalSize / 2;

    uint8_t zeroedMemory[totalSize];
    uint8_t firstHalfPattern[halfSize];
    uint8_t secondHalfPattern[halfSize];
    std::fill(std::begin(zeroedMemory), std::end(zeroedMemory), 0);
    std::fill(std::begin(firstHalfPattern), std::end(firstHalfPattern), 1);
    std::fill(std::begin(secondHalfPattern), std::end(secondHalfPattern), 2);

    std::unique_ptr<Buffer> buffer(Buffer::create(context.get(), CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, totalSize, zeroedMemory, retVal));
    buffer->forceDisallowCPUCopy = true; // make sure the GPU path is exercised

    auto simulatedCsr0 = getSimulatedCsr<FamilyType>(0, 0);
    auto simulatedCsr1 = getSimulatedCsr<FamilyType>(0, 1);
    simulatedCsr0->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
    simulatedCsr1->overrideDispatchPolicy(DispatchMode::BatchedDispatch);

    commandQueues[0][0]->enqueueWriteBuffer(buffer.get(), CL_FALSE, 0, halfSize, firstHalfPattern, nullptr, 0, nullptr, nullptr);
    commandQueues[0][1]->enqueueWriteBuffer(buffer.get(), CL_FALSE, halfSize, halfSize, secondHalfPattern, nullptr, 0, nullptr, nullptr);

    // Finish the second queue first to make sure the residency flow is correct.
    commandQueues[0][1]->finish();
    commandQueues[0][0]->finish();

    auto gpuPtr = reinterpret_cast<void *>(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress());
    expectMemory<FamilyType>(gpuPtr, firstHalfPattern, halfSize, 0, 0);
    expectMemory<FamilyType>(ptrOffset(gpuPtr, halfSize), secondHalfPattern, halfSize, 0, 1);
}
// 1 Tile
// Image-write variants on a single tile. Note: these aliases re-declare the
// identical aliases defined earlier in this file, which is legal in C++.
using SingleTileDualContextTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileDualContextTest, givenSingleTileAndDualContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using SingleTileAllContextsTest = MultitileMulticontextTests<1, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, SingleTileAllContextsTest, HEAVY_givenSingleTileAndAllContextsWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
// 2 Tiles
// Image-write variants on a two-tile configuration (aliases re-declared
// identically to the earlier definitions, which is legal in C++).
using TwoTilesSingleContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using TwoTilesDualContextTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using TwoTilesAllContextsTest = MultitileMulticontextTests<2, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, TwoTilesAllContextsTest, GENERATEONLY_givenTwoTilesAndAllContextsWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
// 4 Tiles
// Image-write variants on a four-tile configuration.
using FourTilesSingleContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Single>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using FourTilesDualContextTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::Dual>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesDualContextTest, GENERATEONLY_givenFourTilesAndDualContextWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}
using FourTilesAllContextsTest = MultitileMulticontextTests<4, MulticontextAubFixture::EnabledCommandStreamers::All>;
HWCMDTEST_F(IGFX_XE_HP_CORE, FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenWritingImageThenDataIsValid) {
runAubWriteImageTest<FamilyType>();
}

View File

@@ -0,0 +1,133 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/test/common/test_macros/test.h"
#include "shared/test/unit_test/tests_configuration.h"
#include "shared/test/unit_test/utilities/base_object_utils.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/multicontext_aub_fixture.h"
using namespace NEO;
// Fixture for tests that exercise one GPU virtual address backed by four
// per-tile physical storages (memory banks), using a single command streamer
// per tile.
struct OneVAFourPhysicalStoragesTest : public MulticontextAubFixture, public ::testing::Test {
static const uint32_t numTiles = 4;
void SetUp() override {
MulticontextAubFixture::SetUp(numTiles, MulticontextAubFixture::EnabledCommandStreamers::Single, false);
}
void TearDown() override {
MulticontextAubFixture::TearDown();
}
};
// Writes a distinct pattern directly into each tile's physical bank (via the
// simulated hardware context) behind one shared GPU VA, then reads the buffer
// back from each tile's queue and expects each read to return that tile's
// bank contents.
HWCMDTEST_F(IGFX_XE_HP_CORE, OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueReadBufferThenReadFromCorrectBank) {
if (is32bit) {
return; // scenario is only meaningful on 64-bit builds
}
cl_int retVal = CL_OUT_OF_HOST_MEMORY;
const uint32_t bufferSize = MemoryConstants::pageSize64k;
uint8_t *memoryToWrite[numTiles];
uint8_t *memoryToRead[numTiles];
auto buffer = clUniquePtr<Buffer>(Buffer::create(context.get(), {}, bufferSize, nullptr, retVal));
EXPECT_EQ(CL_SUCCESS, retVal);
buffer->forceDisallowCPUCopy = true; // force the GPU copy path
auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
EXPECT_EQ(MemoryPool::LocalMemory, allocation->getMemoryPool());
auto gpuAddress = allocation->getGpuAddress();
// Detach the allocation from automatic bank handling so the per-bank
// writeMemory2 calls below fully control the physical contents.
allocation->storageInfo.cloningOfPageTables = false;
allocation->storageInfo.memoryBanks = 0;
allocation->setAubWritable(false, static_cast<uint32_t>(maxNBitValue(numTiles)));
for (uint32_t tile = 0; tile < numTiles; tile++) {
memoryToWrite[tile] = reinterpret_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
std::fill(memoryToWrite[tile], ptrOffset(memoryToWrite[tile], bufferSize), tile + 1);
// Write pattern (tile + 1) straight into this tile's bank (1u << tile).
auto hardwareContext = getSimulatedCsr<FamilyType>(tile, 0)->hardwareContextController->hardwareContexts[0].get();
hardwareContext->writeMemory2({gpuAddress, memoryToWrite[tile], bufferSize, (1u << tile), AubMemDump::DataTypeHintValues::TraceNotype, MemoryConstants::pageSize64k});
}
for (uint32_t tile = 0; tile < numTiles; tile++) {
memoryToRead[tile] = reinterpret_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
commandQueues[tile][0]->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, bufferSize, memoryToRead[tile], nullptr, 0, nullptr, nullptr);
commandQueues[tile][0]->flush();
}
// Each tile's read must match the pattern previously written to its bank.
for (uint32_t tile = 0; tile < numTiles; tile++) {
expectMemory<FamilyType>(memoryToRead[tile], memoryToWrite[tile], bufferSize, tile, 0);
alignedFree(memoryToWrite[tile]);
alignedFree(memoryToRead[tile]);
}
}
// Enqueues a distinct write pattern from each tile's queue into the same GPU
// VA and verifies that each tile's physical bank ended up holding the pattern
// written through that tile.
HWCMDTEST_F(IGFX_XE_HP_CORE, OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank) {
    if (is32bit) {
        return; // scenario is only meaningful on 64-bit builds
    }
    cl_int retVal = CL_OUT_OF_HOST_MEMORY;
    const uint32_t bufferSize = MemoryConstants::pageSize64k;
    uint8_t *hostPatterns[numTiles];

    auto buffer = clUniquePtr<Buffer>(Buffer::create(context.get(), {}, bufferSize, nullptr, retVal));
    EXPECT_EQ(CL_SUCCESS, retVal);
    buffer->forceDisallowCPUCopy = true; // force the GPU copy path

    auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
    EXPECT_EQ(MemoryPool::LocalMemory, allocation->getMemoryPool());
    auto gpuAddress = allocation->getGpuAddress();
    allocation->storageInfo.cloningOfPageTables = false;
    allocation->storageInfo.memoryBanks = 0;

    for (uint32_t bank = 0; bank < numTiles; bank++) {
        hostPatterns[bank] = static_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
        std::fill(hostPatterns[bank], ptrOffset(hostPatterns[bank], bufferSize), bank + 1);
        allocation->setAubWritable(true, 0xffffffff); // re-arm AUB dump before each write
        commandQueues[bank][0]->enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, hostPatterns[bank], nullptr, 0, nullptr, nullptr);
    }
    for (uint32_t bank = 0; bank < numTiles; bank++) {
        expectMemory<FamilyType>(reinterpret_cast<void *>(gpuAddress), hostPatterns[bank], bufferSize, bank, 0);
        alignedFree(hostPatterns[bank]);
    }
}
// A buffer of numTiles * 64KB is expected to be "coloured" across all banks
// (one 64KB chunk per bank). One write from a single queue must land
// correctly, and every bank must observe the full coloured contents.
HWCMDTEST_F(IGFX_XE_HP_CORE, OneVAFourPhysicalStoragesTest, givenColouredBufferWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank) {
if (is32bit) {
return; // scenario is only meaningful on 64-bit builds
}
cl_int retVal = CL_OUT_OF_HOST_MEMORY;
const uint32_t bufferSize = numTiles * MemoryConstants::pageSize64k;
const auto allTilesValue = maxNBitValue(numTiles);
uint8_t *memoryToWrite = reinterpret_cast<uint8_t *>(alignedMalloc(bufferSize, MemoryConstants::pageSize64k));
auto buffer = clUniquePtr<Buffer>(Buffer::create(context.get(), {}, bufferSize, nullptr, retVal));
EXPECT_EQ(CL_SUCCESS, retVal);
buffer->forceDisallowCPUCopy = true; // force the GPU copy path
auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex);
EXPECT_EQ(MemoryPool::LocalMemory, allocation->getMemoryPool());
// Sanity: the allocation spans all banks with page tables visible/cloned everywhere.
EXPECT_EQ(allTilesValue, allocation->storageInfo.memoryBanks.to_ullong());
EXPECT_EQ(allTilesValue, allocation->storageInfo.pageTablesVisibility.to_ullong());
EXPECT_TRUE(allocation->storageInfo.cloningOfPageTables);
// Fill chunk i (64KB) with value i + 1 so each coloured chunk is distinct.
for (uint32_t tile = 0; tile < numTiles; tile++) {
std::fill(ptrOffset(memoryToWrite, tile * MemoryConstants::pageSize64k), ptrOffset(memoryToWrite, (tile + 1) * MemoryConstants::pageSize64k), tile + 1);
}
commandQueues[0][0]->enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, memoryToWrite, nullptr, 0, nullptr, nullptr);
auto gpuAddress = allocation->getGpuAddress();
// Every bank must see the complete coloured pattern, chunk by chunk.
for (uint32_t tile = 0; tile < numTiles; tile++) {
for (uint32_t offset = 0; offset < bufferSize; offset += MemoryConstants::pageSize64k) {
expectMemory<FamilyType>(reinterpret_cast<void *>(gpuAddress + offset), ptrOffset(memoryToWrite, offset), MemoryConstants::pageSize64k, tile, 0);
}
}
alignedFree(memoryToWrite);
}

View File

@@ -0,0 +1,152 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/timestamp_packet.h"
#include "shared/source/utilities/tag_allocator.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_device.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/hello_world_fixture.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
using namespace NEO;
// Fixture for post-sync (timestamp packet) write tests: enables timestamp
// packets via debug flag before the HelloWorld fixture creates the CSR.
struct PostSyncWriteXeHPTests : public HelloWorldFixture<AUBHelloWorldFixtureFactory>, public ::testing::Test {
void SetUp() override {
DebugManager.flags.EnableTimestampPacket.set(true);
HelloWorldFixture<AUBHelloWorldFixtureFactory>::SetUp();
// Sanity: the flag must have taken effect on the CSR.
EXPECT_TRUE(pCommandStreamReceiver->peekTimestampPacketWriteEnabled());
};
void TearDown() override {
HelloWorldFixture<AUBHelloWorldFixtureFactory>::TearDown();
}
DebugManagerStateRestore restore; // restores debug flags at teardown
cl_int retVal = CL_SUCCESS;
};
// With timestamp packets enabled, a blocking write must both land the data in
// the buffer and overwrite the timestamp packet's initial {1,1,1,1} contents.
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenTimestampWriteEnabledWhenEnqueueingThenWritePostsyncOperation) {
    MockCommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, nullptr);

    const uint32_t bufferSize = 4;
    std::unique_ptr<Buffer> buffer(Buffer::create(pContext, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
    auto allocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex());
    memset(allocation->getUnderlyingBuffer(), 0, allocation->getUnderlyingBufferSize());
    buffer->forceDisallowCPUCopy = true; // make sure the GPU copy path runs

    uint8_t inputData[bufferSize] = {1, 2, 3, 4};
    cmdQ.enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, inputData, nullptr, 0, nullptr, nullptr);

    // The data itself must have been written...
    expectMemory<FamilyType>(reinterpret_cast<void *>(allocation->getGpuAddress()), inputData, bufferSize);

    // ...and the GPU must have updated the timestamp packet away from its
    // initial value (all four fields start at 1).
    typename FamilyType::TimestampPacketType initialTimestamps[4] = {1, 1, 1, 1};
    auto tagGpuAddress = reinterpret_cast<void *>(cmdQ.timestampPacketContainer->peekNodes().at(0)->getGpuAddress());
    expectMemoryNotEqual<FamilyType>(tagGpuAddress, initialTimestamps, 4 * sizeof(typename FamilyType::TimestampPacketType));
}
// With UseImmDataWriteModeOnPostSyncOperation enabled, the post-sync writes
// use immediate data, so the timestamp packet contents become exactly
// predictable; the expected layout depends on the packet field width.
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenDebugVariableEnabledWhenEnqueueingThenWritePostsyncOperationInImmWriteMode) {
DebugManager.flags.UseImmDataWriteModeOnPostSyncOperation.set(true);
MockCommandQueueHw<FamilyType> cmdQ(pContext, pClDevice, nullptr);
const uint32_t bufferSize = 4;
std::unique_ptr<Buffer> buffer(Buffer::create(pContext, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
auto graphicsAllocation = buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex());
memset(graphicsAllocation->getUnderlyingBuffer(), 0, graphicsAllocation->getUnderlyingBufferSize());
buffer->forceDisallowCPUCopy = true; // make sure the GPU copy path runs
uint8_t writeData[bufferSize] = {1, 2, 3, 4};
cmdQ.enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, writeData, nullptr, 0, nullptr, nullptr);
expectMemory<FamilyType>(reinterpret_cast<void *>(graphicsAllocation->getGpuAddress()), writeData, bufferSize);
auto tagGpuAddress = reinterpret_cast<void *>(cmdQ.timestampPacketContainer->peekNodes().at(0)->getGpuAddress());
constexpr auto timestampPacketTypeSize = sizeof(typename FamilyType::TimestampPacketType);
if constexpr (timestampPacketTypeSize == 4u) {
// 32-bit packet fields: imm-mode writes produce {1, 1, 2, 2}.
typename FamilyType::TimestampPacketType expectedTimestampValues[4] = {1, 1, 2, 2};
expectMemory<FamilyType>(tagGpuAddress, expectedTimestampValues, 4 * timestampPacketTypeSize);
} else {
// 64-bit packet fields: the same bytes reinterpret as {1, 1, 0x2'0000'0002, 1}.
typename FamilyType::TimestampPacketType expectedTimestampValues[4] = {1, 1, 0x2'0000'0002u, 1};
expectMemory<FamilyType>(tagGpuAddress, expectedTimestampValues, 4 * timestampPacketTypeSize);
}
}
// Two batched write enqueues on one buffer: the second enqueue depends on the
// first; after the blocking second write, the buffer must hold the second
// pattern and all four end timestamps (context + global, for both packet
// nodes) must have been overwritten by the GPU (initial value is 1).
// Fix: writePattern2 was filled with 1 (same as writePattern1, copy-paste),
// which made the final data check unable to prove the second write landed;
// it is now filled with 2 so the two patterns are distinguishable.
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenTwoBatchedEnqueuesWhenDependencyIsResolvedThenDecrementCounterOnGpu) {
    MockContext context(pCmdQ->getDevice().getSpecializedDevice<ClDevice>());
    pCommandStreamReceiver->overrideDispatchPolicy(DispatchMode::BatchedDispatch);
    const size_t bufferSize = 1024;
    auto retVal = CL_SUCCESS;
    uint8_t initialMemory[bufferSize] = {};
    uint8_t writePattern1[bufferSize];
    uint8_t writePattern2[bufferSize];
    std::fill(writePattern1, writePattern1 + sizeof(writePattern1), 1);
    std::fill(writePattern2, writePattern2 + sizeof(writePattern2), 2); // distinct from writePattern1
    auto buffer = std::unique_ptr<Buffer>(Buffer::create(&context, CL_MEM_COPY_HOST_PTR, bufferSize, initialMemory, retVal));
    //make sure that GPU copy is used
    buffer->forceDisallowCPUCopy = true;
    cl_event outEvent1, outEvent2;
    pCmdQ->enqueueWriteBuffer(buffer.get(), CL_FALSE, 0, bufferSize, writePattern1, nullptr, 0, nullptr, &outEvent1);
    auto node1 = castToObject<Event>(outEvent1)->getTimestampPacketNodes()->peekNodes().at(0);
    node1->getBaseGraphicsAllocation()->getDefaultGraphicsAllocation()->setAubWritable(true, 0xffffffff); // allow to write again after Buffer::create
    pCmdQ->enqueueWriteBuffer(buffer.get(), CL_TRUE, 0, bufferSize, writePattern2, nullptr, 0, nullptr, &outEvent2);
    auto node2 = castToObject<Event>(outEvent2)->getTimestampPacketNodes()->peekNodes().at(0);
    // Blocking second write completed: buffer must now hold writePattern2.
    expectMemory<FamilyType>(reinterpret_cast<void *>(buffer->getGraphicsAllocation(pClDevice->getRootDeviceIndex())->getGpuAddress()), writePattern2, bufferSize);
    // All four end timestamps must differ from the initial value 1.
    typename FamilyType::TimestampPacketType expectedEndTimestamp = 1;
    auto endTimestampAddress1 = TimestampPacketHelper::getContextEndGpuAddress(*node1);
    auto endTimestampAddress2 = TimestampPacketHelper::getGlobalEndGpuAddress(*node1);
    auto endTimestampAddress3 = TimestampPacketHelper::getContextEndGpuAddress(*node2);
    auto endTimestampAddress4 = TimestampPacketHelper::getGlobalEndGpuAddress(*node2);
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress1), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress2), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress3), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endTimestampAddress4), &expectedEndTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    clReleaseEvent(outEvent1);
    clReleaseEvent(outEvent2);
}
// A write whose offset/size is not walker-aligned is split into two walkers;
// both timestamp packet nodes must have their context and global end
// timestamps overwritten (i.e. no longer equal to the initial value 1).
HWCMDTEST_F(IGFX_XE_HP_CORE, PostSyncWriteXeHPTests, givenMultipleWalkersWhenEnqueueingThenWriteAllTimestamps) {
    MockContext context(pCmdQ->getDevice().getSpecializedDevice<ClDevice>());
    const size_t bufferSize = 70;
    const size_t writeSize = bufferSize - 2;
    uint8_t writeData[writeSize] = {};
    cl_int retVal = CL_SUCCESS;
    cl_event outEvent;

    auto buffer = std::unique_ptr<Buffer>(Buffer::create(&context, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
    buffer->forceDisallowCPUCopy = true; // make sure the GPU copy path runs

    // Offset 1 with a 68-byte write forces the split into two walkers.
    pCmdQ->enqueueWriteBuffer(buffer.get(), CL_TRUE, 1, writeSize, writeData, nullptr, 0, nullptr, &outEvent);

    auto &timestampNodes = castToObject<Event>(outEvent)->getTimestampPacketNodes()->peekNodes();
    EXPECT_EQ(2u, timestampNodes.size());

    typename FamilyType::TimestampPacketType initialTimestamp = 1;
    const uint64_t endAddresses[] = {
        TimestampPacketHelper::getContextEndGpuAddress(*timestampNodes[0]),
        TimestampPacketHelper::getGlobalEndGpuAddress(*timestampNodes[0]),
        TimestampPacketHelper::getContextEndGpuAddress(*timestampNodes[1]),
        TimestampPacketHelper::getGlobalEndGpuAddress(*timestampNodes[1])};
    for (auto endAddress : endAddresses) {
        expectMemoryNotEqual<FamilyType>(reinterpret_cast<void *>(endAddress), &initialTimestamp, sizeof(typename FamilyType::TimestampPacketType));
    }
    clReleaseEvent(outEvent);
}

View File

@@ -0,0 +1,327 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/array_count.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_scratch_space_controller_xehp_and_later.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/test/unit_test/aub_tests/command_stream/aub_command_stream_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/fixtures/buffer_fixture.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/fixtures/hello_world_kernel_fixture.h"
#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h"
#include "opencl/test/unit_test/indirect_heap/indirect_heap_fixture.h"
using namespace NEO;
// Fixture for the scratch-space-for-private-memory test: selects kernel 6,
// prepares src/dst SVM buffers and precomputes the expected output the kernel
// is supposed to produce.
struct Gen12AubScratchSpaceForPrivateFixture : public KernelAUBFixture<SimpleKernelFixture> {
void SetUp() override {
debugRestorer = std::make_unique<DebugManagerStateRestore>();
// Enable kernel index 6 in the fixture's kernel mask before base SetUp builds it.
kernelIdx = 6;
kernelIds |= (1 << kernelIdx);
KernelAUBFixture<SimpleKernelFixture>::SetUp();
arraySize = 32;
vectorSize = 2;
typeSize = sizeof(uint32_t);
gwsSize = arraySize;
lwsSize = 32;
maxIterations1 = static_cast<uint32_t>(arraySize);
maxIterations2 = static_cast<uint32_t>(arraySize);
scalar = 0x4;
expectedMemorySize = arraySize * vectorSize * typeSize;
// Source buffer: alternating valEven/valOdd per element (even/odd index).
srcBuffer = alignedMalloc(expectedMemorySize, 0x1000);
ASSERT_NE(nullptr, srcBuffer);
auto srcBufferUint = static_cast<uint32_t *>(srcBuffer);
uint32_t valOdd = 0x1;
uint32_t valEven = 0x3;
for (uint32_t i = 0; i < arraySize * vectorSize; ++i) {
if (i % 2) {
srcBufferUint[i] = valOdd;
} else {
srcBufferUint[i] = valEven;
}
}
// Precompute the sums the kernel is expected to produce; the odd lanes
// presumably also accumulate the scalar argument — see the
// simple_spill_fill kernel source for the exact formula.
uint32_t sumOdd = 0;
uint32_t sumEven = 0;
for (uint32_t i = 0; i < arraySize; ++i) {
sumOdd += ((i + scalar) + valOdd);
sumEven += (i + valEven);
}
dstBuffer = alignedMalloc(expectedMemorySize, 0x1000);
ASSERT_NE(nullptr, dstBuffer);
memset(dstBuffer, 0, expectedMemorySize);
// Expected output: sumOdd in odd lanes, sumEven in even lanes.
expectedMemory = alignedMalloc(expectedMemorySize, 0x1000);
ASSERT_NE(nullptr, expectedMemory);
auto expectedMemoryUint = static_cast<uint32_t *>(expectedMemory);
for (uint32_t i = 0; i < arraySize * vectorSize; ++i) {
if (i % 2) {
expectedMemoryUint[i] = sumOdd;
} else {
expectedMemoryUint[i] = sumEven;
}
}
// Bind SVM pointers and scalar arguments to the kernel under test.
kernels[kernelIdx]->setArgSvm(0, expectedMemorySize, dstBuffer, nullptr, 0u);
dstAllocation = createHostPtrAllocationFromSvmPtr(dstBuffer, expectedMemorySize);
kernels[kernelIdx]->setArgSvm(1, expectedMemorySize, srcBuffer, nullptr, 0u);
srcAllocation = createHostPtrAllocationFromSvmPtr(srcBuffer, expectedMemorySize);
kernels[kernelIdx]->setArg(2, sizeof(uint32_t), &scalar);
kernels[kernelIdx]->setArg(3, sizeof(uint32_t), &maxIterations1);
kernels[kernelIdx]->setArg(4, sizeof(uint32_t), &maxIterations2);
}
void TearDown() override {
// Flush before freeing the host allocations the queue may still reference.
pCmdQ->flush();
if (expectedMemory) {
alignedFree(expectedMemory);
expectedMemory = nullptr;
}
if (srcBuffer) {
alignedFree(srcBuffer);
srcBuffer = nullptr;
}
if (dstBuffer) {
alignedFree(dstBuffer);
dstBuffer = nullptr;
}
KernelAUBFixture<SimpleKernelFixture>::TearDown();
}
std::unique_ptr<DebugManagerStateRestore> debugRestorer;
size_t arraySize;   // elements per vector lane
size_t vectorSize;  // lanes per element (uint2)
size_t typeSize;    // bytes per scalar element
size_t gwsSize;
size_t lwsSize;
uint32_t kernelIdx; // index of the kernel under test in the fixture's kernel set
void *expectedMemory = nullptr; // host-side golden output
size_t expectedMemorySize = 0;
void *srcBuffer = nullptr; // SVM input
void *dstBuffer = nullptr; // SVM output
GraphicsAllocation *srcAllocation;
GraphicsAllocation *dstAllocation;
uint32_t scalar;
uint32_t maxIterations1;
uint32_t maxIterations2;
};
using Gen12AubScratchSpaceForPrivateTest = Test<Gen12AubScratchSpaceForPrivateFixture>;

// Dispatches the private-scratch kernel over a 1D NDRange and verifies the
// AUB-captured destination buffer against the precomputed host expectation.
HWCMDTEST_F(IGFX_XE_HP_CORE, Gen12AubScratchSpaceForPrivateTest, WhenKernelUsesScratchSpaceForPrivateThenExpectCorrectResults) {
    const cl_uint dimensions = 1;
    const size_t offset[3] = {0, 0, 0};
    const size_t gws[3] = {gwsSize, 1, 1};
    const size_t lws[3] = {lwsSize, 1, 1};

    // No wait list and no output event: submission order alone is sufficient here.
    auto status = pCmdQ->enqueueKernel(kernels[kernelIdx].get(),
                                       dimensions,
                                       offset,
                                       gws,
                                       lws,
                                       0,
                                       nullptr,
                                       nullptr);
    ASSERT_EQ(CL_SUCCESS, status);

    pCmdQ->flush();
    expectMemory<FamilyType>(dstBuffer, expectedMemory, expectedMemorySize);
}
// Builds the "simple_spill_fill_kernel" program and creates its "spill_test"
// kernel; used by the spill/fill scratch-space tests below.
class DefaultGrfKernelFixture : public ProgramFixture {
  public:
    using ProgramFixture::SetUp;

  protected:
    void SetUp(ClDevice *device, Context *context) {
        ProgramFixture::SetUp();
        std::string programName("simple_spill_fill_kernel");
        CreateProgramFromBinary(
            context,
            context->getDevices(),
            programName);
        ASSERT_NE(nullptr, pProgram);
        retVal = pProgram->build(
            pProgram->getDevices(),
            nullptr,
            false);
        ASSERT_EQ(CL_SUCCESS, retVal);
        // Create the kernel under test; a creation failure lands in retVal.
        kernel.reset(Kernel::create<MockKernel>(
            pProgram,
            pProgram->getKernelInfoForKernel("spill_test"),
            *device,
            &retVal));
    }

    void TearDown() override {
        // Release the kernel before the program is torn down by the base fixture.
        if (kernel) {
            kernel.reset(nullptr);
        }
        ProgramFixture::TearDown();
    }

    cl_int retVal = CL_SUCCESS;
    std::unique_ptr<Kernel> kernel; // the "spill_test" kernel instance
};
// Fixture for the spill/fill scratch-space AUB test: allocates in/out/offset
// SVM buffers for the "spill_test" kernel and precomputes the expected output.
struct Gen12AubScratchSpaceForSpillFillFixture : public KernelAUBFixture<DefaultGrfKernelFixture> {
    void SetUp() override {
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        KernelAUBFixture<DefaultGrfKernelFixture>::SetUp();
        // 1D dispatch geometry: one work-item per array element, one work-group.
        arraySize = 32;
        typeSize = sizeof(cl_int);
        gwsSize = arraySize;
        lwsSize = 32;
        // Buffer sizes match the kernel's access pattern -- TODO confirm the
        // "(2n + 1) * 4 - 4" / "128 * n" shapes against the CL source.
        expectedMemorySize = (arraySize * 2 + 1) * typeSize - 4;
        inMemorySize = expectedMemorySize;
        outMemorySize = expectedMemorySize;
        offsetMemorySize = 128 * arraySize;
        srcBuffer = alignedMalloc(inMemorySize, 0x1000);
        ASSERT_NE(nullptr, srcBuffer);
        memset(srcBuffer, 0, inMemorySize);
        outBuffer = alignedMalloc(outMemorySize, 0x1000);
        ASSERT_NE(nullptr, outBuffer);
        memset(outBuffer, 0, outMemorySize);
        expectedMemory = alignedMalloc(expectedMemorySize, 0x1000);
        ASSERT_NE(nullptr, expectedMemory);
        memset(expectedMemory, 0, expectedMemorySize);
        offsetBuffer = alignedMalloc(offsetMemorySize, 0x1000);
        // Fixed: previously re-asserted expectedMemory here (copy-paste), so a
        // failed offsetBuffer allocation would crash in the memset below.
        ASSERT_NE(nullptr, offsetBuffer);
        memset(offsetBuffer, 0, offsetMemorySize);
        // Input: every element is 2; expected output alternates two constants
        // per element pair -- values mirror the kernel's arithmetic, TODO confirm.
        auto srcBufferInt = static_cast<cl_int *>(srcBuffer);
        auto expectedMemoryInt = static_cast<cl_int *>(expectedMemory);
        const int expectedVal1 = 16256;
        const int expectedVal2 = 512;
        for (uint32_t i = 0; i < arraySize; ++i) {
            srcBufferInt[i] = 2;
            expectedMemoryInt[i * 2] = expectedVal1;
            expectedMemoryInt[i * 2 + 1] = expectedVal2;
        }
        // Sanity: the kernel must actually require scratch (spill/fill) and be
        // compiled for 128 GRFs, otherwise this test exercises nothing.
        auto &kernelInfo = kernel->getKernelInfo();
        EXPECT_NE(0u, kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0]);
        EXPECT_EQ(128u, kernelInfo.kernelDescriptor.kernelAttributes.numGrfRequired);
        // Bind SVM arguments; explicit host-ptr allocations let the AUB stream
        // resolve and dump the buffers.
        kernel->setArgSvm(0, inMemorySize, srcBuffer, nullptr, 0u);
        inAllocation = createHostPtrAllocationFromSvmPtr(srcBuffer, inMemorySize);
        kernel->setArgSvm(1, outMemorySize, outBuffer, nullptr, 0u);
        outAllocation = createHostPtrAllocationFromSvmPtr(outBuffer, outMemorySize);
        kernel->setArgSvm(2, offsetMemorySize, offsetBuffer, nullptr, 0u);
        offsetAllocation = createHostPtrAllocationFromSvmPtr(offsetBuffer, offsetMemorySize);
    }
    void TearDown() override {
        // Flush any outstanding GPU work before freeing the host allocations.
        pCmdQ->flush();
        if (expectedMemory) {
            alignedFree(expectedMemory);
            expectedMemory = nullptr;
        }
        if (srcBuffer) {
            alignedFree(srcBuffer);
            srcBuffer = nullptr;
        }
        if (outBuffer) {
            alignedFree(outBuffer);
            outBuffer = nullptr;
        }
        if (offsetBuffer) {
            alignedFree(offsetBuffer);
            offsetBuffer = nullptr;
        }
        KernelAUBFixture<DefaultGrfKernelFixture>::TearDown();
    }
    std::unique_ptr<DebugManagerStateRestore> debugRestorer;
    size_t arraySize;  // number of work-items
    size_t vectorSize; // unused by this fixture; kept for layout parity with the private-scratch fixture
    size_t typeSize;   // element size in bytes
    size_t gwsSize;    // 1D global work size
    size_t lwsSize;    // 1D local work size
    void *expectedMemory = nullptr; // host-side reference output
    size_t expectedMemorySize = 0;
    size_t inMemorySize = 0;
    size_t outMemorySize = 0;
    size_t offsetMemorySize = 0;
    void *srcBuffer = nullptr;
    void *outBuffer = nullptr;
    void *offsetBuffer = nullptr;
    GraphicsAllocation *inAllocation;
    GraphicsAllocation *outAllocation;
    GraphicsAllocation *offsetAllocation;
};
using Gen12AubScratchSpaceForSpillFillTest = Test<Gen12AubScratchSpaceForSpillFillFixture>;

// Runs the spill/fill kernel with surface-state scratch space enabled and
// validates the AUB-captured output buffer against the host expectation.
HWCMDTEST_F(IGFX_XE_HP_CORE, Gen12AubScratchSpaceForSpillFillTest, givenSurfaceStateScratchSpaceEnabledWhenKernelUsesScratchForSpillFillThenExpectCorrectResults) {
    const cl_uint dimensions = 1;
    const size_t offset[3] = {0, 0, 0};
    const size_t gws[3] = {gwsSize, 1, 1};
    const size_t lws[3] = {lwsSize, 1, 1};

    // No wait list and no output event: submission order alone is sufficient here.
    auto status = pCmdQ->enqueueKernel(kernel.get(),
                                       dimensions,
                                       offset,
                                       gws,
                                       lws,
                                       0,
                                       nullptr,
                                       nullptr);
    ASSERT_EQ(CL_SUCCESS, status);

    // Full finish (not just flush) before checking the output memory.
    pCmdQ->finish();
    expectMemory<FamilyType>(outBuffer, expectedMemory, expectedMemorySize);
}

View File

@@ -0,0 +1,325 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/gmm_helper/gmm.h"
#include "shared/source/gmm_helper/resource_info.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/mocks/mock_allocation_properties.h"
#include "shared/test/common/test_macros/test.h"
#include "shared/test/common/test_macros/test_checks_shared.h"
#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/helpers/cl_memory_properties_helpers.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/platform/platform.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "test_traits_common.h"
using namespace NEO;
// Engine-parameterized fixture for compression AUB tests. useLocalMemory
// selects whether allocations are placed in device-local or system memory;
// the GTest parameter selects the engine (e.g. RCS/CCS) used for submission.
template <bool useLocalMemory = true>
struct CompressionXeHPAndLater : public AUBFixture,
                                 public ::testing::Test,
                                 public ::testing::WithParamInterface<uint32_t /*EngineType*/> {
    void SetUp() override {
        REQUIRE_64BIT_OR_SKIP();
        debugRestorer = std::make_unique<DebugManagerStateRestore>();
        DebugManager.flags.RenderCompressedBuffersEnabled.set(true);
        DebugManager.flags.RenderCompressedImagesEnabled.set(true);
        DebugManager.flags.EnableLocalMemory.set(useLocalMemory);
        // NodeOrdinal routes submissions to the parameterized engine.
        DebugManager.flags.NodeOrdinal.set(GetParam());
        auto &hwHelper = HwHelper::get(defaultHwInfo->platform.eRenderCoreFamily);
        auto expectedEngine = static_cast<aub_stream::EngineType>(GetParam());
        // Skip if the requested engine is not exposed by this device config.
        bool engineSupported = false;
        for (auto &engine : hwHelper.getGpgpuEngineInstances(*defaultHwInfo)) {
            if (engine.first == expectedEngine) {
                engineSupported = true;
                break;
            }
        }
        if (!engineSupported) {
            GTEST_SKIP();
        }
        AUBFixture::SetUp(defaultHwInfo.get());
        // Flat physical CCS is required for these compression tests; local-memory
        // variants additionally require the local-memory feature.
        auto &ftrTable = device->getHardwareInfo().featureTable;
        if ((!ftrTable.flags.ftrFlatPhysCCS) ||
            (!ftrTable.flags.ftrLocalMemory && useLocalMemory)) {
            GTEST_SKIP();
        }
        context->contextType = ContextType::CONTEXT_TYPE_SPECIALIZED;
    }
    void TearDown() override {
        AUBFixture::TearDown();
    }
    std::unique_ptr<DebugManagerStateRestore> debugRestorer;
    cl_int retVal = CL_SUCCESS;
    // Test bodies are member templates so the HWTEST2_P stubs below can
    // forward FamilyType from the macro into the implementation.
    template <typename FamilyType>
    void givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect();
    template <typename FamilyType>
    void givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect();
    template <typename FamilyType>
    void givenCompressedImageWhenReadingThenResultsAreCorrect();
};
// Writes a known pattern into a compressed buffer, copies it into an
// uncompressed one, then checks that the compressed backing store does NOT
// match the raw pattern (i.e. it is actually compressed) while the
// uncompressed copy does.
template <bool testLocalMemory>
template <typename FamilyType>
void CompressionXeHPAndLater<testLocalMemory>::givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect() {
    constexpr size_t bufferSize = 2048;
    uint8_t referencePattern[bufferSize];
    std::fill(referencePattern, referencePattern + sizeof(referencePattern), 1);

    device->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);

    std::unique_ptr<Buffer> compressedBuffer(Buffer::create(context, CL_MEM_READ_WRITE | CL_MEM_COMPRESSED_HINT_INTEL, bufferSize, nullptr, retVal));
    auto compressedAllocation = compressedBuffer->getGraphicsAllocation(device->getRootDeviceIndex());
    memset(compressedAllocation->getUnderlyingBuffer(), 0, bufferSize);

    // Sanity: the allocation must really be compression-enabled and placed in
    // the memory pool selected by the fixture's template parameter.
    EXPECT_NE(nullptr, compressedAllocation->getDefaultGmm()->gmmResourceInfo->peekHandle());
    EXPECT_TRUE(compressedAllocation->getDefaultGmm()->isCompressionEnabled);
    const auto expectedPool = testLocalMemory ? MemoryPool::LocalMemory : MemoryPool::System4KBPages;
    EXPECT_EQ(expectedPool, compressedAllocation->getMemoryPool());

    std::unique_ptr<Buffer> notCompressedBuffer(Buffer::create(context, CL_MEM_READ_WRITE, bufferSize, nullptr, retVal));
    auto plainAllocation = notCompressedBuffer->getGraphicsAllocation(device->getRootDeviceIndex());
    plainAllocation->setAllocationType(GraphicsAllocation::AllocationType::BUFFER);
    if (auto gmm = plainAllocation->getDefaultGmm()) {
        gmm->isCompressionEnabled = false;
    }
    memset(plainAllocation->getUnderlyingBuffer(), 0, bufferSize);

    pCmdQ->enqueueWriteBuffer(compressedBuffer.get(), CL_FALSE, 0, bufferSize, referencePattern, nullptr, 0, nullptr, nullptr);
    pCmdQ->enqueueCopyBuffer(compressedBuffer.get(), notCompressedBuffer.get(), 0, 0, bufferSize, 0, nullptr, nullptr);
    pCmdQ->finish();

    // Compressed storage must differ from the raw pattern; the plain copy must match it.
    expectNotEqualMemory<FamilyType>(AUBFixture::getGpuPointer(compressedAllocation),
                                     referencePattern, bufferSize);
    expectMemory<FamilyType>(AUBFixture::getGpuPointer(plainAllocation),
                             referencePattern, bufferSize);
}
// Creates a 2D image on top of a compressed buffer, reads it back, and checks
// that the read data matches while the buffer's backing store stays compressed.
template <bool testLocalMemory>
template <typename FamilyType>
void CompressionXeHPAndLater<testLocalMemory>::givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect() {
    const size_t imageWidth = 16;
    const size_t imageHeight = 16;
    const size_t bufferSize = 64 * KB;
    uint8_t writePattern[bufferSize];
    std::fill(writePattern, writePattern + sizeof(writePattern), 1);
    device->getGpgpuCommandStreamReceiver().overrideDispatchPolicy(DispatchMode::BatchedDispatch);
    auto compressedBuffer = std::unique_ptr<Buffer>(Buffer::create(context, CL_MEM_COPY_HOST_PTR | CL_MEM_COMPRESSED_HINT_INTEL, bufferSize, writePattern, retVal));
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Now create the image2D on top of the compressed buffer.
    cl_image_desc imageDescriptor = {};
    imageDescriptor.mem_object = compressedBuffer.get();
    // NOTE(review): height is assigned from imageWidth and width from
    // imageHeight; harmless while both are 16, but the assignments look
    // swapped -- confirm intent before changing either dimension.
    imageDescriptor.image_height = imageWidth;
    imageDescriptor.image_width = imageHeight;
    imageDescriptor.image_type = CL_MEM_OBJECT_IMAGE2D;
    cl_image_format imageFormat = {};
    imageFormat.image_channel_data_type = CL_UNSIGNED_INT32;
    imageFormat.image_channel_order = CL_RGBA;
    auto clCompressedImage = clCreateImage(context, CL_MEM_READ_WRITE, &imageFormat, &imageDescriptor, nullptr, &retVal);
    auto compressedImage = castToObject<Image>(clCompressedImage);
    EXPECT_EQ(CL_SUCCESS, retVal);
    const size_t perChannelDataSize = sizeof(cl_uint);
    const size_t numChannels = 4;
    const auto imageSize = imageWidth * imageHeight * perChannelDataSize * numChannels;
    cl_uint destMemory[imageSize / sizeof(cl_uint)] = {0};
    const size_t origin[] = {0, 0, 0};
    const size_t region[] = {imageWidth, imageHeight, 1};
    // Read the whole image back (non-blocking; flushed below).
    retVal = pCmdQ->enqueueReadImage(
        compressedImage,
        CL_FALSE,
        origin,
        region,
        0,
        0,
        destMemory,
        nullptr,
        0,
        nullptr,
        nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);
    retVal = pCmdQ->flush();
    EXPECT_EQ(CL_SUCCESS, retVal);
    expectMemory<FamilyType>(destMemory, writePattern, imageSize);
    // Make sure our objects are in fact compressed.
    auto graphicsAllocation = compressedBuffer->getGraphicsAllocation(device->getRootDeviceIndex());
    EXPECT_NE(nullptr, graphicsAllocation->getDefaultGmm());
    EXPECT_TRUE(graphicsAllocation->getDefaultGmm()->isCompressionEnabled);
    EXPECT_TRUE(compressedImage->getGraphicsAllocation(device->getRootDeviceIndex())->getDefaultGmm()->isCompressionEnabled);
    // The raw backing store must differ from the pattern, proving compression happened.
    expectNotEqualMemory<FamilyType>(reinterpret_cast<void *>(graphicsAllocation->getGpuAddress()), writePattern, bufferSize);
    clReleaseMemObject(clCompressedImage);
}
// Creates a compressed host-ptr image, reads it back into a temporary
// allocation, and verifies the read data matches the source while the image's
// backing store stays compressed.
template <bool testLocalMemory>
template <typename FamilyType>
void CompressionXeHPAndLater<testLocalMemory>::givenCompressedImageWhenReadingThenResultsAreCorrect() {
    const size_t imageWidth = 8;
    const size_t imageHeight = 4;
    const size_t perChannelDataSize = sizeof(cl_float);
    const size_t numChannels = 4;
    const auto imageSize = imageWidth * imageHeight * perChannelDataSize * numChannels;
    const auto rowSize = imageSize / imageHeight;
    cl_float srcMemory[imageSize / sizeof(cl_float)] = {0};
    const cl_float row[rowSize] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                                   1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                                   1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
                                   1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f};
    // NOTE(review): each memcpy copies rowSize bytes (imageWidth * 4 floats)
    // but `pixel` advances by only imageWidth floats, so consecutive copies
    // overlap; the 8-float repeating pattern makes the overlapping writes
    // identical and the tail of srcMemory stays zero -- confirm this is the
    // intended fill rather than `pixel += imageWidth * numChannels`.
    cl_float *pixel = srcMemory;
    for (uint32_t height = 0; height < imageHeight; height++) {
        memcpy(pixel, row, rowSize);
        pixel += imageWidth;
    }
    cl_float destMemory[imageSize / sizeof(cl_float)] = {0};
    cl_image_format imageFormat;
    cl_image_desc imageDesc;
    imageFormat.image_channel_data_type = CL_FLOAT;
    imageFormat.image_channel_order = CL_RGBA;
    imageDesc.image_type = CL_MEM_OBJECT_IMAGE2D;
    imageDesc.image_width = imageWidth;
    imageDesc.image_height = imageHeight;
    imageDesc.image_depth = 1;
    imageDesc.image_array_size = 1;
    imageDesc.image_row_pitch = 0;
    imageDesc.image_slice_pitch = 0;
    imageDesc.num_mip_levels = 0;
    imageDesc.num_samples = 0;
    imageDesc.mem_object = NULL;
    // Pre-create a host-ptr allocation for destMemory and hand it to the CSR's
    // temporary-allocation storage so its GPU address can be located below.
    auto allocation = csr->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{csr->getRootDeviceIndex(), false, imageSize}, destMemory);
    csr->makeResidentHostPtrAllocation(allocation);
    csr->getInternalAllocationStorage()->storeAllocation(std::unique_ptr<GraphicsAllocation>(allocation), TEMPORARY_ALLOCATION);
    cl_mem_flags flags = CL_MEM_USE_HOST_PTR;
    auto surfaceFormat = Image::getSurfaceFormatFromTable(flags, &imageFormat, context->getDevice(0)->getHardwareInfo().capabilityTable.supportsOcl21Features);
    auto retVal = CL_INVALID_VALUE;
    std::unique_ptr<Image> srcImage(Image::create(
        context,
        ClMemoryPropertiesHelper::createMemoryProperties(flags, 0, 0, &context->getDevice(0)->getDevice()),
        flags,
        0,
        surfaceFormat,
        &imageDesc,
        srcMemory,
        retVal));
    ASSERT_NE(nullptr, srcImage);
    cl_bool blockingRead = CL_FALSE;
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;
    const size_t origin[] = {0, 0, 0};
    const size_t region[] = {imageWidth, imageHeight, 1};
    retVal = pCmdQ->enqueueReadImage(
        srcImage.get(),
        blockingRead,
        origin,
        region,
        0,
        0,
        destMemory,
        nullptr,
        numEventsInWaitList,
        eventWaitList,
        event);
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Find the temporary allocation backing destMemory to obtain its GPU address.
    allocation = csr->getTemporaryAllocations().peekHead();
    while (allocation && allocation->getUnderlyingBuffer() != destMemory) {
        allocation = allocation->next;
    }
    auto pDestGpuAddress = reinterpret_cast<void *>(allocation->getGpuAddress());
    pCmdQ->flush();
    EXPECT_EQ(CL_SUCCESS, retVal);
    // Read-back must match the source; the image's backing store must not
    // (proving it is stored compressed).
    expectMemory<FamilyType>(pDestGpuAddress, srcMemory, imageSize);
    expectNotEqualMemory<FamilyType>(AUBFixture::getGpuPointer(srcImage->getGraphicsAllocation(rootDeviceIndex)), srcMemory, imageSize);
}
// HWTEST2 matcher: enables a test only on XE_HP-and-later products whose
// TestTraits report local-memory compression support in AUB captures.
struct CompressionLocalAubsSupportedMatcher {
    template <PRODUCT_FAMILY productFamily>
    static constexpr bool isMatched() {
        if constexpr (!HwMapper<productFamily>::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) {
            return false;
        } else {
            // Trait lookup only compiles under the guard above.
            return TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::localMemCompressionAubsSupported;
        }
    }
};
// HWTEST2 matcher: enables a test only on XE_HP-and-later products whose
// TestTraits report system-memory compression support in AUB captures.
struct CompressionSystemAubsSupportedMatcher {
    template <PRODUCT_FAMILY productFamily>
    static constexpr bool isMatched() {
        if constexpr (!HwMapper<productFamily>::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) {
            return false;
        } else {
            // Trait lookup only compiles under the guard above.
            return TestTraits<NEO::ToGfxCoreFamily<productFamily>::get()>::systemMemCompressionAubsSupported;
        }
    }
};
// Local-memory variants: each HWTEST2_P stub forwards FamilyType into the
// fixture's member-template test body.
using CompressionLocalXeHPAndLater = CompressionXeHPAndLater<true>;
HWTEST2_P(CompressionLocalXeHPAndLater, givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect, CompressionLocalAubsSupportedMatcher) {
    givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect<FamilyType>();
}
HWTEST2_P(CompressionLocalXeHPAndLater, givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect, CompressionLocalAubsSupportedMatcher) {
    givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect<FamilyType>();
}
HWTEST2_P(CompressionLocalXeHPAndLater, givenCompressedImageWhenReadingThenResultsAreCorrect, CompressionLocalAubsSupportedMatcher) {
    givenCompressedImageWhenReadingThenResultsAreCorrect<FamilyType>();
}
// Run each test on both the render (RCS) and compute (CCS) engines.
INSTANTIATE_TEST_CASE_P(,
                        CompressionLocalXeHPAndLater,
                        ::testing::Values(aub_stream::ENGINE_RCS,
                                          aub_stream::ENGINE_CCS));
// System-memory variants. The GENERATEONLY_ prefix presumably marks tests that
// only generate AUB output without simulator validation -- confirm against the
// test-runner conventions.
using CompressionSystemXeHPAndLater = CompressionXeHPAndLater<false>;
HWTEST2_P(CompressionSystemXeHPAndLater, GENERATEONLY_givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect, CompressionSystemAubsSupportedMatcher) {
    givenCompressedBuffersWhenWritingAndCopyingThenResultsAreCorrect<FamilyType>();
}
HWTEST2_P(CompressionSystemXeHPAndLater, GENERATEONLY_givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect, CompressionSystemAubsSupportedMatcher) {
    givenCompressedImage2DFromBufferWhenItIsUsedThenDataIsCorrect<FamilyType>();
}
HWTEST2_P(CompressionSystemXeHPAndLater, givenCompressedImageWhenReadingThenResultsAreCorrect, CompressionSystemAubsSupportedMatcher) {
    givenCompressedImageWhenReadingThenResultsAreCorrect<FamilyType>();
}
// Run each test on both the render (RCS) and compute (CCS) engines.
INSTANTIATE_TEST_CASE_P(,
                        CompressionSystemXeHPAndLater,
                        ::testing::Values(aub_stream::ENGINE_RCS,
                                          aub_stream::ENGINE_CCS));

View File

@@ -0,0 +1,78 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/source/helpers/constants.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/test_macros/test.h"
#include "opencl/extensions/public/cl_ext_private.h"
#include "opencl/source/helpers/cl_memory_properties_helpers.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h"
#include "opencl/test/unit_test/aub_tests/fixtures/multicontext_aub_fixture.h"
#include <array>
// Two-tile fixture (single command streamer per tile) for cross-tile buffer
// copy tests.
struct MultiTileBuffersXeHPAndLater : public MulticontextAubFixture, public ::testing::Test {
    static constexpr uint32_t numTiles = 2;
    void SetUp() override {
        MulticontextAubFixture::SetUp(numTiles, EnabledCommandStreamers::Single, false);
    }
    void TearDown() override {
        MulticontextAubFixture::TearDown();
    }
};
// Allocates one buffer per tile (by overriding the memory-properties device),
// copies between them, and validates both initial contents and the copy result.
HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileBuffersXeHPAndLater, givenTwoBuffersAllocatedOnDifferentTilesWhenCopiedThenDataValidates) {
    if constexpr (is64bit) {
        constexpr size_t bufferSize = 64 * 1024u;
        char bufferTile0Memory[bufferSize] = {};
        char bufferTile1Memory[bufferSize] = {};
        // Both host buffers hold the same repeating 0..254 pattern.
        for (auto index = 0u; index < bufferSize; index++) {
            bufferTile0Memory[index] = index % 255;
            bufferTile1Memory[index] = index % 255;
        }
        auto retVal = CL_INVALID_VALUE;
        cl_mem_flags flags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
        // Overriding pDevice steers each allocation to a specific sub-device;
        // indices 1 and 2 are presumably the two tile sub-devices -- confirm
        // against the multicontext fixture's device layout.
        MemoryProperties memoryProperties =
            ClMemoryPropertiesHelper::createMemoryProperties(flags, 0, 0, &context->getDevice(0)->getDevice());
        memoryProperties.pDevice = &context->getDevice(1)->getDevice();
        auto srcBuffer = std::unique_ptr<Buffer>(Buffer::create(context.get(), memoryProperties, flags, 0, bufferSize, bufferTile0Memory, retVal));
        ASSERT_NE(nullptr, srcBuffer);
        flags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
        memoryProperties.pDevice = &context->getDevice(2)->getDevice();
        auto dstBuffer = std::unique_ptr<Buffer>(Buffer::create(context.get(), memoryProperties, flags, 0, bufferSize, bufferTile1Memory, retVal));
        ASSERT_NE(nullptr, dstBuffer);
        auto cmdQ = commandQueues[0][0].get();
        // Both buffers must hold their host-initialized contents before the copy.
        expectMemory<FamilyType>(AUBFixture::getGpuPointer(srcBuffer->getGraphicsAllocation(rootDeviceIndex)), bufferTile0Memory, bufferSize, 0, 0);
        expectMemory<FamilyType>(AUBFixture::getGpuPointer(dstBuffer->getGraphicsAllocation(rootDeviceIndex)), bufferTile1Memory, bufferSize, 0, 0);
        cl_uint numEventsInWaitList = 0;
        cl_event *eventWaitList = nullptr;
        cl_event *event = nullptr;
        retVal = cmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer.get(),
                                         0, 0,
                                         bufferSize, numEventsInWaitList,
                                         eventWaitList, event);
        EXPECT_EQ(CL_SUCCESS, retVal);
        cmdQ->flush();
        // After the copy, the destination must match the source tile's data.
        expectMemory<FamilyType>(AUBFixture::getGpuPointer(dstBuffer->getGraphicsAllocation(rootDeviceIndex)), bufferTile0Memory, bufferSize, 0, 0);
    }
}

View File

@@ -0,0 +1,32 @@
/*
* Copyright (C) 2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "shared/test/common/test_macros/test.h"
// Exclusion list: the multi-tile / multicontext AUB tests below are disabled
// on IGFX_XE_HPG_CORE products (single-tile configurations).
HWTEST_EXCLUDE_PRODUCT(FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesDualContextTest, HEAVY_givenFourTilesAndDualContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithAtomicThenOutputDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(DynamicWalkerPartitionFourTilesTests, whenWalkerPartitionIsEnabledForKernelWithoutAtomicThenOutputDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesAllContextsTest, HEAVY_givenTwoTilesAndAllContextsWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesSingleContextTest, givenTwoTilesAndSingleContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesDualContextTest, givenTwoTilesAndDualContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(TwoTilesAllContextsTest, GENERATEONLY_givenTwoTilesAndAllContextsWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesSingleContextTest, givenFourTilesAndSingleContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesDualContextTest, GENERATEONLY_givenFourTilesAndDualContextWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(FourTilesAllContextsTest, GENERATEONLY_givenFourTilesAndAllContextsWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueReadBufferThenReadFromCorrectBank, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(OneVAFourPhysicalStoragesTest, givenBufferWithFourPhysicalStoragesWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(OneVAFourPhysicalStoragesTest, givenColouredBufferWhenEnqueueWriteBufferThenCorrectMemoryIsWrittenToSpecificBank, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(MultiTileBuffersXeHPAndLater, givenTwoBuffersAllocatedOnDifferentTilesWhenCopiedThenDataValidates, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(StaticWalkerPartitionFourTilesTests, givenFourTilesWhenStaticWalkerPartitionIsEnabledForKernelThenOutputDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(StaticWalkerPartitionFourTilesTests, givenPreWalkerSyncWhenStaticWalkerPartitionIsThenAtomicsAreIncrementedCorrectly, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(StaticWalkerPartitionFourTilesTests, whenNoPreWalkerSyncThenAtomicsAreIncrementedCorrectly, IGFX_XE_HPG_CORE);
// Single-tile multicontext tests are also excluded on XE_HPG.
HWTEST_EXCLUDE_PRODUCT(SingleTileAllContextsTest, HEAVY_givenSingleTileAndAllContextsWhenWritingImageThenDataIsValid, IGFX_XE_HPG_CORE);
HWTEST_EXCLUDE_PRODUCT(SingleTileAllContextsTest, GENERATEONLY_givenSingleTileAndAllContextsWhenSubmittingThenDataIsValid, IGFX_XE_HPG_CORE);