diff --git a/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt b/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt index a155938df6..d4aae77712 100644 --- a/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt +++ b/opencl/test/unit_test/aub_tests/command_stream/CMakeLists.txt @@ -1,5 +1,5 @@ # -# Copyright (C) 2018-2021 Intel Corporation +# Copyright (C) 2018-2022 Intel Corporation # # SPDX-License-Identifier: MIT # @@ -14,4 +14,17 @@ target_sources(igdrcl_aub_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/aub_mi_atomic_tests.cpp ) +if(TESTS_XEHP_AND_LATER) + target_sources(igdrcl_aub_tests PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/aub_range_based_flush_tests_xehp_and_later.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/aub_walker_partition_tests_xehp_and_later.cpp + ) +endif() + +if(TESTS_DG2_AND_LATER) + target_sources(igdrcl_aub_tests PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/mi_math_aub_tests_dg2_and_later.cpp + ) +endif() + add_subdirectories() diff --git a/opencl/test/unit_test/aub_tests/command_stream/aub_range_based_flush_tests_xehp_and_later.cpp b/opencl/test/unit_test/aub_tests/command_stream/aub_range_based_flush_tests_xehp_and_later.cpp new file mode 100644 index 0000000000..f14b658e00 --- /dev/null +++ b/opencl/test/unit_test/aub_tests/command_stream/aub_range_based_flush_tests_xehp_and_later.cpp @@ -0,0 +1,231 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/cache_flush_xehp_and_later.inl" +#include "shared/source/helpers/hw_helper.h" +#include "shared/source/helpers/timestamp_packet.h" +#include "shared/source/utilities/tag_allocator.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include "shared/test/common/helpers/dispatch_flags_helper.h" +#include "shared/test/common/mocks/mock_device.h" +#include "shared/test/common/test_macros/test.h" + +#include "opencl/source/mem_obj/buffer.h" +#include 
"opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h" +#include "opencl/test/unit_test/aub_tests/fixtures/hello_world_fixture.h" +#include "opencl/test/unit_test/helpers/cmd_buffer_validator.h" +#include "opencl/test/unit_test/mocks/mock_command_queue.h" +#include "opencl/test/unit_test/mocks/mock_context.h" + +#include "test_traits_common.h" + +using namespace NEO; + +struct RangeBasedFlushTest : public KernelAUBFixture, public ::testing::Test { + + void SetUp() override { + DebugManager.flags.PerformImplicitFlushForNewResource.set(0); + DebugManager.flags.PerformImplicitFlushForIdleGpu.set(0); + KernelAUBFixture::SetUp(); + }; + + void TearDown() override { + KernelAUBFixture::TearDown(); + } + + cl_int retVal = CL_SUCCESS; + DebugManagerStateRestore debugSettingsRestore; +}; + +struct L3ControlSupportedMatcher { + template + static constexpr bool isMatched() { + if constexpr (HwMapper::GfxProduct::supportsCmdSet(IGFX_XE_HP_CORE)) { + return TestTraits::get()>::l3ControlSupported; + } + return false; + } +}; + +HWTEST2_F(RangeBasedFlushTest, givenNoDcFlushInPipeControlWhenL3ControlFlushesCachesThenExpectFlushedCaches, L3ControlSupportedMatcher) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using WALKER = typename FamilyType::WALKER_TYPE; + using L3_CONTROL = typename FamilyType::L3_CONTROL; + using L3_FLUSH_ADDRESS_RANGE = typename FamilyType::L3_FLUSH_ADDRESS_RANGE; + + DebugManager.flags.ProgramGlobalFenceAsMiMemFenceCommandInCommandStream.set(0); + + constexpr size_t bufferSize = MemoryConstants::pageSize; + char bufferAMemory[bufferSize]; + char bufferBMemory[bufferSize]; + for (uint32_t i = 0; i < bufferSize / MemoryConstants::pageSize; ++i) { + memset(bufferAMemory + i * MemoryConstants::pageSize, 1 + i, MemoryConstants::pageSize); + memset(bufferBMemory + i * MemoryConstants::pageSize, 129 + i, MemoryConstants::pageSize); + } + + auto retVal = CL_INVALID_VALUE; + auto 
srcBuffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferAMemory, retVal)); + + ASSERT_NE(nullptr, srcBuffer); + auto dstBuffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferBMemory, retVal)); + ASSERT_NE(nullptr, dstBuffer); + + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; + + retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer.get(), + 0, 0, + bufferSize, numEventsInWaitList, + eventWaitList, event); + + EXPECT_EQ(CL_SUCCESS, retVal); + + L3RangesVec ranges; + ranges.push_back(L3Range::fromAddressSizeWithPolicy(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(), MemoryConstants::pageSize, + L3_FLUSH_ADDRESS_RANGE::L3_FLUSH_EVICTION_POLICY_FLUSH_L3_WITH_EVICTION)); + size_t requiredSize = getSizeNeededToFlushGpuCache(ranges, false) + 2 * sizeof(PIPE_CONTROL); + LinearStream &l3FlushCmdStream = pCmdQ->getCS(requiredSize); + auto offset = l3FlushCmdStream.getUsed(); + auto pcBeforeFlush = l3FlushCmdStream.getSpaceForCmd(); + *pcBeforeFlush = FamilyType::cmdInitPipeControl; + + flushGpuCache(&l3FlushCmdStream, ranges, 0U, device->getHardwareInfo()); + + auto &csr = pCmdQ->getGpgpuCommandStreamReceiver(); + auto flags = DispatchFlagsHelper::createDefaultDispatchFlags(); + flags.blocking = true; + + DebugManager.flags.DisableDcFlushInEpilogue.set(true); + csr.flushTask(l3FlushCmdStream, offset, + pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::DYNAMIC_STATE, 0), + pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::INDIRECT_OBJECT, 0), + pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::SURFACE_STATE, 0), + pCmdQ->taskLevel, + flags, + pCmdQ->getDevice()); + + std::string err; + + std::vector expectedCommands{ + new MatchAnyCmd(AnyNumber), + new MatchHwCmd(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getCommandStreamerStallEnable, true), EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, 
false)}), + new MatchHwCmd(1, Expects{EXPECT_MEMBER(L3_CONTROL, getPostSyncOperation, L3_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE)}), + }; + if (MemorySynchronizationCommands::isPipeControlWArequired(device->getHardwareInfo())) { + expectedCommands.push_back(new MatchHwCmd(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)})); + if (MemorySynchronizationCommands::getSizeForAdditonalSynchronization(device->getHardwareInfo()) > 0) { + expectedCommands.push_back(new MatchHwCmd(1, Expects{EXPECT_MEMBER(MI_SEMAPHORE_WAIT, getSemaphoreDataDword, EncodeSempahore::invalidHardwareTag)})); + } + } + expectedCommands.push_back(new MatchHwCmd(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)})); + expectedCommands.push_back(new MatchAnyCmd(AnyNumber)); + expectedCommands.push_back(new MatchHwCmd(0)); + + auto cmdBuffOk = expectCmdBuff(l3FlushCmdStream, 0, std::move(expectedCommands), &err); + EXPECT_TRUE(cmdBuffOk) << err; + + expectMemory(reinterpret_cast(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()), + bufferAMemory, bufferSize); +} + +HWTEST2_F(RangeBasedFlushTest, givenL3ControlWhenPostSyncIsSetThenExpectPostSyncWrite, L3ControlSupportedMatcher) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using WALKER = typename FamilyType::WALKER_TYPE; + using L3_CONTROL = typename FamilyType::L3_CONTROL; + using L3_FLUSH_ADDRESS_RANGE = typename FamilyType::L3_FLUSH_ADDRESS_RANGE; + + if (MemorySynchronizationCommands::isPipeControlWArequired(device->getHardwareInfo())) { + GTEST_SKIP(); + } + + constexpr size_t bufferSize = MemoryConstants::pageSize; + char bufferAMemory[bufferSize]; + char bufferBMemory[bufferSize]; + for (uint32_t i = 0; i < bufferSize / MemoryConstants::pageSize; ++i) { + memset(bufferAMemory + i * MemoryConstants::pageSize, 1 + i, MemoryConstants::pageSize); + memset(bufferBMemory + i * MemoryConstants::pageSize, 129 + i, MemoryConstants::pageSize); + } + + auto retVal = 
CL_INVALID_VALUE; + auto srcBuffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferAMemory, retVal)); + + ASSERT_NE(nullptr, srcBuffer); + auto dstBuffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferBMemory, retVal)); + ASSERT_NE(nullptr, dstBuffer); + + auto postSyncBuffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(uint64_t), bufferAMemory, retVal)); + ASSERT_NE(nullptr, postSyncBuffer); /* post-sync target must exist: its GPU address is handed to flushGpuCache and it is made resident below */ + + uint64_t expectedPostSyncData = 0; + + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; + + retVal = pCmdQ->enqueueCopyBuffer(srcBuffer.get(), dstBuffer.get(), + 0, 0, + bufferSize, numEventsInWaitList, + eventWaitList, event); + + EXPECT_EQ(CL_SUCCESS, retVal); + + L3RangesVec ranges; + ranges.push_back(L3Range::fromAddressSizeWithPolicy(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(), + MemoryConstants::pageSize, L3_FLUSH_ADDRESS_RANGE::L3_FLUSH_EVICTION_POLICY_FLUSH_L3_WITH_EVICTION)); + size_t requiredSize = getSizeNeededToFlushGpuCache(ranges, true) + 2 * sizeof(PIPE_CONTROL); + LinearStream &l3FlushCmdStream = pCmdQ->getCS(requiredSize); + auto offset = l3FlushCmdStream.getUsed(); + auto pcBeforeFlush = l3FlushCmdStream.getSpaceForCmd(); + *pcBeforeFlush = FamilyType::cmdInitPipeControl; + + flushGpuCache(&l3FlushCmdStream, ranges, postSyncBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(), device->getHardwareInfo()); + + auto &csr = pCmdQ->getGpgpuCommandStreamReceiver(); + auto flags = DispatchFlagsHelper::createDefaultDispatchFlags(); + flags.blocking = true; + + DebugManager.flags.DisableDcFlushInEpilogue.set(true); + csr.makeResident(*postSyncBuffer->getGraphicsAllocation(rootDeviceIndex)); + csr.flushTask(l3FlushCmdStream, offset, + pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::DYNAMIC_STATE, 0), +
pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::INDIRECT_OBJECT, 0), + pCmdQ->getIndirectHeap(NEO::IndirectHeap::Type::SURFACE_STATE, 0), + pCmdQ->taskLevel, + flags, + pCmdQ->getDevice()); + + std::string err; + auto cmdBuffOk = expectCmdBuff(l3FlushCmdStream, 0, + std::vector{ + new MatchAnyCmd(AnyNumber), + new MatchHwCmd(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getCommandStreamerStallEnable, true), EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)}), + new MatchHwCmd(1, Expects{EXPECT_MEMBER(L3_CONTROL, getPostSyncOperation, L3_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA)}), + new MatchHwCmd(1, Expects{EXPECT_MEMBER(PIPE_CONTROL, getDcFlushEnable, false)}), // epilogue + new MatchAnyCmd(AnyNumber), + new MatchHwCmd(0), + }, + &err); + EXPECT_TRUE(cmdBuffOk) << err; + + expectMemory(reinterpret_cast(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()), + bufferAMemory, bufferSize); + + expectMemory(reinterpret_cast(postSyncBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()), + &expectedPostSyncData, sizeof(expectedPostSyncData)); +} diff --git a/opencl/test/unit_test/aub_tests/command_stream/aub_walker_partition_tests_xehp_and_later.cpp b/opencl/test/unit_test/aub_tests/command_stream/aub_walker_partition_tests_xehp_and_later.cpp new file mode 100644 index 0000000000..3b85ada775 --- /dev/null +++ b/opencl/test/unit_test/aub_tests/command_stream/aub_walker_partition_tests_xehp_and_later.cpp @@ -0,0 +1,1198 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/command_container/walker_partition_xehp_and_later.h" +#include "shared/source/helpers/array_count.h" +#include "shared/source/helpers/basic_math.h" +#include "shared/source/helpers/timestamp_packet.h" +#include "shared/source/utilities/tag_allocator.h" +#include "shared/test/common/cmd_parse/hw_parse.h" +#include "shared/test/common/helpers/debug_manager_state_restore.h" +#include 
"shared/test/common/helpers/dispatch_flags_helper.h" +#include "shared/test/common/test_macros/test.h" + +#include "opencl/source/event/event.h" +#include "opencl/source/mem_obj/buffer.h" +#include "opencl/test/unit_test/aub_tests/command_stream/aub_command_stream_fixture.h" +#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h" +#include "opencl/test/unit_test/command_queue/command_queue_fixture.h" +#include "opencl/test/unit_test/fixtures/cl_device_fixture.h" +#include "opencl/test/unit_test/fixtures/simple_arg_kernel_fixture.h" +#include "opencl/test/unit_test/indirect_heap/indirect_heap_fixture.h" + +using namespace NEO; +using namespace WalkerPartition; + +static int32_t testPartitionCount[] = {1, 2, 4, 8, 16}; +static int32_t testPartitionType[] = {1, 2, 3}; +static uint32_t testWorkingDimensions[] = {3}; + +extern bool generateRandomInput; + +struct DispatchParamters { + size_t globalWorkSize[3]; + size_t localWorkSize[3]; +} DispatchParamtersForTests[] = { + {{12, 25, 21}, {3, 5, 7}}, + {{8, 16, 20}, {8, 4, 2}}, + {{7, 13, 17}, {1, 1, 1}}, +}; + +struct AubWalkerPartitionFixture : public KernelAUBFixture { + void SetUp() override { + debugRestorer = std::make_unique(); + DebugManager.flags.EnableTimestampPacket.set(1); + kernelIds |= (1 << 5); + KernelAUBFixture::SetUp(); + + size_t userMemorySize = 16 * MemoryConstants::kiloByte; + if (generateRandomInput) { + userMemorySize = 16000 * MemoryConstants::kiloByte; + } + + sizeUserMemory = userMemorySize; + auto destMemory = alignedMalloc(sizeUserMemory, 4096); + ASSERT_NE(nullptr, destMemory); + memset(destMemory, 0x0, sizeUserMemory); + + dstBuffer.reset(Buffer::create(context, CL_MEM_COPY_HOST_PTR, sizeUserMemory, destMemory, retVal)); + ASSERT_NE(nullptr, dstBuffer); + alignedFree(destMemory); + + kernels[5]->setArg(0, dstBuffer.get()); + } + + void TearDown() override { + pCmdQ->flush(); + + KernelAUBFixture::TearDown(); + } + template + void validatePartitionProgramming(uint64_t 
postSyncAddress, int32_t partitionCount) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + uint32_t totalWorkgroupCount = 1u; + uint32_t totalWorkItemsInWorkgroup = 1u; + uint32_t totalWorkItemsCount = 1; + + for (auto dimension = 0u; dimension < workingDimensions; dimension++) { + totalWorkgroupCount *= static_cast(dispatchParamters.globalWorkSize[dimension] / dispatchParamters.localWorkSize[dimension]); + totalWorkItemsInWorkgroup *= static_cast(dispatchParamters.localWorkSize[dimension]); + totalWorkItemsCount *= static_cast(dispatchParamters.globalWorkSize[dimension]); + } + + const uint32_t workgroupCount = static_cast(dispatchParamters.globalWorkSize[partitionType - 1] / dispatchParamters.localWorkSize[partitionType - 1]); + auto partitionSize = Math::divideAndRoundUp(workgroupCount, partitionCount); + + if (static_cast(partitionType) > workingDimensions) { + partitionSize = 1; + } + + hwParser.parseCommands(pCmdQ->getCS(0), 0); + + uint32_t walkersCount = hwParser.getCommandCount(); + EXPECT_EQ(walkersCount, 1u); + GenCmdList walkerList = hwParser.getCommandsList(); + WALKER_TYPE *walkerCmd = static_cast(*walkerList.begin()); + EXPECT_EQ(0u, walkerCmd->getPartitionId()); + if (partitionCount > 1) { + EXPECT_TRUE(walkerCmd->getWorkloadPartitionEnable()); + EXPECT_EQ(partitionSize, walkerCmd->getPartitionSize()); + EXPECT_EQ(partitionType, walkerCmd->getPartitionType()); + } else { + EXPECT_FALSE(walkerCmd->getWorkloadPartitionEnable()); + EXPECT_EQ(0u, walkerCmd->getPartitionSize()); + EXPECT_EQ(0u, walkerCmd->getPartitionType()); + } + + EXPECT_EQ(FamilyType::POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP, walkerCmd->getPostSync().getOperation()); + EXPECT_EQ(postSyncAddress, walkerCmd->getPostSync().getDestinationAddress()); + + int notExpectedValue[] = {1, 1, 1, 1}; + + for (auto partitionId = 0; partitionId < 
DebugManager.flags.ExperimentalSetWalkerPartitionCount.get(); partitionId++) { + expectNotEqualMemory(reinterpret_cast(postSyncAddress), &notExpectedValue, sizeof(notExpectedValue)); /* this partition's post-sync slot must differ from the notExpectedValue pattern */ + postSyncAddress += 16; //next post sync needs to be right after the previous one + } + + auto dstGpuAddress = reinterpret_cast(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()); + expectMemory(dstGpuAddress, &totalWorkItemsCount, sizeof(uint32_t)); + auto groupSpecificWorkCounts = ptrOffset(dstGpuAddress, 4); + StackVec workgroupCounts; + workgroupCounts.resize(totalWorkgroupCount); + + for (uint32_t workgroupId = 0u; workgroupId < totalWorkgroupCount; workgroupId++) { + workgroupCounts[workgroupId] = totalWorkItemsInWorkgroup; + } + + expectMemory(groupSpecificWorkCounts, workgroupCounts.begin(), workgroupCounts.size() * sizeof(uint32_t)); + } + + template + typename FamilyType::PIPE_CONTROL *retrieveSyncPipeControl(void *startAddress, + const HardwareInfo &hwInfo) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + uint8_t buffer[256]; + LinearStream stream(buffer, 256); + MemorySynchronizationCommands::addPipeControlWA(stream, 0ull, hwInfo); + void *syncPipeControlAddress = reinterpret_cast(reinterpret_cast(startAddress) + stream.getUsed()); + PIPE_CONTROL *pipeControl = genCmdCast(syncPipeControlAddress); + return pipeControl; + } + + std::unique_ptr debugRestorer; + std::unique_ptr dstBuffer; + size_t sizeUserMemory = 0; + + cl_uint workingDimensions = 1; + int32_t partitionCount; + int32_t partitionType; + + HardwareParse hwParser; + DispatchParamters dispatchParamters; +}; + +struct AubWalkerPartitionTest : public AubWalkerPartitionFixture, + public ::testing::TestWithParam> { + void SetUp() override { + AubWalkerPartitionFixture::SetUp(); + std::tie(partitionCount, partitionType, dispatchParamters, workingDimensions) = GetParam(); + + if (generateRandomInput) { + workingDimensions = (rand() % 3 + 1); + partitionType = (rand() % 3 + 1); +
partitionCount = rand() % 16 + 1; + + //now generate dimensions that makes sense + auto goodWorkingSizeGenerated = false; + while (!goodWorkingSizeGenerated) { + dispatchParamters.localWorkSize[0] = rand() % 128 + 1; + dispatchParamters.localWorkSize[1] = rand() % 128 + 1; + dispatchParamters.localWorkSize[2] = rand() % 128 + 1; + auto totalWorkItemsInWorkgroup = 1; + for (auto dimension = 0u; dimension < workingDimensions; dimension++) { + totalWorkItemsInWorkgroup *= static_cast(dispatchParamters.localWorkSize[dimension]); + } + if (totalWorkItemsInWorkgroup <= 1024) { + dispatchParamters.globalWorkSize[0] = dispatchParamters.localWorkSize[0] * (rand() % 32 + 1); + dispatchParamters.globalWorkSize[1] = dispatchParamters.localWorkSize[1] * (rand() % 32 + 1); + dispatchParamters.globalWorkSize[2] = dispatchParamters.localWorkSize[2] * (rand() % 32 + 1); + + printf("\n generated following dispatch paramters work dim %u gws %zu %zu %zu lws %zu %zu %zu, partition type %d partitionCount %d", + workingDimensions, + dispatchParamters.globalWorkSize[0], + dispatchParamters.globalWorkSize[1], + dispatchParamters.globalWorkSize[2], + dispatchParamters.localWorkSize[0], + dispatchParamters.localWorkSize[1], + dispatchParamters.localWorkSize[2], + partitionType, + partitionCount); + fflush(stdout); + goodWorkingSizeGenerated = true; + } + }; + } + + DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(partitionCount); + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(partitionType); + DebugManager.flags.EnableWalkerPartition.set(1u); + } + void TearDown() override { + AubWalkerPartitionFixture::TearDown(); + } +}; + +struct AubWalkerPartitionZeroFixture : public AubWalkerPartitionFixture { + void SetUp() override { + AubWalkerPartitionFixture::SetUp(); + + partitionCount = 0; + partitionType = 0; + + workingDimensions = 1; + + DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(0); + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(0); + + 
commandBufferProperties = std::make_unique(device->getRootDeviceIndex(), true, MemoryConstants::pageSize, GraphicsAllocation::AllocationType::COMMAND_BUFFER, false, device->getDeviceBitfield()); + auto memoryManager = this->device->getMemoryManager(); + streamAllocation = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties); + helperSurface = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties); + memset(helperSurface->getUnderlyingBuffer(), 0, MemoryConstants::pageSize); + taskStream = std::make_unique(streamAllocation); + } + void TearDown() override { + auto memoryManager = this->device->getMemoryManager(); + memoryManager->freeGraphicsMemory(streamAllocation); + memoryManager->freeGraphicsMemory(helperSurface); + AubWalkerPartitionFixture::TearDown(); + } + + void flushStream() { + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.guardCommandBufferWithPipeControl = true; + + csr->makeResident(*helperSurface); + csr->flushTask(*taskStream, 0, + csr->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u), + csr->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u), + csr->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u), + 0u, dispatchFlags, device->getDevice()); + + csr->flushBatchedSubmissions(); + } + std::unique_ptr taskStream; + GraphicsAllocation *streamAllocation = nullptr; + GraphicsAllocation *helperSurface = nullptr; + std::unique_ptr commandBufferProperties; +}; + +using AubWalkerPartitionZeroTest = Test; + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, whenPartitionCountSetToZeroThenProvideEqualSingleWalker) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using PARTITION_TYPE = typename FamilyType::WALKER_TYPE::PARTITION_TYPE; + + size_t globalWorkOffset[3] = {0, 0, 0}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; + size_t gwsSize[] = {128, 1, 1}; + size_t lwsSize[] = {32, 1, 1}; + + 
auto retVal = pCmdQ->enqueueKernel( + kernels[5].get(), + workingDimensions, + globalWorkOffset, + gwsSize, + lwsSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); + + pCmdQ->flush(); + + auto cmdPartitionType = static_cast(partitionType); + uint32_t cmdPartitionCount = static_cast(partitionCount); + + hwParser.parseCommands(pCmdQ->getCS(0), 0); + uint32_t walkersCount = hwParser.getCommandCount(); + EXPECT_EQ(cmdPartitionCount + 1, walkersCount); + + GenCmdList walkerList = hwParser.getCommandsList(); + EXPECT_EQ(walkersCount, static_cast(walkerList.size())); + + uint32_t i = 0; + for (GenCmdList::iterator walker = walkerList.begin(); walker != walkerList.end(); ++walker, ++i) { + WALKER_TYPE *walkerCmd = static_cast(*walker); + EXPECT_EQ(cmdPartitionCount, walkerCmd->getPartitionId()); + EXPECT_EQ(cmdPartitionType, walkerCmd->getPartitionType()); + EXPECT_EQ(cmdPartitionCount, walkerCmd->getPartitionSize()); + } + + auto dstGpuAddress = reinterpret_cast(dstBuffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()); + expectMemory(dstGpuAddress, &gwsSize[workingDimensions - 1], sizeof(uint32_t)); + + const uint32_t workgroupCount = static_cast(gwsSize[workingDimensions - 1] / lwsSize[workingDimensions - 1]); + auto groupSpecificWorkCounts = ptrOffset(dstGpuAddress, 4); + StackVec workgroupCounts; + workgroupCounts.resize(workgroupCount); + + for (uint32_t workgroupId = 0u; workgroupId < workgroupCount; workgroupId++) { + workgroupCounts[workgroupId] = static_cast(lwsSize[workingDimensions - 1]); + } + + expectMemory(groupSpecificWorkCounts, workgroupCounts.begin(), workgroupCounts.size() * sizeof(uint32_t)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, whenPipeControlIsBeingEmittedWithPartitionBitSetThenMultipleFieldsAreBeingUpdatedWithValue) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto writeAddress = helperSurface->getGpuAddress(); + auto partitionId = 1u; + auto writeSize = 8u; 
+ auto miAddressOffset = WalkerPartition::addressOffsetCCSOffset; + auto wparidOffset = WalkerPartition::wparidCCSOffset; + uint64_t writeValue = 7llu; + + uint32_t totalBytesProgrammed = 0u; + auto streamCpuPointer = taskStream->getSpace(0); + + WalkerPartition::programRegisterWithValue(streamCpuPointer, wparidOffset, totalBytesProgrammed, partitionId); + WalkerPartition::programRegisterWithValue(streamCpuPointer, miAddressOffset, totalBytesProgrammed, writeSize); + taskStream->getSpace(totalBytesProgrammed); + + void *pipeControlAddress = taskStream->getSpace(0); + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + auto pipeControl = retrieveSyncPipeControl(pipeControlAddress, device->getHardwareInfo()); + ASSERT_NE(nullptr, pipeControl); + pipeControl->setWorkloadPartitionIdOffsetEnable(true); + + flushStream(); + + expectNotEqualMemory(reinterpret_cast(writeAddress), &writeValue, 4u); + //write needs to happen after 8 bytes + expectMemory(reinterpret_cast(writeAddress + 8), &writeValue, 4u); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenAtomicOperationDecOnLocalMemoryWhenItIsExecuteThenOperationUpdatesMemory) { + auto writeAddress = helperSurface->getGpuAddress(); + auto cpuAddress = reinterpret_cast(helperSurface->getUnderlyingBuffer()); + *cpuAddress = 10; + + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + uint32_t expectedValue = 9u; + WalkerPartition::programMiAtomic(streamCpuPointer, totalBytesProgrammed, writeAddress, false, WalkerPartition::MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_DECREMENT); + taskStream->getSpace(totalBytesProgrammed); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &expectedValue, sizeof(expectedValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, 
AubWalkerPartitionZeroTest, givenAtomicOperationIncOnLocalMemoryWhenItIsExecuteThenOperationUpdatesMemory) { + auto writeAddress = helperSurface->getGpuAddress(); + auto cpuAddress = reinterpret_cast(helperSurface->getUnderlyingBuffer()); + *cpuAddress = 10; + + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + uint32_t expectedValue = 11u; + WalkerPartition::programMiAtomic(streamCpuPointer, totalBytesProgrammed, writeAddress, false, WalkerPartition::MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT); + taskStream->getSpace(totalBytesProgrammed); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &expectedValue, sizeof(expectedValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenVariousCompareModesWhenConditionalBatchBufferEndIsEmittedItThenHandlesCompareCorrectly) { + using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + auto writeAddress = helperSurface->getGpuAddress(); + auto compareAddress = reinterpret_cast(helperSurface->getUnderlyingBuffer()); + + auto conditionalBatchBufferEnd = reinterpret_cast(taskStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setCompareAddress(writeAddress); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + uint32_t writeValue = 7u; + uint32_t pipeControlNotExecutedValue = 0u; + + //this pipe control should be executed + void *pipeControlAddress = taskStream->getSpace(0); + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + auto pipeControl = retrieveSyncPipeControl(pipeControlAddress, device->getHardwareInfo()); + ASSERT_NE(nullptr, 
pipeControl); + auto programPipeControl = [&]() { + pipeControl->setImmediateData(writeValue); + pipeControl->setAddress(static_cast(writeAddress & 0x0000FFFFFFFFULL)); + pipeControl->setAddressHigh(static_cast(writeAddress >> 32)); + }; + + //we have now command buffer that has conditional batch buffer end and pipe control that tests whether batch buffer end acted correctly + + //MAD_GREATER_THAN_IDD If Indirect fetched data is greater than inline data then continue. + //continue test + conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_GREATER_THAN_IDD); + *compareAddress = 11; + auto inlineData = 10u; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + //terminate test + *compareAddress = 10; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue)); + + //MAD_GREATER_THAN_OR_EQUAL_IDD If Indirect fetched data is greater than or equal to inline data then continue. 
+ + //continue test - greater + conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_GREATER_THAN_OR_EQUAL_IDD); + *compareAddress = 11; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + + //continue test - equal + *compareAddress = 10; + inlineData = 10u; + + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + + //terminate test + *compareAddress = 9; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue)); + + //MAD_LESS_THAN_IDD If Indirect fetched data is less than inline data then continue. 
+ + //continue test + conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_LESS_THAN_IDD); + *compareAddress = 9; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + + //terminate test + *compareAddress = 10; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue)); + + //MAD_LESS_THAN_OR_EQUAL_IDD If Indirect fetched data is less than or equal to inline data then continue. + + //continue test - less + conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_LESS_THAN_OR_EQUAL_IDD); + *compareAddress = 9; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + + //continue test - equal + *compareAddress = 10; + inlineData = 10u; + + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + + //terminate test + *compareAddress = 11; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue)); + + 
//MAD_EQUAL_IDD If Indirect fetched data is equal to inline data then continue. + + //continue test equal + conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_EQUAL_IDD); + *compareAddress = 10; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + + //terminate test + *compareAddress = 0; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue)); + + //MAD_NOT_EQUAL_IDD If Indirect fetched data is not equal to inline data then continue. + + //continue test not equal + conditionalBatchBufferEnd->setCompareOperation(CONDITIONAL_BATCH_BUFFER_END::COMPARE_OPERATION::COMPARE_OPERATION_MAD_NOT_EQUAL_IDD); + *compareAddress = 11; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + + //terminate test + *compareAddress = 10; + inlineData = 10u; + writeAddress += sizeof(uint64_t); + writeValue++; + + conditionalBatchBufferEnd->setCompareDataDword(inlineData); + programPipeControl(); + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &pipeControlNotExecutedValue, sizeof(pipeControlNotExecutedValue)); +} +template +struct MultiLevelBatchAubFixture : public AubWalkerPartitionZeroFixture { + void SetUp() override { + if (enableNesting) { + //turn on Batch Buffer nesting + DebugManager.flags.AubDumpAddMmioRegistersList.set( + "0x1A09C;0x10001000"); + } else { + 
//turn off Batch Buffer nesting + DebugManager.flags.AubDumpAddMmioRegistersList.set( + "0x1A09C;0x10000000"); + } + AubWalkerPartitionZeroFixture::SetUp(); + auto memoryManager = this->device->getMemoryManager(); + secondLevelBatch = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties); + thirdLevelBatch = memoryManager->allocateGraphicsMemoryWithProperties(*commandBufferProperties); + secondLevelBatchStream = std::make_unique(secondLevelBatch); + thirdLevelBatchStream = std::make_unique(thirdLevelBatch); + }; + void TearDown() override { + debugRestorer.reset(nullptr); + DebugManager.flags.AubDumpAddMmioRegistersList.getRef() = "unk"; + DebugManager.flags.AubDumpAddMmioRegistersList.getRef().shrink_to_fit(); + + auto memoryManager = this->device->getMemoryManager(); + memoryManager->freeGraphicsMemory(thirdLevelBatch); + memoryManager->freeGraphicsMemory(secondLevelBatch); + + AubWalkerPartitionZeroFixture::TearDown(); + }; + + std::unique_ptr secondLevelBatchStream; + std::unique_ptr thirdLevelBatchStream; + + GraphicsAllocation *secondLevelBatch = nullptr; + GraphicsAllocation *thirdLevelBatch = nullptr; +}; + +using MultiLevelBatchTestsWithNesting = Test>; + +HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithNesting, givenConditionalBatchBufferEndWhenItExitsThirdLevelCommandBufferThenSecondLevelBatchIsResumed) { + auto writeAddress = helperSurface->getGpuAddress(); + auto compareAddress = writeAddress; + + using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END; + using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + //nest to second level + auto batchBufferStart = reinterpret_cast(taskStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress()); + 
batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED); + + //nest to third level + batchBufferStart = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress()); + batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED); + + auto conditionalBatchBufferEnd = reinterpret_cast(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + auto writeValue = 7u; + + //this pipe control should be executed + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + conditionalBatchBufferEnd = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + writeValue++; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + csr->makeResident(*secondLevelBatch); + csr->makeResident(*thirdLevelBatch); + flushStream(); + + writeAddress = helperSurface->getGpuAddress() 
+ sizeof(uint64_t); + writeValue = 7u; + + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + writeAddress += sizeof(uint64_t); + writeValue++; + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithNesting, givenConditionalBatchBufferEndWhenItExitsToTheRingThenAllCommandBufferLevelsAreSkipped) { + auto writeAddress = helperSurface->getGpuAddress(); + auto compareAddress = writeAddress; + + using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END; + using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + //nest to second level + auto batchBufferStart = reinterpret_cast(taskStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress()); + batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED); + + //nest to third level + batchBufferStart = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress()); + batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED); + + auto conditionalBatchBufferEnd = reinterpret_cast(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(0); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + auto writeValue = 7u; + + //this pipe control should NOT be executed + PipeControlArgs args; + 
MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + conditionalBatchBufferEnd = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + writeValue++; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + csr->makeResident(*secondLevelBatch); + csr->makeResident(*thirdLevelBatch); + flushStream(); + + writeAddress = helperSurface->getGpuAddress() + sizeof(uint64_t); + writeValue = 0u; + + //pipe controls are not emitted + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); + writeAddress += sizeof(uint64_t); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithNesting, givenCommandBufferCacheOnWhenBatchBufferIsExecutedThenItWorksCorrectly) { + auto writeAddress = helperSurface->getGpuAddress(); + auto writeValue = 7u; + + using BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END; + using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + //nest to second level + auto batchBufferStart = reinterpret_cast(taskStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress()); + batchBufferStart->setEnableCommandCache(1u); + 
batchBufferStart->setNestedLevelBatchBuffer(BATCH_BUFFER_START::NESTED_LEVEL_BATCH_BUFFER::NESTED_LEVEL_BATCH_BUFFER_NESTED); + + //this pipe control should be executed + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + auto batchBufferEnd = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_END))); + batchBufferEnd->init(); + + csr->makeResident(*secondLevelBatch); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); +} +using MultiLevelBatchTestsWithoutNesting = Test>; + +HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithoutNesting, givenConditionalBBEndWhenItExitsFromSecondLevelThenUpperLevelIsResumed) { + auto writeAddress = helperSurface->getGpuAddress(); + auto compareAddress = writeAddress; + + using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END; + using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + //nest to second level + auto batchBufferStart = reinterpret_cast(taskStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress()); + batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); + + //nest to third level + batchBufferStart = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress()); + batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); + + auto conditionalBatchBufferEnd = 
reinterpret_cast(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(0); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + auto writeValue = 7u; + + //this pipe control should't be executed + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + conditionalBatchBufferEnd = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + writeValue++; + //and this shouldn't as well, we returned to ring + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + csr->makeResident(*secondLevelBatch); + csr->makeResident(*thirdLevelBatch); + flushStream(); + + writeAddress = helperSurface->getGpuAddress() + sizeof(uint64_t); + auto zeroValue = 0llu; + + expectMemory(reinterpret_cast(writeAddress), &zeroValue, sizeof(zeroValue)); + writeAddress += sizeof(uint64_t); + writeValue++; + expectMemory(reinterpret_cast(writeAddress), &zeroValue, sizeof(zeroValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, MultiLevelBatchTestsWithoutNesting, givenConditionalBBEndWhenExitsFromSecondLevelToRingThenFirstLevelIsNotExecuted) { + auto writeAddress = helperSurface->getGpuAddress(); + auto 
compareAddress = writeAddress; + + using CONDITIONAL_BATCH_BUFFER_END = typename FamilyType::MI_CONDITIONAL_BATCH_BUFFER_END; + using BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; + + //nest to second level + auto batchBufferStart = reinterpret_cast(taskStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(secondLevelBatch->getGpuAddress()); + batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); + + //nest to third level + batchBufferStart = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(BATCH_BUFFER_START))); + batchBufferStart->init(); + batchBufferStart->setBatchBufferStartAddress(thirdLevelBatch->getGpuAddress()); + batchBufferStart->setSecondLevelBatchBuffer(BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH); + + auto conditionalBatchBufferEnd = reinterpret_cast(thirdLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + auto writeValue = 7u; + + //this pipe control should't be executed + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *secondLevelBatchStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + conditionalBatchBufferEnd = reinterpret_cast(secondLevelBatchStream->getSpace(sizeof(CONDITIONAL_BATCH_BUFFER_END))); + conditionalBatchBufferEnd->init(); + conditionalBatchBufferEnd->setCompareAddress(compareAddress); + conditionalBatchBufferEnd->setEndCurrentBatchBufferLevel(1); + 
conditionalBatchBufferEnd->setCompareSemaphore(1); + + writeAddress += sizeof(uint64_t); + writeValue++; + //and this should , we returned to primary batch + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + csr->makeResident(*secondLevelBatch); + csr->makeResident(*thirdLevelBatch); + flushStream(); + + writeAddress = helperSurface->getGpuAddress() + sizeof(uint64_t); + writeValue = 7u; + auto zeroValue = 0llu; + + expectMemory(reinterpret_cast(writeAddress), &zeroValue, sizeof(zeroValue)); + writeAddress += sizeof(uint64_t); + writeValue++; + expectMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenBlockingAtomicOperationIncOnLocalMemoryWhenItIsExecutedThenOperationUpdatesMemory) { + auto writeAddress = helperSurface->getGpuAddress(); + auto cpuAddress = reinterpret_cast(helperSurface->getUnderlyingBuffer()); + *cpuAddress = 10; + + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + uint32_t expectedValue = 11u; + WalkerPartition::programMiAtomic(streamCpuPointer, totalBytesProgrammed, writeAddress, true, WalkerPartition::MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT); + taskStream->getSpace(totalBytesProgrammed); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &expectedValue, sizeof(expectedValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenBlockingAtomicOperationIncOnSystemMemoryWhenItIsExecutedThenOperationUpdatesMemory) { + auto writeAddress = helperSurface->getGpuAddress(); + auto cpuAddress = reinterpret_cast(helperSurface->getUnderlyingBuffer()); + *cpuAddress = 10; + + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + uint32_t expectedValue = 11u; + 
WalkerPartition::programMiAtomic(streamCpuPointer, totalBytesProgrammed, writeAddress, true, WalkerPartition::MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT); + taskStream->getSpace(totalBytesProgrammed); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &expectedValue, sizeof(expectedValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenNonBlockingAtomicOperationIncOnSystemMemoryWhenItIsExecutedThenOperationUpdatesMemory) { + auto writeAddress = helperSurface->getGpuAddress(); + auto cpuAddress = reinterpret_cast(helperSurface->getUnderlyingBuffer()); + *cpuAddress = 10; + + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + uint32_t expectedValue = 11u; + WalkerPartition::programMiAtomic(streamCpuPointer, totalBytesProgrammed, writeAddress, false, WalkerPartition::MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_INCREMENT); + taskStream->getSpace(totalBytesProgrammed); + + flushStream(); + expectMemory(reinterpret_cast(writeAddress), &expectedValue, sizeof(expectedValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenPredicatedCommandBufferWhenItIsExecutedThenAtomicIsIncrementedEquallyToPartitionCountPlusOne) { + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + + auto streamCpuPointer = taskStream->getSpace(0); + auto postSyncAddress = helperSurface->getGpuAddress(); + + uint32_t totalBytesProgrammed = 0u; + WALKER_TYPE walkerCmd = FamilyType::cmdInitGpgpuWalker; + walkerCmd.setPartitionType(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_X); + walkerCmd.getInterfaceDescriptor().setNumberOfThreadsInGpgpuThreadGroup(1u); + walkerCmd.getPostSync().setDestinationAddress(postSyncAddress); + walkerCmd.getPostSync().setOperation(POSTSYNC_DATA::OPERATION::OPERATION_WRITE_TIMESTAMP); + + WalkerPartition::WalkerPartitionArgs testArgs = {}; + testArgs.initializeWparidRegister = true; + testArgs.crossTileAtomicSynchronization = true; + testArgs.emitPipeControlStall = true; + 
testArgs.tileCount = 1; + testArgs.partitionCount = 16u; + testArgs.synchronizeBeforeExecution = false; + testArgs.secondaryBatchBuffer = false; + testArgs.emitSelfCleanup = false; + + WalkerPartition::constructDynamicallyPartitionedCommandBuffer( + streamCpuPointer, + taskStream->getGraphicsAllocation()->getGpuAddress(), + &walkerCmd, + totalBytesProgrammed, + testArgs, + *defaultHwInfo); + taskStream->getSpace(totalBytesProgrammed); + flushStream(); + auto expectedGpuAddress = taskStream->getGraphicsAllocation()->getGpuAddress() + + WalkerPartition::computeControlSectionOffset(testArgs); + + //16 partitions updated atomic to value 16 + //17th partition updated it to 17 and was predicated out of the batch buffer + uint32_t expectedValue = 17u; + expectMemory(reinterpret_cast(expectedGpuAddress), &expectedValue, sizeof(expectedValue)); + //this is 1 tile scenario + uint32_t expectedTileValue = 1u; + expectMemory(reinterpret_cast(expectedGpuAddress + 4llu), &expectedTileValue, sizeof(expectedTileValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenGeneralPurposeRegisterWhenItIsLoadedAndFetchedThenItIsNotPrivileged) { + auto writeAddress = helperSurface->getGpuAddress(); + uint32_t writeValue = 7u; + + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + uint32_t wparidValue = 5u; + WalkerPartition::programRegisterWithValue(streamCpuPointer, generalPurposeRegister0, totalBytesProgrammed, wparidValue); + WalkerPartition::programMiLoadRegisterReg(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister0, generalPurposeRegister1); + WalkerPartition::programMiLoadRegisterReg(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister1, generalPurposeRegister2); + WalkerPartition::programMiLoadRegisterReg(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister2, generalPurposeRegister3); + WalkerPartition::programMiLoadRegisterReg(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister3, 
generalPurposeRegister4); + WalkerPartition::programMiLoadRegisterReg(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister4, generalPurposeRegister5); + WalkerPartition::programMiLoadRegisterReg(streamCpuPointer, totalBytesProgrammed, generalPurposeRegister5, wparidCCSOffset); + WalkerPartition::programWparidMask(streamCpuPointer, totalBytesProgrammed, 4u); + WalkerPartition::programWparidPredication(streamCpuPointer, totalBytesProgrammed, true); + //this command must not execute + taskStream->getSpace(totalBytesProgrammed); + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + streamCpuPointer = taskStream->getSpace(0); + totalBytesProgrammed = 0u; + WalkerPartition::programWparidPredication(streamCpuPointer, totalBytesProgrammed, false); + taskStream->getSpace(totalBytesProgrammed); + flushStream(); + expectNotEqualMemory(reinterpret_cast(writeAddress), &writeValue, sizeof(writeValue)); +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenPredicationWhenItIsOnThenCommandMustNotBeExecuted) { + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + auto writeValue = 1u; + auto zeroValue = 0u; + auto addressShift = 8u; + auto writeAddress = helperSurface->getGpuAddress(); + + //program WPARID mask to 16 partitions + WalkerPartition::programWparidMask(streamCpuPointer, totalBytesProgrammed, 16u); + streamCpuPointer = taskStream->getSpace(totalBytesProgrammed); + //program WPARID to value within 0-19 + for (uint32_t wparid = 0u; wparid < 20; wparid++) { + totalBytesProgrammed = 0; + streamCpuPointer = taskStream->getSpace(0); + WalkerPartition::programRegisterWithValue(streamCpuPointer, WalkerPartition::wparidCCSOffset, totalBytesProgrammed, wparid); + 
WalkerPartition::programWparidPredication(streamCpuPointer, totalBytesProgrammed, true); + taskStream->getSpace(totalBytesProgrammed); + //emit pipe control + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + //turn off predication + streamCpuPointer = taskStream->getSpace(0); + totalBytesProgrammed = 0; + WalkerPartition::programWparidPredication(streamCpuPointer, totalBytesProgrammed, false); + taskStream->getSpace(totalBytesProgrammed); + + writeAddress += addressShift; + writeValue++; + } + + flushStream(); + writeAddress = helperSurface->getGpuAddress(); + writeValue = 1u; + for (uint32_t wparid = 0u; wparid < 20; wparid++) { + if (wparid < 16) { + expectMemory(reinterpret_cast(writeAddress), &writeValue, 4u); + } else { + expectMemory(reinterpret_cast(writeAddress), &zeroValue, 4u); + } + writeAddress += addressShift; + writeValue++; + } +} + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWalkerPartitionZeroTest, givenPredicationWhenItIsOnThenPipeControlInWparidIsNotExecuted) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto streamCpuPointer = taskStream->getSpace(0); + uint32_t totalBytesProgrammed = 0u; + auto writeValue = 1u; + auto zeroValue = 0u; + auto addressShift = 32u; + auto writeAddress = helperSurface->getGpuAddress(); + + WalkerPartition::programRegisterWithValue(streamCpuPointer, WalkerPartition::addressOffsetCCSOffset, totalBytesProgrammed, addressShift); + //program WPARID mask to 8 partitions + WalkerPartition::programWparidMask(streamCpuPointer, totalBytesProgrammed, 8u); + streamCpuPointer = taskStream->getSpace(totalBytesProgrammed); + //program WPARID to value within 0-13 + for (uint32_t wparid = 0u; wparid < 13; wparid++) { + totalBytesProgrammed = 0; + streamCpuPointer = taskStream->getSpace(0); + 
WalkerPartition::programRegisterWithValue(streamCpuPointer, WalkerPartition::wparidCCSOffset, totalBytesProgrammed, wparid); + WalkerPartition::programWparidPredication(streamCpuPointer, totalBytesProgrammed, true); + taskStream->getSpace(totalBytesProgrammed); + + //emit pipe control + void *pipeControlAddress = taskStream->getSpace(0); + PipeControlArgs args; + MemorySynchronizationCommands::addPipeControlAndProgramPostSyncOperation( + *taskStream, FamilyType::PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, + writeAddress, writeValue, device->getHardwareInfo(), args); + + auto pipeControl = retrieveSyncPipeControl(pipeControlAddress, device->getHardwareInfo()); + ASSERT_NE(nullptr, pipeControl); + pipeControl->setWorkloadPartitionIdOffsetEnable(true); + //turn off predication + streamCpuPointer = taskStream->getSpace(0); + totalBytesProgrammed = 0; + WalkerPartition::programWparidPredication(streamCpuPointer, totalBytesProgrammed, false); + taskStream->getSpace(totalBytesProgrammed); + + writeValue++; + } + + flushStream(); + writeAddress = helperSurface->getGpuAddress(); + writeValue = 1u; + for (uint32_t wparid = 0u; wparid < 13; wparid++) { + if (wparid < 8) { + expectMemory(reinterpret_cast(writeAddress), &writeValue, 4u); + } else { + expectMemory(reinterpret_cast(writeAddress), &zeroValue, 4u); + } + writeAddress += addressShift; + writeValue++; + } +} + +HWCMDTEST_P(IGFX_XE_HP_CORE, AubWalkerPartitionTest, whenPartitionsAreUsedWithVariousInputsThenHardwareProgrammingIsCorrect) { + size_t globalWorkOffset[3] = {0, 0, 0}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event event; + + auto retVal = pCmdQ->enqueueKernel( + kernels[5].get(), + workingDimensions, + globalWorkOffset, + dispatchParamters.globalWorkSize, + dispatchParamters.localWorkSize, + numEventsInWaitList, + eventWaitList, + &event); + ASSERT_EQ(CL_SUCCESS, retVal); + + pCmdQ->flush(); + + auto neoEvent = castToObject(event); + auto 
container = neoEvent->getTimestampPacketNodes(); + auto postSyncAddress = TimestampPacketHelper::getContextStartGpuAddress(*container->peekNodes()[0]); + validatePartitionProgramming(postSyncAddress, partitionCount); + + clReleaseEvent(event); +} + +INSTANTIATE_TEST_CASE_P( + AUBWPARID, + AubWalkerPartitionTest, + ::testing::Combine( + ::testing::ValuesIn(testPartitionCount), + ::testing::ValuesIn(testPartitionType), + ::testing::ValuesIn(DispatchParamtersForTests), + ::testing::ValuesIn(testWorkingDimensions))); + +using AubWparidTests = Test; + +HWCMDTEST_F(IGFX_XE_HP_CORE, AubWparidTests, whenPartitionCountSetAndPartitionIdSpecifiedViaWPARIDThenProvideEqualNumberWalkers) { + size_t globalWorkOffset[3] = {0, 0, 0}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event event; + workingDimensions = 3; + dispatchParamters.globalWorkSize[0] = 30; + dispatchParamters.globalWorkSize[1] = 39; + dispatchParamters.globalWorkSize[2] = 5; + dispatchParamters.localWorkSize[0] = 10; + dispatchParamters.localWorkSize[1] = 3; + dispatchParamters.localWorkSize[2] = 1; + + partitionType = 3; + + int32_t partitionCount = 4; + + DebugManager.flags.ExperimentalSetWalkerPartitionType.set(partitionType); + DebugManager.flags.ExperimentalSetWalkerPartitionCount.set(partitionCount); + DebugManager.flags.EnableWalkerPartition.set(1u); + + auto retVal = pCmdQ->enqueueKernel( + kernels[5].get(), + workingDimensions, + globalWorkOffset, + dispatchParamters.globalWorkSize, + dispatchParamters.localWorkSize, + numEventsInWaitList, + eventWaitList, + &event); + ASSERT_EQ(CL_SUCCESS, retVal); + + pCmdQ->flush(); + + auto neoEvent = castToObject(event); + auto container = neoEvent->getTimestampPacketNodes(); + auto postSyncAddress = TimestampPacketHelper::getContextStartGpuAddress(*container->peekNodes()[0]); + + validatePartitionProgramming(postSyncAddress, partitionCount); + clReleaseEvent(event); +} diff --git 
a/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp b/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp new file mode 100644 index 0000000000..cf2944c02e --- /dev/null +++ b/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp @@ -0,0 +1,516 @@ +/* + * Copyright (C) 2022 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/helpers/register_offsets.h" +#include "shared/test/common/helpers/dispatch_flags_helper.h" +#include "shared/test/common/test_macros/test.h" + +#include "opencl/source/mem_obj/buffer.h" +#include "opencl/test/unit_test/aub_tests/fixtures/aub_fixture.h" +#include "opencl/test/unit_test/fixtures/cl_device_fixture.h" + +namespace NEO { +enum class NewAluOpcodes : uint32_t { + OPCODE_LOAD = 0x080, + OPCODE_LOAD0 = 0x081, + OPCODE_LOAD1 = 0x481, + OPCODE_LOADIND = 0x082, + OPCODE_STOREIND = 0x181, + OPCODE_SHL = 0x105, + OPCODE_SHR = 0x106, + OPCODE_SAR = 0x107, + OPCODE_FENCE = 0x001 +}; + +struct MiMath : public AUBFixture, public ::testing::Test { + void SetUp() override { + AUBFixture::SetUp(defaultHwInfo.get()); + + streamAllocation = this->device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), MemoryConstants::pageSize, GraphicsAllocation::AllocationType::COMMAND_BUFFER, device->getDeviceBitfield()}); + taskStream = std::make_unique(streamAllocation); + } + void TearDown() override { + this->device->getMemoryManager()->freeGraphicsMemory(streamAllocation); + AUBFixture::TearDown(); + } + + void flushStream() { + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.guardCommandBufferWithPipeControl = true; + + csr->flushTask(*taskStream, 0, + csr->getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 0u), + csr->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u), + csr->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u), + 0u, dispatchFlags, 
device->getDevice()); + + csr->flushBatchedSubmissions(); + } + uint32_t getPartOfGPUAddress(uint64_t address, bool lowPart) { + constexpr uint32_t shift = 32u; + constexpr uint32_t mask = 0xffffffff; + if (lowPart) { + return static_cast(address & mask); + } else { + return static_cast(address >> shift); + } + } + template + void loadValueToRegister(int32_t value, int32_t reg) { + using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; + MI_LOAD_REGISTER_IMM cmd = FamilyType::cmdInitLoadRegisterImm; + cmd.setDataDword(value); + cmd.setRegisterOffset(reg); + cmd.setMmioRemapEnable(1); + auto buffer = taskStream->getSpace(sizeof(MI_LOAD_REGISTER_IMM)); + *static_cast(buffer) = cmd; + } + template + void storeValueInRegisterToMemory(int64_t address, int32_t reg) { + using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; + MI_STORE_REGISTER_MEM cmd2 = FamilyType::cmdInitStoreRegisterMem; + cmd2.setRegisterAddress(reg); + cmd2.setMemoryAddress(address); + cmd2.setMmioRemapEnable(1); + auto buffer2 = taskStream->getSpace(sizeof(MI_STORE_REGISTER_MEM)); + *static_cast(buffer2) = cmd2; + } + template + void loadAddressToRegisters(uint32_t registerWithLowPart, uint32_t registerWithHighPart, uint32_t registerWithShift, uint64_t address) { + loadValueToRegister(getPartOfGPUAddress(address, true), registerWithLowPart); // low part to R0 + loadValueToRegister(getPartOfGPUAddress(address, false), registerWithHighPart); // high part to R1 + loadValueToRegister(32u, registerWithShift); // value to shift address + } + template + void loadAddressToMiMathAccu(uint32_t lowAddressRegister, uint32_t highAddressRegister, uint32_t shiftReg) { + using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; + MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(taskStream->getSpace(numberOfOperationToLoadAddressToMiMathAccu * sizeof(MI_MATH_ALU_INST_INLINE))); + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); 
// load high part of address from register with older to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + pAluParam->DW0.BitField.Operand2 = highAddressRegister; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load 32 - value from shiftReg , to SRCB (to shift high part in register) + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = shiftReg; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_SHL); // shift high part + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // move result to highAddressRegister + pAluParam->DW0.BitField.Operand1 = highAddressRegister; + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load highAddressRegister to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + pAluParam->DW0.BitField.Operand2 = highAddressRegister; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load low part of address to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = lowAddressRegister; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_OR); // join parts of address and locate in ACCU + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + } + + static constexpr size_t bufferSize = MemoryConstants::pageSize; + const uint32_t numberOfOperationToLoadAddressToMiMathAccu = 7; + std::unique_ptr taskStream; + GraphicsAllocation *streamAllocation = nullptr; +}; + +using MatcherIsDg2OrPvc = IsWithinProducts; + +HWTEST2_F(MiMath, 
givenLoadIndirectFromMemoryWhenUseMiMathToSimpleOperationThenStoreStateOfRegisterInirectToMemory, MatcherIsDg2OrPvc) { + using MI_MATH = typename FamilyType::MI_MATH; + using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; + uint64_t bufferMemory[bufferSize] = {}; + bufferMemory[0] = 1u; + cl_int retVal = CL_SUCCESS; + + auto buffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferMemory, retVal)); + ASSERT_NE(nullptr, buffer); + EXPECT_EQ(CL_SUCCESS, retVal); + + auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex); + csr->makeResident(*allocation); + + uint32_t valueToAdd = 5u; + uint64_t valueAfterMiMathOperation = bufferMemory[0] + valueToAdd; + + loadAddressToRegisters(CS_GPR_R0, CS_GPR_R1, CS_GPR_R2, allocation->getGpuAddress()); // prepare registers to mi_math operation + loadValueToRegister(valueToAdd, CS_GPR_R3); + + auto pCmd = reinterpret_cast(taskStream->getSpace(sizeof(MI_MATH))); + reinterpret_cast(pCmd)->DW0.Value = 0x0; + reinterpret_cast(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + reinterpret_cast(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + reinterpret_cast(pCmd)->DW0.BitField.DwordLength = numberOfOperationToLoadAddressToMiMathAccu + 13 - 1; + loadAddressToMiMathAccu(static_cast(AluRegisters::R_0), static_cast(AluRegisters::R_1), static_cast(AluRegisters::R_2)); // GPU address of buffer load to ACCU register + MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(taskStream->getSpace(13 * sizeof(MI_MATH_ALU_INST_INLINE))); + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_LOADIND); // load dword from memory address located in ACCU + 
pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_0); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // copy address from ACCU to R2 + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_2); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // R0 to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_0); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // R3 to SRCB where is value of 'valueToAdd' + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_3); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_ADD); // do simple add on registers SRCA and SRCB + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // R3 to SRCB where is value of 'valueToAdd' + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_1); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load address from R2 where is copy of address to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_2); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode 
= static_cast(NewAluOpcodes::OPCODE_LOAD0); + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_ADD); // move address to ACCU + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_STOREIND); // store to memory from ACCU, value from register R1 + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_ACCU); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_1); + pAluParam++; + + flushStream(); + + expectMemory(reinterpret_cast(allocation->getGpuAddress()), &valueAfterMiMathOperation, sizeof(valueAfterMiMathOperation)); +} +HWTEST2_F(MiMath, givenLoadIndirectFromMemoryWhenUseMiMathThenStoreIndirectToAnotherMemory, MatcherIsDg2OrPvc) { + using MI_MATH = typename FamilyType::MI_MATH; + using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; + uint64_t bufferMemory[bufferSize] = {}; + bufferMemory[0] = 1u; + uint64_t bufferBMemory[bufferSize] = {}; + cl_int retVal = CL_SUCCESS; + + auto buffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferMemory, retVal)); + ASSERT_NE(nullptr, buffer); + EXPECT_EQ(CL_SUCCESS, retVal); + auto bufferB = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferBMemory, retVal)); + ASSERT_NE(nullptr, buffer); + EXPECT_EQ(CL_SUCCESS, retVal); + + csr->makeResident(*buffer->getGraphicsAllocation(rootDeviceIndex)); + csr->makeResident(*bufferB->getGraphicsAllocation(rootDeviceIndex)); + + 
loadAddressToRegisters(CS_GPR_R0, CS_GPR_R1, CS_GPR_R2, buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()); // prepare registers to mi_math operation + loadAddressToRegisters(CS_GPR_R3, CS_GPR_R4, CS_GPR_R2, bufferB->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()); // prepare registers to mi_math operation + + auto pCmd = reinterpret_cast(taskStream->getSpace(sizeof(MI_MATH))); + reinterpret_cast(pCmd)->DW0.Value = 0x0; + reinterpret_cast(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + reinterpret_cast(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + reinterpret_cast(pCmd)->DW0.BitField.DwordLength = numberOfOperationToLoadAddressToMiMathAccu * 2 + 6 - 1; + + loadAddressToMiMathAccu(static_cast(AluRegisters::R_0), static_cast(AluRegisters::R_1), static_cast(AluRegisters::R_2)); // GPU address of buffer load to ACCU register + + MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(taskStream->getSpace(4 * sizeof(MI_MATH_ALU_INST_INLINE))); + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_LOADIND); // load dword from memory address located in ACCU to R0 + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_0); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + + loadAddressToMiMathAccu(static_cast(AluRegisters::R_3), static_cast(AluRegisters::R_4), static_cast(AluRegisters::R_2)); // GPU address of bufferB load to ACCU register + + pAluParam = 
reinterpret_cast(taskStream->getSpace(2 * sizeof(MI_MATH_ALU_INST_INLINE))); + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_STOREIND); // store to memory from ACCU, value from register R0 + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_ACCU); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_0); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + + flushStream(); + + expectMemory(reinterpret_cast(bufferB->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()), bufferMemory, sizeof(uint64_t)); +} +HWTEST2_F(MiMath, givenValueToMakeLeftLogicalShiftWhenUseMiMathThenShiftIsDoneProperly, MatcherIsDg2OrPvc) { + using MI_MATH = typename FamilyType::MI_MATH; + using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; + uint64_t bufferMemory[bufferSize] = {}; + bufferMemory[0] = 1u; + cl_int retVal = CL_SUCCESS; + + auto buffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferMemory, retVal)); + ASSERT_NE(nullptr, buffer); + EXPECT_EQ(CL_SUCCESS, retVal); + + csr->makeResident(*buffer->getGraphicsAllocation(rootDeviceIndex)); + + uint32_t value = 1u; + uint32_t shift = 2u; + uint32_t notPowerOfTwoShift = 5u; + uint32_t expectedUsedShift = 4u; + + loadValueToRegister(value, CS_GPR_R0); + loadValueToRegister(shift, CS_GPR_R1); + loadValueToRegister(notPowerOfTwoShift, CS_GPR_R2); + auto pCmd = reinterpret_cast(taskStream->getSpace(sizeof(MI_MATH))); + reinterpret_cast(pCmd)->DW0.Value = 0x0; + 
reinterpret_cast(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + reinterpret_cast(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + reinterpret_cast(pCmd)->DW0.BitField.DwordLength = 7 - 1; + + MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(taskStream->getSpace(7 * sizeof(MI_MATH_ALU_INST_INLINE))); + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value from R0 to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_0); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_1); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_SHL); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_1); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_2); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_SHL); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 
static_cast(AluRegisters::R_2); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + + storeValueInRegisterToMemory(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress(), CS_GPR_R1); + storeValueInRegisterToMemory(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress() + 4, CS_GPR_R2); + flushStream(); + + uint32_t firstShift = value << shift; + uint32_t secondShift = value << notPowerOfTwoShift; + uint32_t executeSecondShift = value << expectedUsedShift; + + expectMemory(reinterpret_cast(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress()), &firstShift, sizeof(firstShift)); + expectNotEqualMemory(reinterpret_cast(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress() + 4), &secondShift, sizeof(secondShift)); + expectMemory(reinterpret_cast(buffer->getGraphicsAllocation(rootDeviceIndex)->getGpuAddress() + 4), &executeSecondShift, sizeof(executeSecondShift)); +} +HWTEST2_F(MiMath, givenValueToMakeRightLogicalShiftWhenUseMiMathThenShiftIsDoneProperly, MatcherIsDg2OrPvc) { + using MI_MATH = typename FamilyType::MI_MATH; + using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; + uint64_t bufferMemory[bufferSize] = {}; + bufferMemory[0] = 1u; + cl_int retVal = CL_SUCCESS; + + auto buffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferMemory, retVal)); + ASSERT_NE(nullptr, buffer); + EXPECT_EQ(CL_SUCCESS, retVal); + + auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex); + csr->makeResident(*allocation); + + uint32_t value = 32u; + uint32_t shift = 2u; + uint32_t notPowerOfTwoShift = 5u; + uint32_t expectedUsedShift = 4u; + + loadValueToRegister(value, CS_GPR_R0); + loadValueToRegister(shift, CS_GPR_R1); + loadValueToRegister(notPowerOfTwoShift, CS_GPR_R2); + auto pCmd = reinterpret_cast(taskStream->getSpace(sizeof(MI_MATH))); + reinterpret_cast(pCmd)->DW0.Value = 0x0; + 
reinterpret_cast(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + reinterpret_cast(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + reinterpret_cast(pCmd)->DW0.BitField.DwordLength = 7 - 1; + + MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(taskStream->getSpace(7 * sizeof(MI_MATH_ALU_INST_INLINE))); + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value from R0 to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_0); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_1); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_SHR); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_1); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_2); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_SHR); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 
static_cast(AluRegisters::R_2); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + + storeValueInRegisterToMemory(allocation->getGpuAddress(), CS_GPR_R1); + storeValueInRegisterToMemory(allocation->getGpuAddress() + 4, CS_GPR_R2); + flushStream(); + + uint32_t firstShift = value >> shift; + uint32_t secondShift = value >> notPowerOfTwoShift; + uint32_t executeSecondShift = value >> expectedUsedShift; + + expectMemory(reinterpret_cast(allocation->getGpuAddress()), &firstShift, sizeof(firstShift)); + expectNotEqualMemory(reinterpret_cast(allocation->getGpuAddress() + 4), &secondShift, sizeof(secondShift)); + expectMemory(reinterpret_cast(allocation->getGpuAddress() + 4), &executeSecondShift, sizeof(executeSecondShift)); +} +HWTEST2_F(MiMath, givenValueToMakeRightAritmeticShiftWhenUseMiMathThenShiftIsDoneProperly, MatcherIsDg2OrPvc) { + using MI_MATH = typename FamilyType::MI_MATH; + using MI_MATH_ALU_INST_INLINE = typename FamilyType::MI_MATH_ALU_INST_INLINE; + + int64_t bufferMemory[bufferSize] = {}; + bufferMemory[0] = -32; + cl_int retVal = CL_SUCCESS; + + auto buffer = std::unique_ptr(Buffer::create(context, + CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + bufferSize, bufferMemory, retVal)); + ASSERT_NE(nullptr, buffer); + EXPECT_EQ(CL_SUCCESS, retVal); + + auto allocation = buffer->getGraphicsAllocation(rootDeviceIndex); + csr->makeResident(*allocation); + + uint32_t shift = 2u; + uint32_t notPowerOfTwoShift = 5u; + uint32_t expectedUsedShift = 4u; + + loadAddressToRegisters(CS_GPR_R0, CS_GPR_R1, CS_GPR_R2, allocation->getGpuAddress()); // prepare registers to mi_math operation + loadValueToRegister(shift, CS_GPR_R4); + loadValueToRegister(notPowerOfTwoShift, CS_GPR_R5); + + auto pCmd = reinterpret_cast(taskStream->getSpace(sizeof(MI_MATH))); + reinterpret_cast(pCmd)->DW0.Value = 0x0; + reinterpret_cast(pCmd)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND; + 
reinterpret_cast(pCmd)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH; + reinterpret_cast(pCmd)->DW0.BitField.DwordLength = numberOfOperationToLoadAddressToMiMathAccu + 9 - 1; + loadAddressToMiMathAccu(static_cast(AluRegisters::R_0), static_cast(AluRegisters::R_1), static_cast(AluRegisters::R_2)); // GPU address of buffer load to ACCU register + MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast(taskStream->getSpace(9 * sizeof(MI_MATH_ALU_INST_INLINE))); + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_LOADIND); // load value from R0 to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_3); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_FENCE); // to be sure that all writes and reads are completed + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value from R0 to SRCA + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCA); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_3); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_4); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_SAR); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_4); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + 
pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_LOAD); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_SRCB); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_5); + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(NewAluOpcodes::OPCODE_SAR); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = 0; + pAluParam->DW0.BitField.Operand2 = 0; + pAluParam++; + pAluParam->DW0.BitField.ALUOpcode = static_cast(AluRegisters::OPCODE_STORE); // load value to shift to SRCB + pAluParam->DW0.BitField.Operand1 = static_cast(AluRegisters::R_5); + pAluParam->DW0.BitField.Operand2 = static_cast(AluRegisters::R_ACCU); + pAluParam++; + + storeValueInRegisterToMemory(allocation->getGpuAddress(), CS_GPR_R4); + storeValueInRegisterToMemory(allocation->getGpuAddress() + 4, CS_GPR_R5); + flushStream(); + + int64_t firstShift = bufferMemory[0]; + for (uint32_t i = 0; i < shift; i++) { + firstShift /= 2; + } + int64_t secondShift = bufferMemory[0]; + for (uint32_t i = 0; i < notPowerOfTwoShift; i++) { + secondShift /= 2; + } + int64_t executeSecondShift = bufferMemory[0]; + for (uint32_t i = 0; i < expectedUsedShift; i++) { + executeSecondShift /= 2; + } + + expectMemory(reinterpret_cast(allocation->getGpuAddress()), &firstShift, sizeof(uint32_t)); + expectNotEqualMemory(reinterpret_cast(allocation->getGpuAddress() + 4), &secondShift, sizeof(uint32_t)); + expectMemory(reinterpret_cast(allocation->getGpuAddress() + 4), &executeSecondShift, sizeof(uint32_t)); +} +} // namespace NEO